/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
 * Public Licence.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 * 	- new API and handling of conntrack/nat helpers
 * 	- now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 * 	- add usage/reference counts to ip_conntrack_expect
 *	- export ip_conntrack[_expect]_{find_get,put} functions
 * */

#include <linux/version.h>
#include <linux/config.h>
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/brlock.h>
#include <net/checksum.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
/* For ERR_PTR().  Yeah, I know... --RR */
#include <linux/fs.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION	"2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_conntrack_lock);
DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
LIST_HEAD(protocol_list);
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max = 0;
static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
static LIST_HEAD(unconfirmed);

extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;

static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
			      u_int8_t protocol)
{
	return protocol == curr->proto;
}

struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
{
	struct ip_conntrack_protocol *p;

	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	p = LIST_FIND(&protocol_list, proto_cmpfn,
		      struct ip_conntrack_protocol *, protocol);
	if (!p)
		p = &ip_conntrack_generic_protocol;

	return p;
}

struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
{
	struct ip_conntrack_protocol *p;

	READ_LOCK(&ip_conntrack_lock);
	p = __ip_ct_find_proto(protocol);
	READ_UNLOCK(&ip_conntrack_lock);
	return p;
}

inline void
ip_conntrack_put(struct ip_conntrack *ct)
{
	IP_NF_ASSERT(ct);
	IP_NF_ASSERT(ct->infos[0].master);
	/* nf_conntrack_put wants to go via an info struct, so feed it
           one at random. */
	nf_conntrack_put(&ct->infos[0]);
}

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
	dump_tuple(tuple);
#endif
	return (jhash_3words(tuple->src.ip,
	                     (tuple->dst.ip ^ tuple->dst.protonum),
	                     (tuple->src.u.all | (tuple->dst.u.all << 16)),
	                     ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}
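
/* Usage sketch (kept out of the build with #if 0, like the other disabled
 * snippets in this file): every connection is hashed twice, once per
 * direction, and hash_conntrack() already reduces modulo
 * ip_conntrack_htable_size, so its result is directly a chain index.
 * bucket_for() is a hypothetical helper, not part of the real API. */
#if 0
static struct list_head *bucket_for(const struct ip_conntrack_tuple *tuple)
{
	return &ip_conntrack_hash[hash_conntrack(tuple)];
}
#endif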

inline int
ip_ct_get_tuple(const struct iphdr *iph, size_t len,
                struct ip_conntrack_tuple *tuple,
                struct ip_conntrack_protocol *protocol)
{
	int ret;

	/* Never happens */
	if (iph->frag_off & htons(IP_OFFSET)) {
		printk("ip_conntrack_core: Frag of proto %u.\n",
		       iph->protocol);
		return 0;
	}
	/* Guarantee 8 protocol bytes: if more wanted, use len param */
	else if (iph->ihl * 4 + 8 > len)
		return 0;

	tuple->src.ip = iph->saddr;
	tuple->dst.ip = iph->daddr;
	tuple->dst.protonum = iph->protocol;

	ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
				     len - 4*iph->ihl,
				     tuple);
	return ret;
}

static int
invert_tuple(struct ip_conntrack_tuple *inverse,
	     const struct ip_conntrack_tuple *orig,
	     const struct ip_conntrack_protocol *protocol)
{
	inverse->src.ip = orig->dst.ip;
	inverse->dst.ip = orig->src.ip;
	inverse->dst.protonum = orig->dst.protonum;

	return protocol->invert_tuple(inverse, orig);
}


/* ip_conntrack_expect helper functions */

/* Compare tuple parts depending on mask. */
static inline int expect_cmp(const struct ip_conntrack_expect *i,
			     const struct ip_conntrack_tuple *tuple)
{
	MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
	return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
}

static void
destroy_expect(struct ip_conntrack_expect *exp)
{
	DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
	IP_NF_ASSERT(atomic_read(&exp->use) == 0);
	IP_NF_ASSERT(!timer_pending(&exp->timeout));

	kfree(exp);
}

inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
	IP_NF_ASSERT(exp);

	if (atomic_dec_and_test(&exp->use)) {
		/* usage count dropped to zero */
		destroy_expect(exp);
	}
}

static inline struct ip_conntrack_expect *
__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
{
	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
	return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
			 struct ip_conntrack_expect *, tuple);
}

/* Find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *exp;

	READ_LOCK(&ip_conntrack_lock);
	READ_LOCK(&ip_conntrack_expect_tuple_lock);
	exp = __ip_ct_expect_find(tuple);
	if (exp)
		atomic_inc(&exp->use);
	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
	READ_UNLOCK(&ip_conntrack_lock);

	return exp;
}
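
/* Reference-counting sketch (illustrative only, not built): a successful
 * ip_conntrack_expect_find_get() bumps exp->use, so the caller must drop
 * that reference with ip_conntrack_expect_put() once it is done with the
 * expectation. */
#if 0
static void example_inspect_expect(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *exp = ip_conntrack_expect_find_get(tuple);

	if (!exp)
		return;
	/* ... look at exp->tuple / exp->mask here ... */
	ip_conntrack_expect_put(exp);
}
#endif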

/* remove one specific expectation from all lists and drop refcount,
 * does _NOT_ delete the timer. */
static void __unexpect_related(struct ip_conntrack_expect *expect)
{
	DEBUGP("unexpect_related(%p)\n", expect);
	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

	/* we're not allowed to unexpect a confirmed expectation! */
	IP_NF_ASSERT(!expect->sibling);

	/* delete from global and local lists */
	list_del(&expect->list);
	list_del(&expect->expected_list);

	/* decrement expect-count of master conntrack */
	if (expect->expectant)
		expect->expectant->expecting--;

	ip_conntrack_expect_put(expect);
}

/* remove one specific expectation from all lists, drop refcount
 * and expire timer.
 * This function can _NOT_ be called for confirmed expects! */
static void unexpect_related(struct ip_conntrack_expect *expect)
{
	IP_NF_ASSERT(expect->expectant);
	IP_NF_ASSERT(expect->expectant->helper);
	/* if we are supposed to have a timer, but we can't delete
	 * it: race condition.  __unexpect_related will
	 * be called by the timeout function */
	if (expect->expectant->helper->timeout
	    && !del_timer(&expect->timeout))
		return;

	__unexpect_related(expect);
}

/* delete all unconfirmed expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
{
	struct list_head *exp_entry, *next;
	struct ip_conntrack_expect *exp;

	DEBUGP("remove_expectations(%p)\n", ct);

	list_for_each_safe(exp_entry, next, &ct->sibling_list) {
		exp = list_entry(exp_entry, struct ip_conntrack_expect,
				 expected_list);

		/* we skip established expectations, as we want to delete
		 * the un-established ones only */
		if (exp->sibling) {
			DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
			if (drop_refcount) {
				/* Indicate that this expectation's parent is dead */
				ip_conntrack_put(exp->expectant);
				exp->expectant = NULL;
			}
			continue;
		}

		IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
		IP_NF_ASSERT(exp->expectant == ct);

		/* delete expectation from global and private lists */
		unexpect_related(exp);
	}
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
	unsigned int ho, hr;

	DEBUGP("clean_from_lists(%p)\n", ct);
	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

	/* Destroy all un-established, pending expectations */
	remove_expectations(ct, 1);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
	struct ip_conntrack_protocol *proto;

	DEBUGP("destroy_conntrack(%p)\n", ct);
	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
	IP_NF_ASSERT(!timer_pending(&ct->timeout));

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to ip_conntrack_lock!!! -HW */
	proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
	if (proto && proto->destroy)
		proto->destroy(ct);

	if (ip_conntrack_destroyed)
		ip_conntrack_destroyed(ct);

	WRITE_LOCK(&ip_conntrack_lock);
	/* Make sure we don't leave any orphaned expectations lying around */
	if (ct->expecting)
		remove_expectations(ct, 1);

	/* We overload first tuple to link into unconfirmed list. */
	if (!is_confirmed(ct)) {
		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	}

	/* Delete our master expectation */
	if (ct->master) {
		if (ct->master->expectant) {
			/* can't call __unexpect_related here,
			 * since it would screw up expect_list */
			list_del(&ct->master->expected_list);
			master = ct->master->expectant;
		}
		kfree(ct->master);
	}
	WRITE_UNLOCK(&ip_conntrack_lock);

	if (master)
		ip_conntrack_put(master);

	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
	kmem_cache_free(ip_conntrack_cachep, ct);
	atomic_dec(&ip_conntrack_count);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct ip_conntrack *ct = (void *)ul_conntrack;

	WRITE_LOCK(&ip_conntrack_lock);
	clean_from_lists(ct);
	WRITE_UNLOCK(&ip_conntrack_lock);
	ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
		    const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	return i->ctrack != ignored_conntrack
		&& ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int hash = hash_conntrack(tuple);

	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	h = LIST_FIND(&ip_conntrack_hash[hash],
		      conntrack_tuple_cmp,
		      struct ip_conntrack_tuple_hash *,
		      tuple, ignored_conntrack);
	return h;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
		      const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	READ_LOCK(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	if (h)
		atomic_inc(&h->ctrack->ct_general.use);
	READ_UNLOCK(&ip_conntrack_lock);

	return h;
}
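
/* Same discipline for conntrack lookups (sketch, not built): a hit from
 * ip_conntrack_find_get() holds a reference via ct_general.use, which the
 * caller balances with ip_conntrack_put() when it is finished. */
#if 0
static int example_tuple_is_tracked(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_tuple_hash *h = ip_conntrack_find_get(tuple, NULL);

	if (!h)
		return 0;
	ip_conntrack_put(h->ctrack);
	return 1;
}
#endif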

static inline struct ip_conntrack *
__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
{
	struct ip_conntrack *ct
		= (struct ip_conntrack *)nfct->master;

	/* ctinfo is the index of the nfct inside the conntrack */
	*ctinfo = nfct - ct->infos;
	IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
	return ct;
}

/* Return conntrack and conntrack_info given skb->nfct->master */
struct ip_conntrack *
ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
	if (skb->nfct)
		return __ip_conntrack_get(skb->nfct, ctinfo);
	return NULL;
}

/* Confirm a connection given skb->nfct; places it in hash table */
int
__ip_conntrack_confirm(struct nf_ct_info *nfct)
{
	unsigned int hash, repl_hash;
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	ct = __ip_conntrack_get(nfct, &ctinfo);

	/* ipt_REJECT uses ip_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
           confirmed us. */
	IP_NF_ASSERT(!is_confirmed(ct));
	DEBUGP("Confirming conntrack %p\n", ct);

	WRITE_LOCK(&ip_conntrack_lock);
	/* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost the race. */
	if (!LIST_FIND(&ip_conntrack_hash[hash],
		       conntrack_tuple_cmp,
		       struct ip_conntrack_tuple_hash *,
		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
			  conntrack_tuple_cmp,
			  struct ip_conntrack_tuple_hash *,
			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
		/* Remove from unconfirmed list */
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

		list_prepend(&ip_conntrack_hash[hash],
			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
		list_prepend(&ip_conntrack_hash[repl_hash],
			     &ct->tuplehash[IP_CT_DIR_REPLY]);
		/* Timer relative to confirmation time, not original
		   setting time, otherwise we'd get timer wrap in
		   weird delay cases. */
		ct->timeout.expires += jiffies;
		add_timer(&ct->timeout);
		atomic_inc(&ct->ct_general.use);
		set_bit(IPS_CONFIRMED_BIT, &ct->status);
		WRITE_UNLOCK(&ip_conntrack_lock);
		return NF_ACCEPT;
	}

	WRITE_UNLOCK(&ip_conntrack_lock);
	return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
			 const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	READ_LOCK(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	READ_UNLOCK(&ip_conntrack_lock);

	return h != NULL;
}

/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
struct ip_conntrack *
icmp_error_track(struct sk_buff *skb,
		 enum ip_conntrack_info *ctinfo,
		 unsigned int hooknum)
{
	const struct iphdr *iph = skb->nh.iph;
	struct icmphdr *hdr;
	struct ip_conntrack_tuple innertuple, origtuple;
	struct iphdr *inner;
	size_t datalen;
	struct ip_conntrack_protocol *innerproto;
	struct ip_conntrack_tuple_hash *h;

	IP_NF_ASSERT(iph->protocol == IPPROTO_ICMP);
	IP_NF_ASSERT(skb->nfct == NULL);

	hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
	inner = (struct iphdr *)(hdr + 1);
	datalen = skb->len - iph->ihl*4 - sizeof(*hdr);

	if (skb->len < iph->ihl * 4 + sizeof(*hdr) + sizeof(*iph)) {
		DEBUGP("icmp_error_track: too short\n");
		return NULL;
	}

	if (hdr->type != ICMP_DEST_UNREACH
	    && hdr->type != ICMP_SOURCE_QUENCH
	    && hdr->type != ICMP_TIME_EXCEEDED
	    && hdr->type != ICMP_PARAMETERPROB
	    && hdr->type != ICMP_REDIRECT)
		return NULL;

	/* Ignore ICMPs containing fragments (shouldn't happen) */
	if (inner->frag_off & htons(IP_OFFSET)) {
		DEBUGP("icmp_error_track: fragment of proto %u\n",
		       inner->protocol);
		return NULL;
	}

	/* Ignore it if the checksum's bogus. */
	if (ip_compute_csum((unsigned char *)hdr, sizeof(*hdr) + datalen)) {
		DEBUGP("icmp_error_track: bad csum\n");
		return NULL;
	}

	innerproto = ip_ct_find_proto(inner->protocol);
	/* Are they talking about one of our connections? */
	if (inner->ihl * 4 + 8 > datalen
	    || !ip_ct_get_tuple(inner, datalen, &origtuple, innerproto)) {
		DEBUGP("icmp_error: ! get_tuple p=%u (%u*4+%u dlen=%u)\n",
		       inner->protocol, inner->ihl, 8,
		       datalen);
		return NULL;
	}

	/* Ordinarily, we'd expect the inverted tupleproto, but it's
	   been preserved inside the ICMP. */
	if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
		DEBUGP("icmp_error_track: Can't invert tuple\n");
		return NULL;
	}

	*ctinfo = IP_CT_RELATED;

	h = ip_conntrack_find_get(&innertuple, NULL);
	if (!h) {
		/* Locally generated ICMPs will match inverted if they
		   haven't been SNAT'ed yet */
		/* FIXME: NAT code has to handle half-done double NAT --RR */
		if (hooknum == NF_IP_LOCAL_OUT)
			h = ip_conntrack_find_get(&origtuple, NULL);

		if (!h) {
			DEBUGP("icmp_error_track: no match\n");
			return NULL;
		}
		/* Reverse direction from that found */
		if (DIRECTION(h) != IP_CT_DIR_REPLY)
			*ctinfo += IP_CT_IS_REPLY;
	} else {
		if (DIRECTION(h) == IP_CT_DIR_REPLY)
			*ctinfo += IP_CT_IS_REPLY;
	}

	/* Update skb to refer to this connection */
	skb->nfct = &h->ctrack->infos[*ctinfo];
	return h->ctrack;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
	return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
}

static int early_drop(struct list_head *chain)
{
	/* Traverse backwards: gives us oldest, which is roughly LRU */
	struct ip_conntrack_tuple_hash *h;
	int dropped = 0;

	READ_LOCK(&ip_conntrack_lock);
	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
	if (h)
		atomic_inc(&h->ctrack->ct_general.use);
	READ_UNLOCK(&ip_conntrack_lock);

	if (!h)
		return dropped;

	if (del_timer(&h->ctrack->timeout)) {
		death_by_timeout((unsigned long)h->ctrack);
		dropped = 1;
	}
	ip_conntrack_put(h->ctrack);
	return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
			     const struct ip_conntrack_tuple *rtuple)
{
	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
	return LIST_FIND(&helpers, helper_cmp,
			 struct ip_conntrack_helper *,
			 tuple);
}
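
/* Sketch of how a helper describes the traffic it wants (illustrative;
 * the port number and names below are assumptions modelled loosely on the
 * in-tree helpers, not taken from this file): ip_ct_find_helper() matches
 * a helper's tuple, under its mask, against the REPLY tuple of a fresh
 * connection. */
#if 0
static struct ip_conntrack_helper example_helper;

static void example_helper_setup(void)
{
	memset(&example_helper, 0, sizeof(example_helper));
	example_helper.name = "example";
	/* match "reply comes from TCP port 21", i.e. an FTP-style server */
	example_helper.tuple.src.u.tcp.port = htons(21);
	example_helper.tuple.dst.protonum = IPPROTO_TCP;
	example_helper.mask.src.u.tcp.port = 0xFFFF;
	example_helper.mask.dst.protonum = 0xFF;
}
#endif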

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
	       struct ip_conntrack_protocol *protocol,
	       struct sk_buff *skb)
{
	struct ip_conntrack *conntrack;
	struct ip_conntrack_tuple repl_tuple;
	size_t hash;
	struct ip_conntrack_expect *expected;
	int i;
	static unsigned int drop_next = 0;

	if (!ip_conntrack_hash_rnd_initted) {
		get_random_bytes(&ip_conntrack_hash_rnd, 4);
		ip_conntrack_hash_rnd_initted = 1;
	}

	hash = hash_conntrack(tuple);

	if (ip_conntrack_max &&
	    atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
		/* Try dropping from a random chain, or else from the
                   chain we're about to put this one into (in case
                   they're trying to bomb one hash chain). */
		unsigned int next = (drop_next++)%ip_conntrack_htable_size;

		if (!early_drop(&ip_conntrack_hash[next])
		    && !early_drop(&ip_conntrack_hash[hash])) {
			if (net_ratelimit())
				printk(KERN_WARNING
				       "ip_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	if (!invert_tuple(&repl_tuple, tuple, protocol)) {
		DEBUGP("Can't invert tuple.\n");
		return NULL;
	}

	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
	if (!conntrack) {
		DEBUGP("Can't allocate conntrack.\n");
		return ERR_PTR(-ENOMEM);
	}

	memset(conntrack, 0, sizeof(*conntrack));
	atomic_set(&conntrack->ct_general.use, 1);
	conntrack->ct_general.destroy = destroy_conntrack;
	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
	conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
	for (i=0; i < IP_CT_NUMBER; i++)
		conntrack->infos[i].master = &conntrack->ct_general;

	if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
		kmem_cache_free(ip_conntrack_cachep, conntrack);
		return NULL;
	}
	/* Don't set timer yet: wait for confirmation */
	init_timer(&conntrack->timeout);
	conntrack->timeout.data = (unsigned long)conntrack;
	conntrack->timeout.function = death_by_timeout;

	INIT_LIST_HEAD(&conntrack->sibling_list);

	WRITE_LOCK(&ip_conntrack_lock);
	/* Need finding and deleting of expected ONLY if we win race */
	READ_LOCK(&ip_conntrack_expect_tuple_lock);
	expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
			     struct ip_conntrack_expect *, tuple);
	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);

	/* If master is not in hash table yet (ie. packet hasn't left
	   this machine yet), how can other end know about expected?
	   Hence these are not the droids you are looking for (if
	   master ct never got confirmed, we'd hold a reference to it
	   and weird things would happen to future packets). */
	if (expected && !is_confirmed(expected->expectant))
		expected = NULL;

	/* Look up the conntrack helper for master connections only */
	if (!expected)
		conntrack->helper = ip_ct_find_helper(&repl_tuple);

	/* If the expectation is dying, then this is a loser. */
	if (expected
	    && expected->expectant->helper->timeout
	    && ! del_timer(&expected->timeout))
		expected = NULL;

	if (expected) {
		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
			conntrack, expected);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
		conntrack->master = expected;
		expected->sibling = conntrack;
		LIST_DELETE(&ip_conntrack_expect_list, expected);
		expected->expectant->expecting--;
		nf_conntrack_get(&master_ct(conntrack)->infos[0]);
	}
	/* Overload tuple linked list to put us in unconfirmed list. */
	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list,
	         &unconfirmed);

	atomic_inc(&ip_conntrack_count);
	WRITE_UNLOCK(&ip_conntrack_lock);

	if (expected && expected->expectfn)
		expected->expectfn(conntrack);
	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
		  struct ip_conntrack_protocol *proto,
		  int *set_reply,
		  unsigned int hooknum,
		  enum ip_conntrack_info *ctinfo)
{
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack_tuple_hash *h;

	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

	if (!ip_ct_get_tuple(skb->nh.iph, skb->len, &tuple, proto))
		return NULL;

	/* look for tuple match */
	h = ip_conntrack_find_get(&tuple, NULL);
	if (!h) {
		h = init_conntrack(&tuple, proto, skb);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}

	/* It exists; we have (non-exclusive) reference. */
	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
			DEBUGP("ip_conntrack_in: normal packet for %p\n",
			       h->ctrack);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
			DEBUGP("ip_conntrack_in: related packet for %p\n",
			       h->ctrack);
			*ctinfo = IP_CT_RELATED;
		} else {
			DEBUGP("ip_conntrack_in: new packet for %p\n",
			       h->ctrack);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &h->ctrack->infos[*ctinfo];
	return h->ctrack;
}

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
			     struct sk_buff **pskb,
			     const struct net_device *in,
			     const struct net_device *out,
			     int (*okfn)(struct sk_buff *))
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;
	struct ip_conntrack_protocol *proto;
	int set_reply;
	int ret;

	/* FIXME: Do this right please. --RR */
	(*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
	/* Ignore broadcast: no `connection'. */
	if ((*pskb)->pkt_type == PACKET_BROADCAST) {
		printk("Broadcast packet!\n");
		return NF_ACCEPT;
	} else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
		   == htonl(0x000000FF)) {
		printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
		       NIPQUAD((*pskb)->nh.iph->saddr),
		       NIPQUAD((*pskb)->nh.iph->daddr),
		       (*pskb)->sk, (*pskb)->pkt_type);
	}
#endif

	/* Previously seen (loopback)?  Ignore.  Do this before the
           fragment check. */
	if ((*pskb)->nfct)
		return NF_ACCEPT;

	/* Gather fragments. */
	if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
		*pskb = ip_ct_gather_frags(*pskb,
		                           hooknum == NF_IP_PRE_ROUTING ?
		                           IP_DEFRAG_CONNTRACK_IN :
		                           IP_DEFRAG_CONNTRACK_OUT);
		if (!*pskb)
			return NF_STOLEN;
	}

	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

	/* It may be an icmp error... */
	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
	    && icmp_error_track(*pskb, &ctinfo, hooknum))
		return NF_ACCEPT;

	if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
		/* Not valid part of a connection */
		return NF_ACCEPT;

	if (IS_ERR(ct))
		/* Too stressed to deal. */
		return NF_DROP;

	IP_NF_ASSERT((*pskb)->nfct);

	ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo);
	if (ret == -1) {
		/* Invalid */
		nf_conntrack_put((*pskb)->nfct);
		(*pskb)->nfct = NULL;
		return NF_ACCEPT;
	}

	if (ret != NF_DROP && ct->helper) {
		ret = ct->helper->help((*pskb)->nh.iph, (*pskb)->len,
				       ct, ctinfo);
		if (ret == -1) {
			/* Invalid */
			nf_conntrack_put((*pskb)->nfct);
			(*pskb)->nfct = NULL;
			return NF_ACCEPT;
		}
	}
	if (set_reply)
		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

	return ret;
}
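
/* How this hook typically gets wired up (sketch under the 2.4 netfilter
 * API; the real registration lives outside this file, e.g. in the
 * standalone module, and the values below are illustrative): */
#if 0
static struct nf_hook_ops example_conntrack_in_ops = {
	{ NULL, NULL },		/* list */
	ip_conntrack_in,	/* the hook function above */
	PF_INET,		/* protocol family */
	NF_IP_PRE_ROUTING,	/* hook point */
	NF_IP_PRI_CONNTRACK	/* run before mangle/NAT */
};
/* nf_register_hook(&example_conntrack_in_ops) in module init,
 * nf_unregister_hook() on exit. */
#endif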

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig)
{
	return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
}

static inline int resent_expect(const struct ip_conntrack_expect *i,
			        const struct ip_conntrack_tuple *tuple,
			        const struct ip_conntrack_tuple *mask)
{
	DEBUGP("resent_expect\n");
	DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
	DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
	DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
	return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
	         || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
		&& ip_ct_tuple_equal(&i->mask, mask));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *i,
			       const struct ip_conntrack_tuple *tuple,
			       const struct ip_conntrack_tuple *mask)
{
	/* Part covered by intersection of masks must be unequal,
           otherwise they clash */
	struct ip_conntrack_tuple intersect_mask
		= { { i->mask.src.ip & mask->src.ip,
		      { i->mask.src.u.all & mask->src.u.all } },
		    { i->mask.dst.ip & mask->dst.ip,
		      { i->mask.dst.u.all & mask->dst.u.all },
		      i->mask.dst.protonum & mask->dst.protonum } };

	return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
}

inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
{
	WRITE_LOCK(&ip_conntrack_lock);
	unexpect_related(expect);
	WRITE_UNLOCK(&ip_conntrack_lock);
}

static void expectation_timed_out(unsigned long ul_expect)
{
	struct ip_conntrack_expect *expect = (void *) ul_expect;

	DEBUGP("expectation %p timed out\n", expect);
	WRITE_LOCK(&ip_conntrack_lock);
	__unexpect_related(expect);
	WRITE_UNLOCK(&ip_conntrack_lock);
}

/* Add a related connection. */
int ip_conntrack_expect_related(struct ip_conntrack *related_to,
				struct ip_conntrack_expect *expect)
{
	struct ip_conntrack_expect *old, *new;
	int ret = 0;

	WRITE_LOCK(&ip_conntrack_lock);
	/* Because of the write lock, no reader can walk the lists,
	 * so there is no need to use the tuple lock too */

	DEBUGP("ip_conntrack_expect_related %p\n", related_to);
	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
	DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

	old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
		        struct ip_conntrack_expect *, &expect->tuple,
			&expect->mask);
	if (old) {
		/* Helper private data may contain offsets but no pointers
		   pointing into the payload - otherwise we would have to copy
		   the data filled out by the helper over the old one */
		DEBUGP("expect_related: resent packet\n");
		if (old->expectant == related_to &&
		    related_to->helper->timeout) {
			if (!del_timer(&old->timeout)) {
				/* expectation is dying. Fall through */
				old = NULL;
			} else {
				old->timeout.expires = jiffies +
					related_to->helper->timeout * HZ;
				add_timer(&old->timeout);
			}
		}

		if (old) {
			WRITE_UNLOCK(&ip_conntrack_lock);
			return -EEXIST;
		}
	} else if (related_to->helper->max_expected &&
		   related_to->expecting >= related_to->helper->max_expected) {
		/* old == NULL */
		if (!(related_to->helper->flags &
		      IP_CT_HELPER_F_REUSE_EXPECT)) {
			WRITE_UNLOCK(&ip_conntrack_lock);
			if (net_ratelimit())
				printk(KERN_WARNING
				       "ip_conntrack: max number of expected "
				       "connections %i of %s reached for "
				       "%u.%u.%u.%u->%u.%u.%u.%u\n",
				       related_to->helper->max_expected,
				       related_to->helper->name,
				       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
				       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
			return -EPERM;
		}
		DEBUGP("ip_conntrack: max number of expected "
		       "connections %i of %s reached for "
		       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
		       related_to->helper->max_expected,
		       related_to->helper->name,
		       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
		       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));

		/* choose the oldest expectation to evict */
		list_for_each_entry(old, &related_to->sibling_list,
		                                      expected_list)
			if (old->sibling == NULL)
				break;

		/* We cannot fail since related_to->expecting is the number
		 * of unconfirmed expectations */
		IP_NF_ASSERT(old && old->sibling == NULL);

		/* newnat14 does not reuse the real allocated memory
		 * structures but rather unexpects the old one and
		 * allocates a new one.  unexpect_related will decrement
		 * related_to->expecting.
		 */
		unexpect_related(old);
		ret = -EPERM;
	} else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
			     struct ip_conntrack_expect *, &expect->tuple,
			     &expect->mask)) {
		WRITE_UNLOCK(&ip_conntrack_lock);
		DEBUGP("expect_related: busy!\n");
		return -EBUSY;
	}

	new = (struct ip_conntrack_expect *)
	      kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
	if (!new) {
		WRITE_UNLOCK(&ip_conntrack_lock);
		DEBUGP("expect_related: OOM allocating expect\n");
		return -ENOMEM;
	}

	DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
	memcpy(new, expect, sizeof(*expect));
	new->expectant = related_to;
	new->sibling = NULL;
	atomic_set(&new->use, 1);

	/* add to expected list for this connection */
	list_add_tail(&new->expected_list, &related_to->sibling_list);
	/* add to global list of expectations */
	list_prepend(&ip_conntrack_expect_list, &new->list);
	/* add and start timer if required */
	if (related_to->helper->timeout) {
		init_timer(&new->timeout);
		new->timeout.data = (unsigned long)new;
		new->timeout.function = expectation_timed_out;
		new->timeout.expires = jiffies +
					related_to->helper->timeout * HZ;
		add_timer(&new->timeout);
	}
	related_to->expecting++;

	WRITE_UNLOCK(&ip_conntrack_lock);

	return ret;
}
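
/* Caller-side sketch (illustrative; the tuple values are assumptions in
 * the spirit of the FTP/IRC helpers): a helper that has parsed an upcoming
 * data connection out of the payload fills in an expectation and hands it
 * to ip_conntrack_expect_related(), which memcpy()s it, so a stack
 * variable is fine. */
#if 0
static int example_expect_data_conn(struct ip_conntrack *ct,
				    u_int32_t peer_ip, u_int16_t peer_port)
{
	struct ip_conntrack_expect exp;

	memset(&exp, 0, sizeof(exp));
	/* expect any source talking to peer_ip:peer_port over TCP */
	exp.tuple.dst.ip = peer_ip;
	exp.tuple.dst.u.tcp.port = htons(peer_port);
	exp.tuple.dst.protonum = IPPROTO_TCP;
	exp.mask.dst.ip = 0xFFFFFFFF;
	exp.mask.dst.u.tcp.port = 0xFFFF;
	exp.mask.dst.protonum = 0xFF;
	exp.expectfn = NULL;	/* no callback needed in this sketch */

	return ip_conntrack_expect_related(ct, &exp);
}
#endif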

/* Change tuple in an existing expectation */
int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
			       struct ip_conntrack_tuple *newtuple)
{
	int ret;

	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	WRITE_LOCK(&ip_conntrack_expect_tuple_lock);

	DEBUGP("change_expect:\n");
	DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
	DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
	DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
	if (expect->ct_tuple.dst.protonum == 0) {
		/* Never seen before */
		DEBUGP("change expect: never seen before\n");
		if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
		    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
			         struct ip_conntrack_expect *, newtuple, &expect->mask)) {
			/* Force NAT to find an unused tuple */
			ret = -1;
		} else {
			memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
			memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
			ret = 0;
		}
	} else {
		/* Resent packet */
		DEBUGP("change expect: resent packet\n");
		if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
			ret = 0;
		} else {
			/* Force NAT to choose the same port again */
			ret = -1;
		}
	}
	WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);

	return ret;
}

/* Alter reply tuple (maybe alter helper).  If it's already taken,
   return 0 and don't do alteration. */
int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
			     const struct ip_conntrack_tuple *newreply)
{
	WRITE_LOCK(&ip_conntrack_lock);
	if (__ip_conntrack_find(newreply, conntrack)) {
		WRITE_UNLOCK(&ip_conntrack_lock);
		return 0;
	}
	/* Should be unconfirmed, so not in hash table yet */
	IP_NF_ASSERT(!is_confirmed(conntrack));

	DEBUGP("Altering reply tuple of %p to ", conntrack);
	DUMP_TUPLE(newreply);

	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (!conntrack->master && list_empty(&conntrack->sibling_list))
		conntrack->helper = ip_ct_find_helper(newreply);
	WRITE_UNLOCK(&ip_conntrack_lock);

	return 1;
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
	MOD_INC_USE_COUNT;

	WRITE_LOCK(&ip_conntrack_lock);
	list_prepend(&helpers, me);
	WRITE_UNLOCK(&ip_conntrack_lock);

	return 0;
}
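
/* Module-side sketch (illustrative): a helper registers itself on load
 * and unregisters on unload; ip_conntrack_helper_unregister() below then
 * clears ct->helper everywhere and flushes pending expectations, so no
 * conntrack is left pointing at unloaded code. */
#if 0
static int __init example_helper_init(void)
{
	example_helper_setup();	/* hypothetical, see the sketch further up */
	return ip_conntrack_helper_register(&example_helper);
}

static void __exit example_helper_fini(void)
{
	ip_conntrack_helper_unregister(&example_helper);
}
#endif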

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
			 const struct ip_conntrack_helper *me)
{
	if (i->ctrack->helper == me) {
		/* Get rid of any expected. */
		remove_expectations(i->ctrack, 0);
		/* And *then* set helper to NULL */
		i->ctrack->helper = NULL;
	}
	return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
	unsigned int i;

	/* Need write lock here, to delete helper. */
	WRITE_LOCK(&ip_conntrack_lock);
	LIST_DELETE(&helpers, me);

	/* Get rid of expecteds, set helpers to NULL. */
	LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
	for (i = 0; i < ip_conntrack_htable_size; i++)
		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
			    struct ip_conntrack_tuple_hash *, me);
	WRITE_UNLOCK(&ip_conntrack_lock);

	/* Someone could be still looking at the helper in a bh. */
	br_write_lock_bh(BR_NETPROTO_LOCK);
	br_write_unlock_bh(BR_NETPROTO_LOCK);

	MOD_DEC_USE_COUNT;
}

/* Refresh conntrack for this many jiffies. */
void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
{
	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

	WRITE_LOCK(&ip_conntrack_lock);
	/* If not in hash table, timer will not be active yet */
	if (!is_confirmed(ct))
		ct->timeout.expires = extra_jiffies;
	else {
		/* Need del_timer for race avoidance (may already be dying). */
		if (del_timer(&ct->timeout)) {
			ct->timeout.expires = jiffies + extra_jiffies;
			add_timer(&ct->timeout);
		}
	}
	WRITE_UNLOCK(&ip_conntrack_lock);
}
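
/* Sketch of the intended caller (illustrative; the 30 second value is an
 * assumption, not a real protocol timeout): a protocol's packet() handler
 * calls ip_ct_refresh() for every accepted packet, so an entry only
 * expires once the connection has been idle for the whole interval. */
#if 0
static int example_proto_packet(struct ip_conntrack *ct,
				struct iphdr *iph, size_t len,
				enum ip_conntrack_info ctinfo)
{
	ip_ct_refresh(ct, 30 * HZ);
	return NF_ACCEPT;
}
#endif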

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
	struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
	unsigned int olddebug = skb->nf_debug;
#endif

	if (sk) {
		sock_hold(sk);
		skb_orphan(skb);
	}

	local_bh_disable();
	skb = ip_defrag(skb, user);
	local_bh_enable();

	if (!skb) {
		if (sk) sock_put(sk);
		return skb;
	} else if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) {
		kfree_skb(skb);
		if (sk) sock_put(sk);
		return NULL;
	}

	if (sk) {
		skb_set_owner_w(skb, sk);
		sock_put(sk);
	}

	ip_send_check(skb->nh.iph);
	skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
	/* Packet path as if nothing had happened. */
	skb->nf_debug = olddebug;
#endif
	return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	ct = __ip_conntrack_get(nfct, &ctinfo);

	/* This ICMP is in reverse direction to the packet which
           caused it */
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach new skbuff, and increment count */
	nskb->nfct = &ct->infos[ctinfo];
	atomic_inc(&ct->ct_general.use);
}

static inline int
do_iter(const struct ip_conntrack_tuple_hash *i,
        int (*iter)(struct ip_conntrack *i, void *data),
	void *data)
{
	return iter(i->ctrack, data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
		void *data, unsigned int *bucket)
{
	struct ip_conntrack_tuple_hash *h = NULL;

	WRITE_LOCK(&ip_conntrack_lock);
	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
		h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
		                struct ip_conntrack_tuple_hash *, iter, data);
		if (h)
			break;
	}
	if (!h)
		h = LIST_FIND_W(&unconfirmed, do_iter,
		                struct ip_conntrack_tuple_hash *, iter, data);
	if (h)
		atomic_inc(&h->ctrack->ct_general.use);
	WRITE_UNLOCK(&ip_conntrack_lock);

	return h;
}

void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int bucket = 0;

	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
		/* Time to push up daisies... */
		if (del_timer(&h->ctrack->timeout))
			death_by_timeout((unsigned long)h->ctrack);
		/* ... else the timer will get him soon. */

		ip_conntrack_put(h->ctrack);
	}
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void *user, int *len)
{
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack_tuple tuple;

	IP_CT_TUPLE_U_BLANK(&tuple);
	tuple.src.ip = sk->rcv_saddr;
	tuple.src.u.tcp.port = sk->sport;
	tuple.dst.ip = sk->daddr;
	tuple.dst.u.tcp.port = sk->dport;
	tuple.dst.protonum = IPPROTO_TCP;

	/* We only do TCP at the moment: is there a better way? */
	if (strcmp(sk->prot->name, "TCP") != 0) {
		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
		return -ENOPROTOOPT;
	}

	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
		DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
		       *len, sizeof(struct sockaddr_in));
		return -EINVAL;
	}

	h = ip_conntrack_find_get(&tuple, NULL);
	if (h) {
		struct sockaddr_in sin;

		sin.sin_family = AF_INET;
		sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.u.tcp.port;
		sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.ip;
		memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
		ip_conntrack_put(h->ctrack);
		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
			return -EFAULT;
		else
			return 0;
	}
	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
	       NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
	       NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
	return -ENOENT;
}
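
/* Userspace view of the option above (sketch; this is a normal program,
 * not part of the module): a transparent proxy that accepted a redirected
 * TCP connection can recover the original destination address with
 * getsockopt(SOL_IP, SO_ORIGINAL_DST). */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4.h>	/* SO_ORIGINAL_DST */

static int original_dst(int connfd, struct sockaddr_in *dst)
{
	socklen_t len = sizeof(*dst);

	return getsockopt(connfd, SOL_IP, SO_ORIGINAL_DST, dst, &len);
}
#endif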

static struct nf_sockopt_ops so_getorigdst
= { { NULL, NULL }, PF_INET,
    0, 0, NULL, /* Setsockopts */
    SO_ORIGINAL_DST, SO_ORIGINAL_DST+1, &getorigdst,
    0, NULL };

static int kill_all(struct ip_conntrack *i, void *data)
{
	return 1;
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
	ip_ct_attach = NULL;
	/* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
	br_write_lock_bh(BR_NETPROTO_LOCK);
	br_write_unlock_bh(BR_NETPROTO_LOCK);

 i_see_dead_people:
	ip_ct_iterate_cleanup(kill_all, NULL);
	if (atomic_read(&ip_conntrack_count) != 0) {
		schedule();
		goto i_see_dead_people;
	}

	kmem_cache_destroy(ip_conntrack_cachep);
	vfree(ip_conntrack_hash);
	nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize = 0;
MODULE_PARM(hashsize, "i");

int __init ip_conntrack_init(void)
{
	unsigned int i;
	int ret;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
	if (hashsize) {
		ip_conntrack_htable_size = hashsize;
	} else {
		ip_conntrack_htable_size
			= (((num_physpages << PAGE_SHIFT) / 16384)
			   / sizeof(struct list_head));
		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
			ip_conntrack_htable_size = 8192;
		if (ip_conntrack_htable_size < 16)
			ip_conntrack_htable_size = 16;
	}
	ip_conntrack_max = 8 * ip_conntrack_htable_size;

	printk("ip_conntrack version %s (%u buckets, %d max)"
	       " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
	       ip_conntrack_htable_size, ip_conntrack_max,
	       sizeof(struct ip_conntrack));

	ret = nf_register_sockopt(&so_getorigdst);
	if (ret != 0) {
		printk(KERN_ERR "Unable to register netfilter socket option\n");
		return ret;
	}

	ip_conntrack_hash = vmalloc(sizeof(struct list_head)
				    * ip_conntrack_htable_size);
	if (!ip_conntrack_hash) {
		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
		goto err_unreg_sockopt;
	}

	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
	                                        sizeof(struct ip_conntrack), 0,
	                                        SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!ip_conntrack_cachep) {
		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
		goto err_free_hash;
	}
	/* Don't NEED lock here, but good form anyway. */
	WRITE_LOCK(&ip_conntrack_lock);
	/* Sew in builtin protocols. */
	list_append(&protocol_list, &ip_conntrack_protocol_tcp);
	list_append(&protocol_list, &ip_conntrack_protocol_udp);
	list_append(&protocol_list, &ip_conntrack_protocol_icmp);
	WRITE_UNLOCK(&ip_conntrack_lock);

	for (i = 0; i < ip_conntrack_htable_size; i++)
		INIT_LIST_HEAD(&ip_conntrack_hash[i]);

	/* For use by ipt_REJECT */
	ip_ct_attach = ip_conntrack_attach;
	return ret;

err_free_hash:
	vfree(ip_conntrack_hash);
err_unreg_sockopt:
	nf_unregister_sockopt(&so_getorigdst);

	return -ENOMEM;
}