1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5 /* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
6 * Public Licence.
7 *
8 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
9 * - new API and handling of conntrack/nat helpers
10 * - now capable of multiple expectations for one master
11 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
12 * - add usage/reference counts to ip_conntrack_expect
13 * - export ip_conntrack[_expect]_{find_get,put} functions
14 * */
15
16 #include <linux/version.h>
17 #include <linux/config.h>
18 #include <linux/types.h>
19 #include <linux/ip.h>
20 #include <linux/netfilter.h>
21 #include <linux/netfilter_ipv4.h>
22 #include <linux/module.h>
23 #include <linux/skbuff.h>
24 #include <linux/proc_fs.h>
25 #include <linux/vmalloc.h>
26 #include <linux/brlock.h>
27 #include <net/checksum.h>
28 #include <linux/stddef.h>
29 #include <linux/sysctl.h>
30 #include <linux/slab.h>
31 #include <linux/random.h>
32 #include <linux/jhash.h>
33 /* For ERR_PTR(). Yeah, I know... --RR */
34 #include <linux/fs.h>
35
36 /* This rwlock protects the main hash table, protocol/helper/expected
37 registrations, and conntrack timers. */
38 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
39 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
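
/* Locking sketch, as used throughout this file: lookups such as
 * __ip_conntrack_find() run under READ_LOCK(&ip_conntrack_lock), and
 * anything that modifies the hash table, the expectation lists or the
 * helper list takes WRITE_LOCK().  The ASSERT_READ_LOCK/ASSERT_WRITE_LOCK
 * macros above are what listhelp.h uses to verify this at runtime. */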
40
41 #include <linux/netfilter_ipv4/ip_conntrack.h>
42 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
43 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
44 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
45 #include <linux/netfilter_ipv4/listhelp.h>
46
47 #define IP_CONNTRACK_VERSION "2.1"
48
49 #if 0
50 #define DEBUGP printk
51 #else
52 #define DEBUGP(format, args...)
53 #endif
54
55 DECLARE_RWLOCK(ip_conntrack_lock);
56 DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
57
58 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
59 LIST_HEAD(ip_conntrack_expect_list);
60 LIST_HEAD(protocol_list);
61 static LIST_HEAD(helpers);
62 unsigned int ip_conntrack_htable_size = 0;
63 int ip_conntrack_max = 0;
64 static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65 struct list_head *ip_conntrack_hash;
66 static kmem_cache_t *ip_conntrack_cachep;
67 static LIST_HEAD(unconfirmed);
68
69 extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
70
71 static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
72 u_int8_t protocol)
73 {
74 return protocol == curr->proto;
75 }
76
77 struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
78 {
79 struct ip_conntrack_protocol *p;
80
81 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
82 p = LIST_FIND(&protocol_list, proto_cmpfn,
83 struct ip_conntrack_protocol *, protocol);
84 if (!p)
85 p = &ip_conntrack_generic_protocol;
86
87 return p;
88 }
89
90 struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
91 {
92 struct ip_conntrack_protocol *p;
93
94 READ_LOCK(&ip_conntrack_lock);
95 p = __ip_ct_find_proto(protocol);
96 READ_UNLOCK(&ip_conntrack_lock);
97 return p;
98 }
99
100 inline void
101 ip_conntrack_put(struct ip_conntrack *ct)
102 {
103 IP_NF_ASSERT(ct);
104 IP_NF_ASSERT(ct->infos[0].master);
105 /* nf_conntrack_put wants to go via an info struct, so feed it
106 one at random. */
107 nf_conntrack_put(&ct->infos[0]);
108 }
109
110 static int ip_conntrack_hash_rnd_initted;
111 static unsigned int ip_conntrack_hash_rnd;
112
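/* hash_conntrack() folds a tuple into a bucket index with jhash_3words():
 * word 1 is the source address, word 2 the destination address XORed with
 * the protocol number, and word 3 packs both port/id fields into 32 bits.
 * ip_conntrack_hash_rnd is filled from get_random_bytes() on first use
 * (see init_conntrack()), so remote hosts cannot trivially aim every
 * connection at a single hash chain. */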
113 static u_int32_t
114 hash_conntrack(const struct ip_conntrack_tuple *tuple)
115 {
116 #if 0
117 dump_tuple(tuple);
118 #endif
119 return (jhash_3words(tuple->src.ip,
120 (tuple->dst.ip ^ tuple->dst.protonum),
121 (tuple->src.u.all | (tuple->dst.u.all << 16)),
122 ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
123 }
124
125 inline int
126 ip_ct_get_tuple(const struct iphdr *iph, size_t len,
127 struct ip_conntrack_tuple *tuple,
128 struct ip_conntrack_protocol *protocol)
129 {
130 int ret;
131
132 	/* Should never happen. */
133 if (iph->frag_off & htons(IP_OFFSET)) {
134 printk("ip_conntrack_core: Frag of proto %u.\n",
135 iph->protocol);
136 return 0;
137 }
138 /* Guarantee 8 protocol bytes: if more wanted, use len param */
139 else if (iph->ihl * 4 + 8 > len)
140 return 0;
141
142 tuple->src.ip = iph->saddr;
143 tuple->dst.ip = iph->daddr;
144 tuple->dst.protonum = iph->protocol;
145
146 ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
147 len - 4*iph->ihl,
148 tuple);
149 return ret;
150 }
151
152 static int
153 invert_tuple(struct ip_conntrack_tuple *inverse,
154 const struct ip_conntrack_tuple *orig,
155 const struct ip_conntrack_protocol *protocol)
156 {
157 inverse->src.ip = orig->dst.ip;
158 inverse->dst.ip = orig->src.ip;
159 inverse->dst.protonum = orig->dst.protonum;
160
161 return protocol->invert_tuple(inverse, orig);
162 }
163
164
165 /* ip_conntrack_expect helper functions */
166
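/* Background: an expectation is how a helper (e.g. the FTP or IRC helper
 * modules) announces that a related connection is about to appear.  It
 * holds a tuple plus a mask; expect_cmp() matches an incoming packet's
 * tuple against that masked template, and init_conntrack() then marks the
 * new connection IP_CT_RELATED and links it to its master conntrack. */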
167 /* Compare tuple parts depending on mask. */
168 static inline int expect_cmp(const struct ip_conntrack_expect *i,
169 const struct ip_conntrack_tuple *tuple)
170 {
171 MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
172 return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
173 }
174
175 static void
176 destroy_expect(struct ip_conntrack_expect *exp)
177 {
178 DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
179 IP_NF_ASSERT(atomic_read(&exp->use) == 0);
180 IP_NF_ASSERT(!timer_pending(&exp->timeout));
181
182 kfree(exp);
183 }
184
185 inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
186 {
187 IP_NF_ASSERT(exp);
188
189 if (atomic_dec_and_test(&exp->use)) {
190 /* usage count dropped to zero */
191 destroy_expect(exp);
192 }
193 }
194
195 static inline struct ip_conntrack_expect *
196 __ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
197 {
198 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
199 MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
200 return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
201 struct ip_conntrack_expect *, tuple);
202 }
203
204 /* Find an expectation corresponding to a tuple. */
205 struct ip_conntrack_expect *
206 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
207 {
208 struct ip_conntrack_expect *exp;
209
210 READ_LOCK(&ip_conntrack_lock);
211 READ_LOCK(&ip_conntrack_expect_tuple_lock);
212 exp = __ip_ct_expect_find(tuple);
213 if (exp)
214 atomic_inc(&exp->use);
215 READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
216 READ_UNLOCK(&ip_conntrack_lock);
217
218 return exp;
219 }
220
221 /* remove one specific expectation from all lists and drop refcount,
222 * does _NOT_ delete the timer. */
223 static void __unexpect_related(struct ip_conntrack_expect *expect)
224 {
225 DEBUGP("unexpect_related(%p)\n", expect);
226 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
227
228 /* we're not allowed to unexpect a confirmed expectation! */
229 IP_NF_ASSERT(!expect->sibling);
230
231 /* delete from global and local lists */
232 list_del(&expect->list);
233 list_del(&expect->expected_list);
234
235 /* decrement expect-count of master conntrack */
236 if (expect->expectant)
237 expect->expectant->expecting--;
238
239 ip_conntrack_expect_put(expect);
240 }
241
242 /* remove one specific expectation from all lists, drop refcount
243 * and expire timer.
244 * This function can _NOT_ be called for confirmed expects! */
245 static void unexpect_related(struct ip_conntrack_expect *expect)
246 {
247 IP_NF_ASSERT(expect->expectant);
248 IP_NF_ASSERT(expect->expectant->helper);
249 /* if we are supposed to have a timer, but we can't delete
250 * it: race condition. __unexpect_related will
251 	 * be called by the timeout function */
252 if (expect->expectant->helper->timeout
253 && !del_timer(&expect->timeout))
254 return;
255
256 __unexpect_related(expect);
257 }
258
259 /* delete all unconfirmed expectations for this conntrack */
260 static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
261 {
262 struct list_head *exp_entry, *next;
263 struct ip_conntrack_expect *exp;
264
265 DEBUGP("remove_expectations(%p)\n", ct);
266
267 list_for_each_safe(exp_entry, next, &ct->sibling_list) {
268 exp = list_entry(exp_entry, struct ip_conntrack_expect,
269 expected_list);
270
271 /* we skip established expectations, as we want to delete
272 * the un-established ones only */
273 if (exp->sibling) {
274 DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
275 if (drop_refcount) {
276 			/* Indicate that this expectation's parent is dead */
277 ip_conntrack_put(exp->expectant);
278 exp->expectant = NULL;
279 }
280 continue;
281 }
282
283 IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
284 IP_NF_ASSERT(exp->expectant == ct);
285
286 /* delete expectation from global and private lists */
287 unexpect_related(exp);
288 }
289 }
290
291 static void
292 clean_from_lists(struct ip_conntrack *ct)
293 {
294 unsigned int ho, hr;
295
296 DEBUGP("clean_from_lists(%p)\n", ct);
297 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
298
299 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
300 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
301 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
302 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
303
304 /* Destroy all un-established, pending expectations */
305 remove_expectations(ct, 1);
306 }
307
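/* Lifetime: conntracks are reference counted via ct_general.use.
 * ip_conntrack_put() drops a reference through nf_conntrack_put(); when
 * the count hits zero the destroy hook installed in init_conntrack()
 * invokes destroy_conntrack() below, which is why it must never be
 * entered while holding a write lock on ip_conntrack_lock. */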
308 static void
309 destroy_conntrack(struct nf_conntrack *nfct)
310 {
311 struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
312 struct ip_conntrack_protocol *proto;
313
314 DEBUGP("destroy_conntrack(%p)\n", ct);
315 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
316 IP_NF_ASSERT(!timer_pending(&ct->timeout));
317
318 /* To make sure we don't get any weird locking issues here:
319 * destroy_conntrack() MUST NOT be called with a write lock
320 * to ip_conntrack_lock!!! -HW */
321 proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
322 if (proto && proto->destroy)
323 proto->destroy(ct);
324
325 if (ip_conntrack_destroyed)
326 ip_conntrack_destroyed(ct);
327
328 WRITE_LOCK(&ip_conntrack_lock);
329 	/* Make sure we don't leave any orphaned expectations lying around */
330 if (ct->expecting)
331 remove_expectations(ct, 1);
332
333 /* We overload first tuple to link into unconfirmed list. */
334 if (!is_confirmed(ct)) {
335 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
336 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
337 }
338
339 /* Delete our master expectation */
340 if (ct->master) {
341 if (ct->master->expectant) {
342 /* can't call __unexpect_related here,
343 * since it would screw up expect_list */
344 list_del(&ct->master->expected_list);
345 master = ct->master->expectant;
346 }
347 kfree(ct->master);
348 }
349 WRITE_UNLOCK(&ip_conntrack_lock);
350
351 if (master)
352 ip_conntrack_put(master);
353
354 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
355 kmem_cache_free(ip_conntrack_cachep, ct);
356 atomic_dec(&ip_conntrack_count);
357 }
358
359 static void death_by_timeout(unsigned long ul_conntrack)
360 {
361 struct ip_conntrack *ct = (void *)ul_conntrack;
362
363 WRITE_LOCK(&ip_conntrack_lock);
364 clean_from_lists(ct);
365 WRITE_UNLOCK(&ip_conntrack_lock);
366 ip_conntrack_put(ct);
367 }
368
369 static inline int
370 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
371 const struct ip_conntrack_tuple *tuple,
372 const struct ip_conntrack *ignored_conntrack)
373 {
374 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
375 return i->ctrack != ignored_conntrack
376 && ip_ct_tuple_equal(tuple, &i->tuple);
377 }
378
379 static struct ip_conntrack_tuple_hash *
380 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
381 const struct ip_conntrack *ignored_conntrack)
382 {
383 struct ip_conntrack_tuple_hash *h;
384 unsigned int hash = hash_conntrack(tuple);
385
386 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
387 h = LIST_FIND(&ip_conntrack_hash[hash],
388 conntrack_tuple_cmp,
389 struct ip_conntrack_tuple_hash *,
390 tuple, ignored_conntrack);
391 return h;
392 }
393
394 /* Find a connection corresponding to a tuple. */
395 struct ip_conntrack_tuple_hash *
396 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
397 const struct ip_conntrack *ignored_conntrack)
398 {
399 struct ip_conntrack_tuple_hash *h;
400
401 READ_LOCK(&ip_conntrack_lock);
402 h = __ip_conntrack_find(tuple, ignored_conntrack);
403 if (h)
404 atomic_inc(&h->ctrack->ct_general.use);
405 READ_UNLOCK(&ip_conntrack_lock);
406
407 return h;
408 }
409
410 static inline struct ip_conntrack *
411 __ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
412 {
413 struct ip_conntrack *ct
414 = (struct ip_conntrack *)nfct->master;
415
416 /* ctinfo is the index of the nfct inside the conntrack */
417 *ctinfo = nfct - ct->infos;
418 IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
419 return ct;
420 }
421
422 /* Return conntrack and conntrack_info given skb->nfct->master */
423 struct ip_conntrack *
424 ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
425 {
426 if (skb->nfct)
427 return __ip_conntrack_get(skb->nfct, ctinfo);
428 return NULL;
429 }
430
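/* Background: a conntrack is allocated on the first packet
 * (init_conntrack()) but parked on the "unconfirmed" list; only when that
 * packet survives all hooks does __ip_conntrack_confirm() move it into
 * the hash table and start its timer.  Until then NAT may still rewrite
 * the reply tuple (see ip_conntrack_alter_reply()). */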
431 /* Confirm a connection given skb->nfct; places it in hash table */
432 int
433 __ip_conntrack_confirm(struct nf_ct_info *nfct)
434 {
435 unsigned int hash, repl_hash;
436 struct ip_conntrack *ct;
437 enum ip_conntrack_info ctinfo;
438
439 ct = __ip_conntrack_get(nfct, &ctinfo);
440
441 /* ipt_REJECT uses ip_conntrack_attach to attach related
442 ICMP/TCP RST packets in other direction. Actual packet
443 which created connection will be IP_CT_NEW or for an
444 expected connection, IP_CT_RELATED. */
445 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
446 return NF_ACCEPT;
447
448 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
449 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
450
451 /* We're not in hash table, and we refuse to set up related
452 connections for unconfirmed conns. But packet copies and
453 REJECT will give spurious warnings here. */
454 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
455
456 	/* No external references means no one else could have
457 confirmed us. */
458 IP_NF_ASSERT(!is_confirmed(ct));
459 DEBUGP("Confirming conntrack %p\n", ct);
460
461 WRITE_LOCK(&ip_conntrack_lock);
462 /* See if there's one in the list already, including reverse:
463 NAT could have grabbed it without realizing, since we're
464 	   not in the hash. If there is, we lost the race. */
465 if (!LIST_FIND(&ip_conntrack_hash[hash],
466 conntrack_tuple_cmp,
467 struct ip_conntrack_tuple_hash *,
468 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
469 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
470 conntrack_tuple_cmp,
471 struct ip_conntrack_tuple_hash *,
472 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
473 /* Remove from unconfirmed list */
474 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
475
476 list_prepend(&ip_conntrack_hash[hash],
477 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
478 list_prepend(&ip_conntrack_hash[repl_hash],
479 &ct->tuplehash[IP_CT_DIR_REPLY]);
480 /* Timer relative to confirmation time, not original
481 setting time, otherwise we'd get timer wrap in
482 weird delay cases. */
483 ct->timeout.expires += jiffies;
484 add_timer(&ct->timeout);
485 atomic_inc(&ct->ct_general.use);
486 set_bit(IPS_CONFIRMED_BIT, &ct->status);
487 WRITE_UNLOCK(&ip_conntrack_lock);
488 return NF_ACCEPT;
489 }
490
491 WRITE_UNLOCK(&ip_conntrack_lock);
492 return NF_DROP;
493 }
494
495 /* Returns true if a connection corresponds to the tuple (required
496 for NAT). */
497 int
498 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
499 const struct ip_conntrack *ignored_conntrack)
500 {
501 struct ip_conntrack_tuple_hash *h;
502
503 READ_LOCK(&ip_conntrack_lock);
504 h = __ip_conntrack_find(tuple, ignored_conntrack);
505 READ_UNLOCK(&ip_conntrack_lock);
506
507 return h != NULL;
508 }
509
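/* Background: ICMP errors (dest-unreach, time-exceeded, ...) carry the
 * header of the packet that triggered them.  icmp_error_track() rebuilds
 * a tuple from that embedded header, looks up the connection it belongs
 * to, and attaches the skb to it as IP_CT_RELATED so the error is
 * accepted for an established flow. */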
510 /* Returns conntrack if it dealt with ICMP, and filled in skb fields */
511 struct ip_conntrack *
512 icmp_error_track(struct sk_buff *skb,
513 enum ip_conntrack_info *ctinfo,
514 unsigned int hooknum)
515 {
516 const struct iphdr *iph = skb->nh.iph;
517 struct icmphdr *hdr;
518 struct ip_conntrack_tuple innertuple, origtuple;
519 struct iphdr *inner;
520 size_t datalen;
521 struct ip_conntrack_protocol *innerproto;
522 struct ip_conntrack_tuple_hash *h;
523
524 IP_NF_ASSERT(iph->protocol == IPPROTO_ICMP);
525 IP_NF_ASSERT(skb->nfct == NULL);
526
527 hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
528 inner = (struct iphdr *)(hdr + 1);
529 datalen = skb->len - iph->ihl*4 - sizeof(*hdr);
530
531 if (skb->len < iph->ihl * 4 + sizeof(*hdr) + sizeof(*iph)) {
532 DEBUGP("icmp_error_track: too short\n");
533 return NULL;
534 }
535
536 if (hdr->type != ICMP_DEST_UNREACH
537 && hdr->type != ICMP_SOURCE_QUENCH
538 && hdr->type != ICMP_TIME_EXCEEDED
539 && hdr->type != ICMP_PARAMETERPROB
540 && hdr->type != ICMP_REDIRECT)
541 return NULL;
542
543 /* Ignore ICMP's containing fragments (shouldn't happen) */
544 if (inner->frag_off & htons(IP_OFFSET)) {
545 DEBUGP("icmp_error_track: fragment of proto %u\n",
546 inner->protocol);
547 return NULL;
548 }
549
550 /* Ignore it if the checksum's bogus. */
551 if (ip_compute_csum((unsigned char *)hdr, sizeof(*hdr) + datalen)) {
552 DEBUGP("icmp_error_track: bad csum\n");
553 return NULL;
554 }
555
556 innerproto = ip_ct_find_proto(inner->protocol);
557 /* Are they talking about one of our connections? */
558 if (inner->ihl * 4 + 8 > datalen
559 || !ip_ct_get_tuple(inner, datalen, &origtuple, innerproto)) {
560 DEBUGP("icmp_error: ! get_tuple p=%u (%u*4+%u dlen=%u)\n",
561 inner->protocol, inner->ihl, 8,
562 datalen);
563 return NULL;
564 }
565
566 /* Ordinarily, we'd expect the inverted tupleproto, but it's
567 been preserved inside the ICMP. */
568 if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
569 DEBUGP("icmp_error_track: Can't invert tuple\n");
570 return NULL;
571 }
572
573 *ctinfo = IP_CT_RELATED;
574
575 h = ip_conntrack_find_get(&innertuple, NULL);
576 if (!h) {
577 /* Locally generated ICMPs will match inverted if they
578 haven't been SNAT'ed yet */
579 /* FIXME: NAT code has to handle half-done double NAT --RR */
580 if (hooknum == NF_IP_LOCAL_OUT)
581 h = ip_conntrack_find_get(&origtuple, NULL);
582
583 if (!h) {
584 DEBUGP("icmp_error_track: no match\n");
585 return NULL;
586 }
587 /* Reverse direction from that found */
588 if (DIRECTION(h) != IP_CT_DIR_REPLY)
589 *ctinfo += IP_CT_IS_REPLY;
590 } else {
591 if (DIRECTION(h) == IP_CT_DIR_REPLY)
592 *ctinfo += IP_CT_IS_REPLY;
593 }
594
595 /* Update skb to refer to this connection */
596 skb->nfct = &h->ctrack->infos[*ctinfo];
597 return h->ctrack;
598 }
599
600 /* There's a small race here where we may free a just-assured
601 connection. Too bad: we're in trouble anyway. */
602 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
603 {
604 return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
605 }
606
607 static int early_drop(struct list_head *chain)
608 {
609 /* Traverse backwards: gives us oldest, which is roughly LRU */
610 struct ip_conntrack_tuple_hash *h;
611 int dropped = 0;
612
613 READ_LOCK(&ip_conntrack_lock);
614 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
615 if (h)
616 atomic_inc(&h->ctrack->ct_general.use);
617 READ_UNLOCK(&ip_conntrack_lock);
618
619 if (!h)
620 return dropped;
621
622 if (del_timer(&h->ctrack->timeout)) {
623 death_by_timeout((unsigned long)h->ctrack);
624 dropped = 1;
625 }
626 ip_conntrack_put(h->ctrack);
627 return dropped;
628 }
629
630 static inline int helper_cmp(const struct ip_conntrack_helper *i,
631 const struct ip_conntrack_tuple *rtuple)
632 {
633 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
634 }
635
636 struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
637 {
638 return LIST_FIND(&helpers, helper_cmp,
639 struct ip_conntrack_helper *,
640 tuple);
641 }
642
643 /* Allocate a new conntrack: we return -ENOMEM if classification
644 failed due to stress. Otherwise it really is unclassifiable. */
645 static struct ip_conntrack_tuple_hash *
646 init_conntrack(const struct ip_conntrack_tuple *tuple,
647 struct ip_conntrack_protocol *protocol,
648 struct sk_buff *skb)
649 {
650 struct ip_conntrack *conntrack;
651 struct ip_conntrack_tuple repl_tuple;
652 size_t hash;
653 struct ip_conntrack_expect *expected;
654 int i;
655 static unsigned int drop_next = 0;
656
657 if (!ip_conntrack_hash_rnd_initted) {
658 get_random_bytes(&ip_conntrack_hash_rnd, 4);
659 ip_conntrack_hash_rnd_initted = 1;
660 }
661
662 hash = hash_conntrack(tuple);
663
664 if (ip_conntrack_max &&
665 atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
666 		/* Try dropping from a random chain, or else from the
667 		   chain we are about to put it into (in case they're trying to
668 		   bomb one hash chain). */
669 unsigned int next = (drop_next++)%ip_conntrack_htable_size;
670
671 if (!early_drop(&ip_conntrack_hash[next])
672 && !early_drop(&ip_conntrack_hash[hash])) {
673 if (net_ratelimit())
674 printk(KERN_WARNING
675 "ip_conntrack: table full, dropping"
676 " packet.\n");
677 return ERR_PTR(-ENOMEM);
678 }
679 }
680
681 if (!invert_tuple(&repl_tuple, tuple, protocol)) {
682 DEBUGP("Can't invert tuple.\n");
683 return NULL;
684 }
685
686 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
687 if (!conntrack) {
688 DEBUGP("Can't allocate conntrack.\n");
689 return ERR_PTR(-ENOMEM);
690 }
691
692 memset(conntrack, 0, sizeof(*conntrack));
693 atomic_set(&conntrack->ct_general.use, 1);
694 conntrack->ct_general.destroy = destroy_conntrack;
695 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
696 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
697 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
698 conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
699 for (i=0; i < IP_CT_NUMBER; i++)
700 conntrack->infos[i].master = &conntrack->ct_general;
701
702 if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
703 kmem_cache_free(ip_conntrack_cachep, conntrack);
704 return NULL;
705 }
706 /* Don't set timer yet: wait for confirmation */
707 init_timer(&conntrack->timeout);
708 conntrack->timeout.data = (unsigned long)conntrack;
709 conntrack->timeout.function = death_by_timeout;
710
711 INIT_LIST_HEAD(&conntrack->sibling_list);
712
713 WRITE_LOCK(&ip_conntrack_lock);
714 	/* We need to find and delete the expectation ONLY if we win the race */
715 READ_LOCK(&ip_conntrack_expect_tuple_lock);
716 expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
717 struct ip_conntrack_expect *, tuple);
718 READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
719
720 /* If master is not in hash table yet (ie. packet hasn't left
721 this machine yet), how can other end know about expected?
722 Hence these are not the droids you are looking for (if
723 master ct never got confirmed, we'd hold a reference to it
724 and weird things would happen to future packets). */
725 if (expected && !is_confirmed(expected->expectant))
726 expected = NULL;
727
728 /* Look up the conntrack helper for master connections only */
729 if (!expected)
730 conntrack->helper = ip_ct_find_helper(&repl_tuple);
731
732 	/* If the expectation is dying, then this is a loser. */
733 if (expected
734 && expected->expectant->helper->timeout
735 && ! del_timer(&expected->timeout))
736 expected = NULL;
737
738 if (expected) {
739 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
740 conntrack, expected);
741 /* Welcome, Mr. Bond. We've been expecting you... */
742 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
743 conntrack->master = expected;
744 expected->sibling = conntrack;
745 LIST_DELETE(&ip_conntrack_expect_list, expected);
746 expected->expectant->expecting--;
747 nf_conntrack_get(&master_ct(conntrack)->infos[0]);
748 }
749 /* Overload tuple linked list to put us in unconfirmed list. */
750 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list,
751 &unconfirmed);
752
753 atomic_inc(&ip_conntrack_count);
754 WRITE_UNLOCK(&ip_conntrack_lock);
755
756 if (expected && expected->expectfn)
757 expected->expectfn(conntrack);
758 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
759 }
760
761 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
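/* ctinfo encodes both state and direction, as set below: reply-direction
 * packets get IP_CT_ESTABLISHED + IP_CT_IS_REPLY; original-direction
 * packets get IP_CT_ESTABLISHED once a reply has been seen, IP_CT_RELATED
 * if the connection was expected, or IP_CT_NEW otherwise.  The same value
 * serves as the index into ct->infos[]. */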
762 static inline struct ip_conntrack *
763 resolve_normal_ct(struct sk_buff *skb,
764 struct ip_conntrack_protocol *proto,
765 int *set_reply,
766 unsigned int hooknum,
767 enum ip_conntrack_info *ctinfo)
768 {
769 struct ip_conntrack_tuple tuple;
770 struct ip_conntrack_tuple_hash *h;
771
772 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
773
774 if (!ip_ct_get_tuple(skb->nh.iph, skb->len, &tuple, proto))
775 return NULL;
776
777 /* look for tuple match */
778 h = ip_conntrack_find_get(&tuple, NULL);
779 if (!h) {
780 h = init_conntrack(&tuple, proto, skb);
781 if (!h)
782 return NULL;
783 if (IS_ERR(h))
784 return (void *)h;
785 }
786
787 /* It exists; we have (non-exclusive) reference. */
788 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
789 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
790 		/* Please set the reply bit if this packet is OK */
791 *set_reply = 1;
792 } else {
793 /* Once we've had two way comms, always ESTABLISHED. */
794 if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
795 DEBUGP("ip_conntrack_in: normal packet for %p\n",
796 h->ctrack);
797 *ctinfo = IP_CT_ESTABLISHED;
798 } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
799 DEBUGP("ip_conntrack_in: related packet for %p\n",
800 h->ctrack);
801 *ctinfo = IP_CT_RELATED;
802 } else {
803 DEBUGP("ip_conntrack_in: new packet for %p\n",
804 h->ctrack);
805 *ctinfo = IP_CT_NEW;
806 }
807 *set_reply = 0;
808 }
809 skb->nfct = &h->ctrack->infos[*ctinfo];
810 return h->ctrack;
811 }
812
813 /* Netfilter hook itself. */
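/* Rough flow: defragment if needed, pick the L4 protocol handler,
 * special-case ICMP errors, then resolve (or create) the conntrack entry.
 * proto->packet() updates per-protocol state and may return NF_DROP or -1
 * for invalid packets; if a helper is attached, helper->help() runs too,
 * and finally the SEEN_REPLY bit is set for reply-direction traffic. */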
814 unsigned int ip_conntrack_in(unsigned int hooknum,
815 struct sk_buff **pskb,
816 const struct net_device *in,
817 const struct net_device *out,
818 int (*okfn)(struct sk_buff *))
819 {
820 struct ip_conntrack *ct;
821 enum ip_conntrack_info ctinfo;
822 struct ip_conntrack_protocol *proto;
823 int set_reply;
824 int ret;
825
826 /* FIXME: Do this right please. --RR */
827 (*pskb)->nfcache |= NFC_UNKNOWN;
828
829 /* Doesn't cover locally-generated broadcast, so not worth it. */
830 #if 0
831 /* Ignore broadcast: no `connection'. */
832 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
833 printk("Broadcast packet!\n");
834 return NF_ACCEPT;
835 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
836 == htonl(0x000000FF)) {
837 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
838 NIPQUAD((*pskb)->nh.iph->saddr),
839 NIPQUAD((*pskb)->nh.iph->daddr),
840 (*pskb)->sk, (*pskb)->pkt_type);
841 }
842 #endif
843
844 /* Previously seen (loopback)? Ignore. Do this before
845 fragment check. */
846 if ((*pskb)->nfct)
847 return NF_ACCEPT;
848
849 /* Gather fragments. */
850 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
851 *pskb = ip_ct_gather_frags(*pskb,
852 hooknum == NF_IP_PRE_ROUTING ?
853 IP_DEFRAG_CONNTRACK_IN :
854 IP_DEFRAG_CONNTRACK_OUT);
855 if (!*pskb)
856 return NF_STOLEN;
857 }
858
859 proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
860
861 /* It may be an icmp error... */
862 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
863 && icmp_error_track(*pskb, &ctinfo, hooknum))
864 return NF_ACCEPT;
865
866 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
867 /* Not valid part of a connection */
868 return NF_ACCEPT;
869
870 if (IS_ERR(ct))
871 /* Too stressed to deal. */
872 return NF_DROP;
873
874 IP_NF_ASSERT((*pskb)->nfct);
875
876 ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo);
877 if (ret == -1) {
878 /* Invalid */
879 nf_conntrack_put((*pskb)->nfct);
880 (*pskb)->nfct = NULL;
881 return NF_ACCEPT;
882 }
883
884 if (ret != NF_DROP && ct->helper) {
885 ret = ct->helper->help((*pskb)->nh.iph, (*pskb)->len,
886 ct, ctinfo);
887 if (ret == -1) {
888 /* Invalid */
889 nf_conntrack_put((*pskb)->nfct);
890 (*pskb)->nfct = NULL;
891 return NF_ACCEPT;
892 }
893 }
894 if (set_reply)
895 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
896
897 return ret;
898 }
899
900 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
901 const struct ip_conntrack_tuple *orig)
902 {
903 return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
904 }
905
906 static inline int resent_expect(const struct ip_conntrack_expect *i,
907 const struct ip_conntrack_tuple *tuple,
908 const struct ip_conntrack_tuple *mask)
909 {
910 DEBUGP("resent_expect\n");
911 DEBUGP(" tuple: "); DUMP_TUPLE(&i->tuple);
912 DEBUGP("ct_tuple: "); DUMP_TUPLE(&i->ct_tuple);
913 DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
914 return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
915 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
916 && ip_ct_tuple_equal(&i->mask, mask));
917 }
918
919 /* Would two expected things clash? */
920 static inline int expect_clash(const struct ip_conntrack_expect *i,
921 const struct ip_conntrack_tuple *tuple,
922 const struct ip_conntrack_tuple *mask)
923 {
924 /* Part covered by intersection of masks must be unequal,
925 otherwise they clash */
926 struct ip_conntrack_tuple intersect_mask
927 = { { i->mask.src.ip & mask->src.ip,
928 { i->mask.src.u.all & mask->src.u.all } },
929 { i->mask.dst.ip & mask->dst.ip,
930 { i->mask.dst.u.all & mask->dst.u.all },
931 i->mask.dst.protonum & mask->dst.protonum } };
932
933 return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
934 }
935
936 inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
937 {
938 WRITE_LOCK(&ip_conntrack_lock);
939 unexpect_related(expect);
940 WRITE_UNLOCK(&ip_conntrack_lock);
941 }
942
943 static void expectation_timed_out(unsigned long ul_expect)
944 {
945 struct ip_conntrack_expect *expect = (void *) ul_expect;
946
947 DEBUGP("expectation %p timed out\n", expect);
948 WRITE_LOCK(&ip_conntrack_lock);
949 __unexpect_related(expect);
950 WRITE_UNLOCK(&ip_conntrack_lock);
951 }
952
953 /* Add a related connection. */
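/* Rough caller sketch (field names taken from their uses in this file;
 * a real helper such as the FTP module is the authoritative example):
 *
 *     struct ip_conntrack_expect exp;
 *
 *     memset(&exp, 0, sizeof(exp));
 *     exp.tuple = ...;        // tuple the related connection will match
 *     exp.mask = ...;         // which parts of the tuple must match
 *     exp.expectfn = NULL;    // optional callback run on match
 *     ip_conntrack_expect_related(ct, &exp);
 *
 * The expectation is copied into a freshly allocated structure below, so
 * a stack-allocated template is fine. */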
954 int ip_conntrack_expect_related(struct ip_conntrack *related_to,
955 struct ip_conntrack_expect *expect)
956 {
957 struct ip_conntrack_expect *old, *new;
958 int ret = 0;
959
960 WRITE_LOCK(&ip_conntrack_lock);
961 /* Because of the write lock, no reader can walk the lists,
962 * so there is no need to use the tuple lock too */
963
964 DEBUGP("ip_conntrack_expect_related %p\n", related_to);
965 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
966 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
967
968 old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
969 struct ip_conntrack_expect *, &expect->tuple,
970 &expect->mask);
971 if (old) {
972 /* Helper private data may contain offsets but no pointers
973 		   pointing into the payload - otherwise we would have to copy
974 the data filled out by the helper over the old one */
975 DEBUGP("expect_related: resent packet\n");
976 if (old->expectant == related_to &&
977 related_to->helper->timeout) {
978 if (!del_timer(&old->timeout)) {
979 /* expectation is dying. Fall through */
980 old = NULL;
981 } else {
982 old->timeout.expires = jiffies +
983 related_to->helper->timeout * HZ;
984 add_timer(&old->timeout);
985 }
986 }
987
988 if (old) {
989 WRITE_UNLOCK(&ip_conntrack_lock);
990 return -EEXIST;
991 }
992 } else if (related_to->helper->max_expected &&
993 related_to->expecting >= related_to->helper->max_expected) {
994 /* old == NULL */
995 if (!(related_to->helper->flags &
996 IP_CT_HELPER_F_REUSE_EXPECT)) {
997 WRITE_UNLOCK(&ip_conntrack_lock);
998 if (net_ratelimit())
999 printk(KERN_WARNING
1000 "ip_conntrack: max number of expected "
1001 "connections %i of %s reached for "
1002 "%u.%u.%u.%u->%u.%u.%u.%u\n",
1003 related_to->helper->max_expected,
1004 related_to->helper->name,
1005 NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
1006 NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
1007 return -EPERM;
1008 }
1009 DEBUGP("ip_conntrack: max number of expected "
1010 "connections %i of %s reached for "
1011 "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
1012 related_to->helper->max_expected,
1013 related_to->helper->name,
1014 NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
1015 NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
1016
1017 		/* choose the oldest expectation to evict */
1018 list_for_each_entry(old, &related_to->sibling_list,
1019 expected_list)
1020 if (old->sibling == NULL)
1021 break;
1022
1023 /* We cannot fail since related_to->expecting is the number
1024 * of unconfirmed expectations */
1025 IP_NF_ASSERT(old && old->sibling == NULL);
1026
1027 /* newnat14 does not reuse the real allocated memory
1028 * structures but rather unexpects the old and
1029 		 * allocates a new one. unexpect_related will decrement
1030 * related_to->expecting.
1031 */
1032 unexpect_related(old);
1033 ret = -EPERM;
1034 } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1035 struct ip_conntrack_expect *, &expect->tuple,
1036 &expect->mask)) {
1037 WRITE_UNLOCK(&ip_conntrack_lock);
1038 DEBUGP("expect_related: busy!\n");
1039 return -EBUSY;
1040 }
1041
1042 new = (struct ip_conntrack_expect *)
1043 kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
1044 if (!new) {
1045 WRITE_UNLOCK(&ip_conntrack_lock);
1046 		DEBUGP("expect_related: OOM allocating expect\n");
1047 return -ENOMEM;
1048 }
1049
1050 DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
1051 memcpy(new, expect, sizeof(*expect));
1052 new->expectant = related_to;
1053 new->sibling = NULL;
1054 atomic_set(&new->use, 1);
1055
1056 /* add to expected list for this connection */
1057 list_add_tail(&new->expected_list, &related_to->sibling_list);
1058 /* add to global list of expectations */
1059 list_prepend(&ip_conntrack_expect_list, &new->list);
1060 /* add and start timer if required */
1061 if (related_to->helper->timeout) {
1062 init_timer(&new->timeout);
1063 new->timeout.data = (unsigned long)new;
1064 new->timeout.function = expectation_timed_out;
1065 new->timeout.expires = jiffies +
1066 related_to->helper->timeout * HZ;
1067 add_timer(&new->timeout);
1068 }
1069 related_to->expecting++;
1070
1071 WRITE_UNLOCK(&ip_conntrack_lock);
1072
1073 return ret;
1074 }
1075
1076 /* Change tuple in an existing expectation */
1077 int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
1078 struct ip_conntrack_tuple *newtuple)
1079 {
1080 int ret;
1081
1082 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
1083 WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
1084
1085 DEBUGP("change_expect:\n");
1086 DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
1087 DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask);
1088 DEBUGP("newtuple: "); DUMP_TUPLE(newtuple);
1089 if (expect->ct_tuple.dst.protonum == 0) {
1090 /* Never seen before */
1091 DEBUGP("change expect: never seen before\n");
1092 if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
1093 && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1094 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
1095 /* Force NAT to find an unused tuple */
1096 ret = -1;
1097 } else {
1098 memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
1099 memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
1100 ret = 0;
1101 }
1102 } else {
1103 /* Resent packet */
1104 DEBUGP("change expect: resent packet\n");
1105 if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
1106 ret = 0;
1107 } else {
1108 			/* Force NAT to choose the same port again */
1109 ret = -1;
1110 }
1111 }
1112 WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
1113
1114 return ret;
1115 }
1116
1117 /* Alter reply tuple (maybe alter helper). If it's already taken,
1118 return 0 and don't do alteration. */
1119 int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1120 const struct ip_conntrack_tuple *newreply)
1121 {
1122 WRITE_LOCK(&ip_conntrack_lock);
1123 if (__ip_conntrack_find(newreply, conntrack)) {
1124 WRITE_UNLOCK(&ip_conntrack_lock);
1125 return 0;
1126 }
1127 /* Should be unconfirmed, so not in hash table yet */
1128 IP_NF_ASSERT(!is_confirmed(conntrack));
1129
1130 DEBUGP("Altering reply tuple of %p to ", conntrack);
1131 DUMP_TUPLE(newreply);
1132
1133 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1134 if (!conntrack->master && list_empty(&conntrack->sibling_list))
1135 conntrack->helper = ip_ct_find_helper(newreply);
1136 WRITE_UNLOCK(&ip_conntrack_lock);
1137
1138 return 1;
1139 }
1140
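/* Rough registration sketch ("my_helper", "my_help" and "myproto" are
 * made-up names; field names are taken from their uses in this file,
 * consult an existing helper module for the full structure):
 *
 *     static struct ip_conntrack_helper my_helper = {
 *             .name         = "myproto",
 *             .tuple        = { ... },     // e.g. dst port + protonum
 *             .mask         = { ... },
 *             .max_expected = 1,
 *             .timeout      = 300,         // seconds, used as timeout*HZ
 *             .help         = my_help,
 *     };
 *
 *     ip_conntrack_helper_register(&my_helper);
 *     ...
 *     ip_conntrack_helper_unregister(&my_helper);
 */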
1141 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1142 {
1143 MOD_INC_USE_COUNT;
1144
1145 WRITE_LOCK(&ip_conntrack_lock);
1146 list_prepend(&helpers, me);
1147 WRITE_UNLOCK(&ip_conntrack_lock);
1148
1149 return 0;
1150 }
1151
1152 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1153 const struct ip_conntrack_helper *me)
1154 {
1155 if (i->ctrack->helper == me) {
1156 		/* Get rid of any expectations. */
1157 remove_expectations(i->ctrack, 0);
1158 /* And *then* set helper to NULL */
1159 i->ctrack->helper = NULL;
1160 }
1161 return 0;
1162 }
1163
1164 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1165 {
1166 unsigned int i;
1167
1168 /* Need write lock here, to delete helper. */
1169 WRITE_LOCK(&ip_conntrack_lock);
1170 LIST_DELETE(&helpers, me);
1171
1172 /* Get rid of expecteds, set helpers to NULL. */
1173 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1174 for (i = 0; i < ip_conntrack_htable_size; i++)
1175 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1176 struct ip_conntrack_tuple_hash *, me);
1177 WRITE_UNLOCK(&ip_conntrack_lock);
1178
1179 	/* Someone could still be looking at the helper in a bh. */
1180 br_write_lock_bh(BR_NETPROTO_LOCK);
1181 br_write_unlock_bh(BR_NETPROTO_LOCK);
1182
1183 MOD_DEC_USE_COUNT;
1184 }
1185
1186 /* Refresh conntrack for this many jiffies. */
1187 void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
1188 {
1189 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1190
1191 WRITE_LOCK(&ip_conntrack_lock);
1192 /* If not in hash table, timer will not be active yet */
1193 if (!is_confirmed(ct))
1194 ct->timeout.expires = extra_jiffies;
1195 else {
1196 /* Need del_timer for race avoidance (may already be dying). */
1197 if (del_timer(&ct->timeout)) {
1198 ct->timeout.expires = jiffies + extra_jiffies;
1199 add_timer(&ct->timeout);
1200 }
1201 }
1202 WRITE_UNLOCK(&ip_conntrack_lock);
1203 }
1204
1205 /* Returns new sk_buff, or NULL */
1206 struct sk_buff *
1207 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1208 {
1209 struct sock *sk = skb->sk;
1210 #ifdef CONFIG_NETFILTER_DEBUG
1211 unsigned int olddebug = skb->nf_debug;
1212 #endif
1213
1214 if (sk) {
1215 sock_hold(sk);
1216 skb_orphan(skb);
1217 }
1218
1219 local_bh_disable();
1220 skb = ip_defrag(skb, user);
1221 local_bh_enable();
1222
1223 if (!skb) {
1224 if (sk) sock_put(sk);
1225 return skb;
1226 } else if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) {
1227 kfree_skb(skb);
1228 if (sk) sock_put(sk);
1229 return NULL;
1230 }
1231
1232 if (sk) {
1233 skb_set_owner_w(skb, sk);
1234 sock_put(sk);
1235 }
1236
1237 ip_send_check(skb->nh.iph);
1238 skb->nfcache |= NFC_ALTERED;
1239 #ifdef CONFIG_NETFILTER_DEBUG
1240 /* Packet path as if nothing had happened. */
1241 skb->nf_debug = olddebug;
1242 #endif
1243 return skb;
1244 }
1245
1246 /* Used by ipt_REJECT. */
1247 static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
1248 {
1249 struct ip_conntrack *ct;
1250 enum ip_conntrack_info ctinfo;
1251
1252 ct = __ip_conntrack_get(nfct, &ctinfo);
1253
1254 /* This ICMP is in reverse direction to the packet which
1255 caused it */
1256 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1257 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1258 else
1259 ctinfo = IP_CT_RELATED;
1260
1261 /* Attach new skbuff, and increment count */
1262 nskb->nfct = &ct->infos[ctinfo];
1263 atomic_inc(&ct->ct_general.use);
1264 }
1265
1266 static inline int
1267 do_iter(const struct ip_conntrack_tuple_hash *i,
1268 int (*iter)(struct ip_conntrack *i, void *data),
1269 void *data)
1270 {
1271 return iter(i->ctrack, data);
1272 }
1273
1274 /* Bring out ya dead! */
1275 static struct ip_conntrack_tuple_hash *
1276 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1277 void *data, unsigned int *bucket)
1278 {
1279 struct ip_conntrack_tuple_hash *h = NULL;
1280
1281 WRITE_LOCK(&ip_conntrack_lock);
1282 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1283 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1284 struct ip_conntrack_tuple_hash *, iter, data);
1285 if (h)
1286 break;
1287 }
1288 if (!h)
1289 h = LIST_FIND_W(&unconfirmed, do_iter,
1290 struct ip_conntrack_tuple_hash *, iter, data);
1291 if (h)
1292 atomic_inc(&h->ctrack->ct_general.use);
1293 WRITE_UNLOCK(&ip_conntrack_lock);
1294
1295 return h;
1296 }
1297
1298 void
1299 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1300 {
1301 struct ip_conntrack_tuple_hash *h;
1302 unsigned int bucket = 0;
1303
1304 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1305 		/* Time to push up daisies... */
1306 if (del_timer(&h->ctrack->timeout))
1307 death_by_timeout((unsigned long)h->ctrack);
1308 /* ... else the timer will get him soon. */
1309
1310 ip_conntrack_put(h->ctrack);
1311 }
1312 }
1313
1314 /* Fast function for those who don't want to parse /proc (and I don't
1315 blame them). */
1316 /* Reversing the socket's dst/src point of view gives us the reply
1317 mapping. */
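/* getorigdst() backs the SO_ORIGINAL_DST getsockopt, which lets a
 * userspace proxy recover the pre-NAT destination of a redirected TCP
 * connection.  Rough userspace usage:
 *
 *     struct sockaddr_in dst;
 *     socklen_t len = sizeof(dst);
 *
 *     if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0) {
 *             // dst.sin_addr / dst.sin_port hold the original target
 *     }
 *
 * Only TCP sockets are handled, as checked below. */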
1318 static int
1319 getorigdst(struct sock *sk, int optval, void *user, int *len)
1320 {
1321 struct ip_conntrack_tuple_hash *h;
1322 struct ip_conntrack_tuple tuple;
1323
1324 IP_CT_TUPLE_U_BLANK(&tuple);
1325 tuple.src.ip = sk->rcv_saddr;
1326 tuple.src.u.tcp.port = sk->sport;
1327 tuple.dst.ip = sk->daddr;
1328 tuple.dst.u.tcp.port = sk->dport;
1329 tuple.dst.protonum = IPPROTO_TCP;
1330
1331 /* We only do TCP at the moment: is there a better way? */
1332 if (strcmp(sk->prot->name, "TCP") != 0) {
1333 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1334 return -ENOPROTOOPT;
1335 }
1336
1337 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1338 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1339 *len, sizeof(struct sockaddr_in));
1340 return -EINVAL;
1341 }
1342
1343 h = ip_conntrack_find_get(&tuple, NULL);
1344 if (h) {
1345 struct sockaddr_in sin;
1346
1347 sin.sin_family = AF_INET;
1348 sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1349 .tuple.dst.u.tcp.port;
1350 sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1351 .tuple.dst.ip;
1352 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1353
1354 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1355 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1356 ip_conntrack_put(h->ctrack);
1357 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1358 return -EFAULT;
1359 else
1360 return 0;
1361 }
1362 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1363 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1364 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1365 return -ENOENT;
1366 }
1367
1368 static struct nf_sockopt_ops so_getorigdst
1369 = { { NULL, NULL }, PF_INET,
1370 0, 0, NULL, /* Setsockopts */
1371 SO_ORIGINAL_DST, SO_ORIGINAL_DST+1, &getorigdst,
1372 0, NULL };
1373
1374 static int kill_all(struct ip_conntrack *i, void *data)
1375 {
1376 return 1;
1377 }
1378
1379 /* Mishearing the voices in his head, our hero wonders how he's
1380 supposed to kill the mall. */
1381 void ip_conntrack_cleanup(void)
1382 {
1383 ip_ct_attach = NULL;
1384 /* This makes sure all current packets have passed through
1385 netfilter framework. Roll on, two-stage module
1386 delete... */
1387 br_write_lock_bh(BR_NETPROTO_LOCK);
1388 br_write_unlock_bh(BR_NETPROTO_LOCK);
1389
1390 i_see_dead_people:
1391 ip_ct_iterate_cleanup(kill_all, NULL);
1392 if (atomic_read(&ip_conntrack_count) != 0) {
1393 schedule();
1394 goto i_see_dead_people;
1395 }
1396
1397 kmem_cache_destroy(ip_conntrack_cachep);
1398 vfree(ip_conntrack_hash);
1399 nf_unregister_sockopt(&so_getorigdst);
1400 }
1401
1402 static int hashsize = 0;
1403 MODULE_PARM(hashsize, "i");
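
/* "hashsize" is a module parameter, so the bucket count can be chosen at
 * load time, roughly:
 *
 *     modprobe ip_conntrack hashsize=4096
 *
 * otherwise ip_conntrack_init() sizes the table from available memory and
 * sets ip_conntrack_max to 8 * ip_conntrack_htable_size. */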
1404
1405 int __init ip_conntrack_init(void)
1406 {
1407 unsigned int i;
1408 int ret;
1409
1410 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1411 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1412 if (hashsize) {
1413 ip_conntrack_htable_size = hashsize;
1414 } else {
1415 ip_conntrack_htable_size
1416 = (((num_physpages << PAGE_SHIFT) / 16384)
1417 / sizeof(struct list_head));
1418 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1419 ip_conntrack_htable_size = 8192;
1420 if (ip_conntrack_htable_size < 16)
1421 ip_conntrack_htable_size = 16;
1422 }
1423 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1424
1425 printk("ip_conntrack version %s (%u buckets, %d max)"
1426 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1427 ip_conntrack_htable_size, ip_conntrack_max,
1428 sizeof(struct ip_conntrack));
1429
1430 ret = nf_register_sockopt(&so_getorigdst);
1431 if (ret != 0) {
1432 printk(KERN_ERR "Unable to register netfilter socket option\n");
1433 return ret;
1434 }
1435
1436 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1437 * ip_conntrack_htable_size);
1438 if (!ip_conntrack_hash) {
1439 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1440 goto err_unreg_sockopt;
1441 }
1442
1443 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1444 sizeof(struct ip_conntrack), 0,
1445 SLAB_HWCACHE_ALIGN, NULL, NULL);
1446 if (!ip_conntrack_cachep) {
1447 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1448 goto err_free_hash;
1449 }
1450 /* Don't NEED lock here, but good form anyway. */
1451 WRITE_LOCK(&ip_conntrack_lock);
1452 /* Sew in builtin protocols. */
1453 list_append(&protocol_list, &ip_conntrack_protocol_tcp);
1454 list_append(&protocol_list, &ip_conntrack_protocol_udp);
1455 list_append(&protocol_list, &ip_conntrack_protocol_icmp);
1456 WRITE_UNLOCK(&ip_conntrack_lock);
1457
1458 for (i = 0; i < ip_conntrack_htable_size; i++)
1459 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1460
1461 /* For use by ipt_REJECT */
1462 ip_ct_attach = ip_conntrack_attach;
1463 return ret;
1464
1465 err_free_hash:
1466 vfree(ip_conntrack_hash);
1467 err_unreg_sockopt:
1468 nf_unregister_sockopt(&so_getorigdst);
1469
1470 return -ENOMEM;
1471 }
1472