/* NAT for netfilter; shared with compatibility layer. */

/* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
   Public Licence. */
#include <linux/version.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <linux/brlock.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/tcp.h>  /* For tcp_prot in getorigdst */

#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
#include <linux/netfilter_ipv4/ip_nat_core.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/listhelp.h>

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_nat_lock);
DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

static struct list_head *bysource;
static struct list_head *byipsproto;
LIST_HEAD(protos);
LIST_HEAD(helpers);

extern struct ip_nat_protocol unknown_nat_protocol;

/* We keep extra hashes for each conntrack, for fast searching. */
static inline size_t
hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
{
        /* Modified src and dst, to ensure we don't create two
           identical streams. */
        return (src + dst + proto) % ip_nat_htable_size;
}

static inline size_t
hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
{
        /* Original src, to ensure we map it consistently if poss. */
        return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
}
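
/* A note on the two hash tables (added for clarity): bysource is
 * keyed by hash_by_src() and lets find_appropriate_src() below reuse
 * an existing source mapping for a new connection from the same
 * host/port; byipsproto is keyed by hash_by_ipsproto() and lets
 * count_maps() score how many connections already map onto a given
 * src-ip/dst-ip/proto triple. */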

/* No one is using the conntrack by the time this is called. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
        struct ip_nat_info *info = &conn->nat.info;
        unsigned int hs, hp;

        if (!info->initialized)
                return;

        IP_NF_ASSERT(info->bysource.conntrack);
        IP_NF_ASSERT(info->byipsproto.conntrack);

        hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
                         conn->tuplehash[IP_CT_DIR_ORIGINAL]
                         .tuple.dst.protonum);

        hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
                              conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
                              conn->tuplehash[IP_CT_DIR_REPLY]
                              .tuple.dst.protonum);

        WRITE_LOCK(&ip_nat_lock);
        LIST_DELETE(&bysource[hs], &info->bysource);
        LIST_DELETE(&byipsproto[hp], &info->byipsproto);
        WRITE_UNLOCK(&ip_nat_lock);
}

/* We do checksum mangling, so if they were wrong before they're still
 * wrong.  Also works for incomplete packets (eg. ICMP dest
 * unreachables.) */
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
        u_int32_t diffs[] = { oldvalinv, newval };
        return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
                                      oldcheck^0xFFFF));
}
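
/* Usage sketch (illustrative, not part of the original code): a
 * per-protocol module rewriting a 16-bit port would fold the change
 * into the existing checksum the same way manip_pkt() below folds the
 * 32-bit address change, eg. roughly:
 *
 *      hdr->check = ip_nat_cheat_check(oldport ^ 0xFFFF, newport,
 *                                      hdr->check);
 *
 * Passing the one's complement of the old value adds the difference
 * without summing the whole packet again (cf. RFC 1624). */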

static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
{
        return i->protonum == proto;
}

struct ip_nat_protocol *
find_nat_proto(u_int16_t protonum)
{
        struct ip_nat_protocol *i;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
        if (!i)
                i = &unknown_nat_protocol;
        return i;
}

/* Is this tuple already taken? (not by us) */
int
ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
                  const struct ip_conntrack *ignored_conntrack)
{
        /* Conntrack doesn't keep track of outgoing tuples; only
           incoming ones.  NAT means they don't have a fixed mapping,
           so we invert the tuple and look for the incoming reply.

           We could keep a separate hash if this proves too slow. */
        struct ip_conntrack_tuple reply;

        invert_tuplepr(&reply, tuple);
        return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}
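
/* Worked example (illustrative): to test a candidate outgoing tuple
 * 10.0.0.1:1024 -> 1.2.3.4:80/tcp, we invert it to the reply tuple
 * 1.2.3.4:80 -> 10.0.0.1:1024 and ask conntrack whether another
 * connection already expects that reply; if so, using the candidate
 * mapping would make the two streams indistinguishable. */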

/* Does the tuple + the source manip come within the range mr? */
static int
in_range(const struct ip_conntrack_tuple *tuple,
         const struct ip_conntrack_manip *manip,
         const struct ip_nat_multi_range *mr)
{
        struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
        unsigned int i;
        struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };

        for (i = 0; i < mr->rangesize; i++) {
                /* If we are allowed to map IPs, then we must be in the
                   range specified, otherwise we must be unchanged. */
                if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
                        if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
                            || (ntohl(newtuple.src.ip)
                                > ntohl(mr->range[i].max_ip)))
                                continue;
                } else {
                        if (newtuple.src.ip != tuple->src.ip)
                                continue;
                }

                /* If no proto range is specified, any per-proto part
                   will do; otherwise it must fall within the range. */
                if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
                    || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
                                       &mr->range[i].min, &mr->range[i].max))
                        return 1;
        }
        return 0;
}

static inline int
src_cmp(const struct ip_nat_hash *i,
        const struct ip_conntrack_tuple *tuple,
        const struct ip_nat_multi_range *mr)
{
        return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
                == tuple->dst.protonum
                && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
                == tuple->src.ip
                && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
                == tuple->src.u.all
                && in_range(tuple,
                            &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                            .tuple.src,
                            mr));
}

/* Only called for SRC manip */
static struct ip_conntrack_manip *
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
                     const struct ip_nat_multi_range *mr)
{
        unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
        struct ip_nat_hash *i;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
        if (i)
                return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
        else
                return NULL;
}

/* If it's really a local destination manip, it may need to do a
   source manip too. */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
        struct rtable *rt;

        /* FIXME: IPTOS_TOS(iph->tos) --RR */
        if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) {
                DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
                       NIPQUAD(var_ip));
                return 0;
        }

        *other_ipp = rt->rt_src;
        ip_rt_put(rt);
        return 1;
}
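
/* Illustrative note: when a locally generated packet has its
 * destination mangled in LOCAL_OUT, the route to the new destination
 * may leave via a different interface, so the source address must
 * follow it.  ip_route_output() above asks the stack which source it
 * would pick for var_ip, and the caller rewrites *other_ipp to
 * match. */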

/* Simple way to iterate through all. */
static inline int fake_cmp(const struct ip_nat_hash *i,
                           u_int32_t src, u_int32_t dst, u_int16_t protonum,
                           unsigned int *score,
                           const struct ip_conntrack *conntrack)
{
        /* Compare backwards: we're dealing with OUTGOING tuples, and
           inside the conntrack is the REPLY tuple.  Don't count this
           conntrack. */
        if (i->conntrack != conntrack
            && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
            && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
            && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
                == protonum))
                (*score)++;
        return 0;
}

static inline unsigned int
count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
           const struct ip_conntrack *conntrack)
{
        unsigned int score = 0;
        unsigned int h;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        h = hash_by_ipsproto(src, dst, protonum);
        LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *,
                  src, dst, protonum, &score, conntrack);

        return score;
}

/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.

   If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
   range), we eliminate that and try again.  This is not the most
   efficient approach, but if you're worried about that, don't hand us
   ranges you don't really have. */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
                    const struct ip_nat_multi_range *mr,
                    const struct ip_conntrack *conntrack,
                    unsigned int hooknum)
{
        unsigned int i;
        struct {
                const struct ip_nat_range *range;
                unsigned int score;
                struct ip_conntrack_tuple tuple;
        } best = { NULL, 0xFFFFFFFF };
        u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
        static unsigned int randomness = 0;

        if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
                var_ipp = &tuple->src.ip;
                saved_ip = tuple->dst.ip;
                other_ipp = &tuple->dst.ip;
        } else {
                var_ipp = &tuple->dst.ip;
                saved_ip = tuple->src.ip;
                other_ipp = &tuple->src.ip;
        }
        /* Don't do do_extra_mangle unless necessary (overrides
           explicit socket bindings, for example) */
        orig_dstip = tuple->dst.ip;

        IP_NF_ASSERT(mr->rangesize >= 1);
        for (i = 0; i < mr->rangesize; i++) {
                /* Host order */
                u_int32_t minip, maxip, j;

                /* Don't do ranges which are already eliminated. */
                if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
                        continue;
                }

                if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
                        minip = ntohl(mr->range[i].min_ip);
                        maxip = ntohl(mr->range[i].max_ip);
                } else
                        minip = maxip = ntohl(*var_ipp);

                randomness++;
                for (j = 0; j < maxip - minip + 1; j++) {
                        unsigned int score;

                        *var_ipp = htonl(minip + (randomness + j)
                                         % (maxip - minip + 1));

                        /* Reset the other ip in case it was mangled by
                         * do_extra_mangle last time. */
                        *other_ipp = saved_ip;

                        if (hooknum == NF_IP_LOCAL_OUT
                            && *var_ipp != orig_dstip
                            && !do_extra_mangle(*var_ipp, other_ipp)) {
                                DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
                                       i, NIPQUAD(*var_ipp));
                                /* Can't route?  This whole range part is
                                 * probably screwed, but keep trying
                                 * anyway. */
                                continue;
                        }

                        /* Count how many others map onto this. */
                        score = count_maps(tuple->src.ip, tuple->dst.ip,
                                           tuple->dst.protonum, conntrack);
                        if (score < best.score) {
                                /* Optimization: doesn't get any better than
                                   this. */
                                if (score == 0)
                                        return (struct ip_nat_range *)
                                                &mr->range[i];

                                best.score = score;
                                best.tuple = *tuple;
                                best.range = &mr->range[i];
                        }
                }
        }
        *tuple = best.tuple;

        /* Discard const. */
        return (struct ip_nat_range *)best.range;
}

/* Fast version doesn't iterate through hash chains, but only handles
   common case of single IP address (null NAT, masquerade) */
static struct ip_nat_range *
find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
                         const struct ip_nat_multi_range *mr,
                         const struct ip_conntrack *conntrack,
                         unsigned int hooknum)
{
        if (mr->rangesize != 1
            || (mr->range[0].flags & IP_NAT_RANGE_FULL)
            || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
                && mr->range[0].min_ip != mr->range[0].max_ip))
                return find_best_ips_proto(tuple, mr, conntrack, hooknum);

        if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
                if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
                        tuple->src.ip = mr->range[0].min_ip;
                else {
                        /* Only do extra mangle when required (breaks
                           socket binding) */
                        if (tuple->dst.ip != mr->range[0].min_ip
                            && hooknum == NF_IP_LOCAL_OUT
                            && !do_extra_mangle(mr->range[0].min_ip,
                                                &tuple->src.ip))
                                return NULL;
                        tuple->dst.ip = mr->range[0].min_ip;
                }
        }

        /* Discard const. */
        return (struct ip_nat_range *)&mr->range[0];
}

static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
                 const struct ip_conntrack_tuple *orig_tuple,
                 const struct ip_nat_multi_range *mrr,
                 struct ip_conntrack *conntrack,
                 unsigned int hooknum)
{
        struct ip_nat_protocol *proto
                = find_nat_proto(orig_tuple->dst.protonum);
        struct ip_nat_range *rptr;
        unsigned int i;
        int ret;

        /* We temporarily use flags for marking full parts, but we
           always clean up afterwards */
        struct ip_nat_multi_range *mr = (void *)mrr;

        /* 1) If this srcip/proto/src-proto-part is currently mapped,
           and that same mapping gives a unique tuple within the given
           range, use that.

           This is only required for source (ie. NAT/masq) mappings.
           So far, we don't do local source mappings, so multiple
           manips are not an issue. */
        if (hooknum == NF_IP_POST_ROUTING) {
                struct ip_conntrack_manip *manip;

                manip = find_appropriate_src(orig_tuple, mr);
                if (manip) {
                        /* Apply same source manipulation. */
                        *tuple = ((struct ip_conntrack_tuple)
                                  { *manip, orig_tuple->dst });
                        DEBUGP("get_unique_tuple: Found current src map\n");
                        return 1;
                }
        }

        /* 2) Select the least-used IP/proto combination in the given
           range. */
        *tuple = *orig_tuple;
        while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
               != NULL) {
                DEBUGP("Found best for "); DUMP_TUPLE(tuple);
                /* 3) The per-protocol part of the manip is made to
                   map into the range to make a unique tuple. */

                /* Only bother mapping if it's not already in range
                   and unique */
                if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
                     || proto->in_range(tuple, HOOK2MANIP(hooknum),
                                        &rptr->min, &rptr->max))
                    && !ip_nat_used_tuple(tuple, conntrack)) {
                        ret = 1;
                        goto clear_fulls;
                } else {
                        if (proto->unique_tuple(tuple, rptr,
                                                HOOK2MANIP(hooknum),
                                                conntrack)) {
                                /* Must be unique. */
                                IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
                                                                conntrack));
                                ret = 1;
                                goto clear_fulls;
                        } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
                                /* Try implicit source NAT; protocol
                                   may be able to play with ports to
                                   make it unique. */
                                struct ip_nat_range r
                                        = { IP_NAT_RANGE_MAP_IPS,
                                            tuple->src.ip, tuple->src.ip,
                                            { 0 }, { 0 } };
                                DEBUGP("Trying implicit mapping\n");
                                if (proto->unique_tuple(tuple, &r,
                                                        IP_NAT_MANIP_SRC,
                                                        conntrack)) {
                                        /* Must be unique. */
                                        IP_NF_ASSERT(!ip_nat_used_tuple
                                                     (tuple, conntrack));
                                        ret = 1;
                                        goto clear_fulls;
                                }
                        }
                        DEBUGP("Protocol can't get unique tuple %u.\n",
                               hooknum);
                }

                /* Eliminate that from range, and try again. */
                rptr->flags |= IP_NAT_RANGE_FULL;
                *tuple = *orig_tuple;
        }

        ret = 0;

 clear_fulls:
        /* Clear full flags. */
        IP_NF_ASSERT(mr->rangesize >= 1);
        for (i = 0; i < mr->rangesize; i++)
                mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

        return ret;
}

static inline int
helper_cmp(const struct ip_nat_helper *helper,
           const struct ip_conntrack_tuple *tuple)
{
        return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
}

/* Where to manip the reply packets (will be reverse manip). */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
};
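
/* Example (illustrative): a source manip installed at
 * NF_IP_POST_ROUTING (SNAT/masquerade) is reversed on reply packets
 * at NF_IP_PRE_ROUTING as a destination manip; likewise a destination
 * manip installed at NF_IP_PRE_ROUTING (DNAT) is reversed on replies
 * at NF_IP_POST_ROUTING. */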

unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
                  const struct ip_nat_multi_range *mr,
                  unsigned int hooknum)
{
        struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
        struct ip_conntrack_tuple orig_tp;
        struct ip_nat_info *info = &conntrack->nat.info;
        int in_hashes = info->initialized;

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
                     || hooknum == NF_IP_POST_ROUTING
                     || hooknum == NF_IP_LOCAL_IN
                     || hooknum == NF_IP_LOCAL_OUT);
        IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
        IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));

        /* What we've got will look like inverse of reply.  Normally
           this is what is in the conntrack, except for prior
           manipulations (future optimization: if num_manips == 0,
           orig_tp =
           conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
        invert_tuplepr(&orig_tp,
                       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

#if 0
        {
                unsigned int i;

                DEBUGP("Hook %u (%s), ", hooknum,
                       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
                DUMP_TUPLE(&orig_tp);
                DEBUGP("Range %p: ", mr);
                for (i = 0; i < mr->rangesize; i++) {
                        DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
                               i,
                               (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
                               ? " MAP_IPS" : "",
                               (mr->range[i].flags
                                & IP_NAT_RANGE_PROTO_SPECIFIED)
                               ? " PROTO_SPECIFIED" : "",
                               (mr->range[i].flags & IP_NAT_RANGE_FULL)
                               ? " FULL" : "",
                               NIPQUAD(mr->range[i].min_ip),
                               NIPQUAD(mr->range[i].max_ip),
                               mr->range[i].min.all,
                               mr->range[i].max.all);
                }
        }
#endif

        do {
                if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
                                      hooknum)) {
                        DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
                               conntrack);
                        return NF_DROP;
                }

#if 0
                DEBUGP("Hook %u (%s) %p\n", hooknum,
                       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
                       conntrack);
                DEBUGP("Original: ");
                DUMP_TUPLE(&orig_tp);
                DEBUGP("New: ");
                DUMP_TUPLE(&new_tuple);
#endif

                /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
                   the original (A/B/C/D') and the mangled one (E/F/G/H').

                   We're only allowed to work with the SRC per-proto
                   part, so we create inverses of both to start, then
                   derive the other fields we need. */

                /* Reply connection: simply invert the new tuple
                   (G/H/E/F') */
                invert_tuplepr(&reply, &new_tuple);

                /* Alter conntrack table so it recognizes replies.
                   If we fail this race (reply tuple now used), repeat. */
        } while (!ip_conntrack_alter_reply(conntrack, &reply));

        /* FIXME: We could simply use the existing conntrack reply
           tuple here --RR */
        /* Create inverse of original: C/D/A/B' */
        invert_tuplepr(&inv_tuple, &orig_tp);

        /* Has source changed? */
        if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
                /* In this direction, a source manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_ORIGINAL, hooknum,
                           IP_NAT_MANIP_SRC, new_tuple.src });

                IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

                /* In the reverse direction, a destination manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_REPLY, opposite_hook[hooknum],
                           IP_NAT_MANIP_DST, orig_tp.src });
                IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
        }

        /* Has destination changed? */
        if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
                /* In this direction, a destination manip */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_ORIGINAL, hooknum,
                           IP_NAT_MANIP_DST, reply.src });

                IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

                /* In the reverse direction, a source manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_REPLY, opposite_hook[hooknum],
                           IP_NAT_MANIP_SRC, inv_tuple.src });
                IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
        }

        /* If there's a helper, assign it; based on new tuple. */
        if (!conntrack->master)
                info->helper = LIST_FIND(&helpers, helper_cmp,
                                         struct ip_nat_helper *, &reply);

        /* It's done. */
        info->initialized |= (1 << HOOK2MANIP(hooknum));

        if (in_hashes) {
                IP_NF_ASSERT(info->bysource.conntrack);
                replace_in_hashes(conntrack, info);
        } else {
                place_in_hashes(conntrack, info);
        }

        return NF_ACCEPT;
}

void replace_in_hashes(struct ip_conntrack *conntrack,
                       struct ip_nat_info *info)
{
        /* Source has changed, so replace in hashes. */
        unsigned int srchash
                = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.src,
                              conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.protonum);
        /* We place the packet as seen OUTGOING in the byipsproto hash
           (ie. reversed dst and src of the reply packet). */
        unsigned int ipsprotohash
                = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.src.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.protonum);

        IP_NF_ASSERT(info->bysource.conntrack == conntrack);
        MUST_BE_WRITE_LOCKED(&ip_nat_lock);

        list_del(&info->bysource.list);
        list_del(&info->byipsproto.list);

        list_prepend(&bysource[srchash], &info->bysource);
        list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}

void place_in_hashes(struct ip_conntrack *conntrack,
                     struct ip_nat_info *info)
{
        unsigned int srchash
                = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.src,
                              conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.protonum);
        /* We place the packet as seen OUTGOING in the byipsproto hash
           (ie. reversed dst and src of the reply packet). */
        unsigned int ipsprotohash
                = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.src.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.protonum);

        IP_NF_ASSERT(!info->bysource.conntrack);

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        info->byipsproto.conntrack = conntrack;
        info->bysource.conntrack = conntrack;

        list_prepend(&bysource[srchash], &info->bysource);
        list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}

static void
manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len,
          const struct ip_conntrack_manip *manip,
          enum ip_nat_manip_type maniptype,
          __u32 *nfcache)
{
        *nfcache |= NFC_ALTERED;
        find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype);

        if (maniptype == IP_NAT_MANIP_SRC) {
                iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
                                                iph->check);
                iph->saddr = manip->ip;
        } else {
                iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
                                                iph->check);
                iph->daddr = manip->ip;
        }
#if 0
        if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
                DEBUGP("IP: checksum on packet bad.\n");

        if (proto == IPPROTO_TCP) {
                void *th = (u_int32_t *)iph + iph->ihl;
                if (tcp_v4_check(th, len - 4*iph->ihl, iph->saddr, iph->daddr,
                                 csum_partial((char *)th, len-4*iph->ihl, 0)))
                        DEBUGP("TCP: checksum on packet bad\n");
        }
#endif
}

static inline int exp_for_packet(struct ip_conntrack_expect *exp,
                                 struct sk_buff **pskb)
{
        struct ip_conntrack_protocol *proto;
        int ret = 1;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        proto = __ip_ct_find_proto((*pskb)->nh.iph->protocol);
        if (proto->exp_matches_pkt)
                ret = proto->exp_matches_pkt(exp, pskb);

        return ret;
}

/* Do packet manipulations according to binding. */
unsigned int
do_bindings(struct ip_conntrack *ct,
            enum ip_conntrack_info ctinfo,
            struct ip_nat_info *info,
            unsigned int hooknum,
            struct sk_buff **pskb)
{
        unsigned int i;
        struct ip_nat_helper *helper;
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        int is_tcp = (*pskb)->nh.iph->protocol == IPPROTO_TCP;

        /* Need nat lock to protect against modification, but neither
           the conntrack (referenced) nor the helper (deleted with
           synchronize_bh()) can vanish. */
        READ_LOCK(&ip_nat_lock);
        for (i = 0; i < info->num_manips; i++) {
                /* raw socket (tcpdump) may have clone of incoming
                   skb: don't disturb it --RR */
                if (skb_cloned(*pskb) && !(*pskb)->sk) {
                        struct sk_buff *nskb = skb_copy(*pskb, GFP_ATOMIC);
                        if (!nskb) {
                                READ_UNLOCK(&ip_nat_lock);
                                return NF_DROP;
                        }
                        kfree_skb(*pskb);
                        *pskb = nskb;
                }

                if (info->manips[i].direction == dir
                    && info->manips[i].hooknum == hooknum) {
                        DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
                               *pskb,
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "SRC" : "DST",
                               NIPQUAD(info->manips[i].manip.ip),
                               htons(info->manips[i].manip.u.all));
                        manip_pkt((*pskb)->nh.iph->protocol,
                                  (*pskb)->nh.iph,
                                  (*pskb)->len,
                                  &info->manips[i].manip,
                                  info->manips[i].maniptype,
                                  &(*pskb)->nfcache);
                }
        }
        helper = info->helper;
        READ_UNLOCK(&ip_nat_lock);

        if (helper) {
                struct ip_conntrack_expect *exp = NULL;
                struct list_head *cur_item;
                int ret = NF_ACCEPT;
                int helper_called = 0;

                DEBUGP("do_bindings: helper exists for (%p)\n", ct);

                /* Always defragged for helpers */
                IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
                               & htons(IP_MF|IP_OFFSET)));

                /* Have to grab read lock before sibling_list traversal */
                READ_LOCK(&ip_conntrack_lock);
                list_for_each_prev(cur_item, &ct->sibling_list) {
                        exp = list_entry(cur_item, struct ip_conntrack_expect,
                                         expected_list);

                        /* if this expectation is already established, skip */
                        if (exp->sibling)
                                continue;

                        if (exp_for_packet(exp, pskb)) {
                                /* FIXME: May be true multiple times in the
                                 * case of UDP!! */
                                DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
                                ret = helper->help(ct, exp, info, ctinfo,
                                                   hooknum, pskb);
                                if (ret != NF_ACCEPT) {
                                        READ_UNLOCK(&ip_conntrack_lock);
                                        return ret;
                                }
                                helper_called = 1;
                        }
                }
                /* Helper might want to manip the packet even when there is no
                 * matching expectation for this packet */
                if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
                        DEBUGP("calling nat helper for packet without expectation\n");
                        ret = helper->help(ct, NULL, info, ctinfo,
                                           hooknum, pskb);
                        if (ret != NF_ACCEPT) {
                                READ_UNLOCK(&ip_conntrack_lock);
                                return ret;
                        }
                }
                READ_UNLOCK(&ip_conntrack_lock);

                /* Adjust sequence number only once per packet
                 * (helper is called at all hooks) */
                if (is_tcp && (hooknum == NF_IP_POST_ROUTING
                               || hooknum == NF_IP_LOCAL_IN)) {
                        DEBUGP("ip_nat_core: adjusting sequence number\n");
                        /* future: put this in a l4-proto specific function,
                         * and call this function here. */
                        ip_nat_seq_adjust(*pskb, ct, ctinfo);
                }

                return ret;

        } else
                return NF_ACCEPT;

        /* not reached */
}

static inline int tuple_src_equal_dst(const struct ip_conntrack_tuple *t1,
                                      const struct ip_conntrack_tuple *t2)
{
        if (t1->dst.protonum != t2->dst.protonum || t1->src.ip != t2->dst.ip)
                return 0;
        if (t1->dst.protonum != IPPROTO_ICMP)
                return t1->src.u.all == t2->dst.u.all;
        else {
                struct ip_conntrack_tuple inv;

                /* ICMP tuples are asymmetric */
                invert_tuplepr(&inv, t1);
                return inv.src.u.all == t2->src.u.all &&
                       inv.dst.u.all == t2->dst.u.all;
        }
}

unsigned int
icmp_reply_translation(struct sk_buff *skb,
                       struct ip_conntrack *conntrack,
                       unsigned int hooknum,
                       int dir)
{
        struct iphdr *iph = skb->nh.iph;
        struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
        struct iphdr *inner = (struct iphdr *)(hdr + 1);
        size_t datalen = skb->len - ((void *)inner - (void *)iph);
        unsigned int i;
        struct ip_nat_info *info = &conntrack->nat.info;
        struct ip_conntrack_tuple *cttuple, innertuple;

        IP_NF_ASSERT(skb->len >= iph->ihl*4 + sizeof(struct icmphdr));
        /* Must be RELATED */
        IP_NF_ASSERT(skb->nfct
                     - ((struct ip_conntrack *)skb->nfct->master)->infos
                     == IP_CT_RELATED
                     || skb->nfct
                     - ((struct ip_conntrack *)skb->nfct->master)->infos
                     == IP_CT_RELATED+IP_CT_IS_REPLY);

        /* Redirects on non-null nats must be dropped, else they'll
           start talking to each other without our translation, and be
           confused... --RR */
        if (hdr->type == ICMP_REDIRECT) {
                /* Don't care about races here. */
                if (info->initialized
                    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
                    || info->num_manips != 0)
                        return NF_DROP;
        }

        DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
               skb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
        /* Note: May not be from a NAT'd host, but probably safest to
           do translation always as if it came from the host itself
           (even though a "host unreachable" coming from the host
           itself is a bit weird).

           More explanation: some people use NAT for anonymizing.
           Also, CERT recommends dropping all packets from private IP
           addresses (although ICMP errors from internal links with
           such addresses are not too uncommon, as Alan Cox points
           out) */

        if (!ip_ct_get_tuple(inner, datalen, &innertuple,
                             ip_ct_find_proto(inner->protocol)))
                return 0;
        cttuple = &conntrack->tuplehash[dir].tuple;

        READ_LOCK(&ip_nat_lock);
        for (i = 0; i < info->num_manips; i++) {
                DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
                       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
                       "ORIG" : "REPLY", info->manips[i].hooknum);

                if (info->manips[i].direction != dir)
                        continue;

                /* Mapping the inner packet is just like a normal packet,
                 * except it was never src/dst reversed, so where we would
                 * normally apply a dst manip, we apply a src, and vice
                 * versa. */

                /* Only true for forwarded packets, locally generated packets
                 * never hit PRE_ROUTING, we need to apply their PRE_ROUTING
                 * manips in LOCAL_OUT. */
                if (hooknum == NF_IP_LOCAL_OUT &&
                    info->manips[i].hooknum == NF_IP_PRE_ROUTING)
                        hooknum = info->manips[i].hooknum;

                if (info->manips[i].hooknum != hooknum)
                        continue;

                /* ICMP errors may be generated locally for packets that
                 * don't have all NAT manips applied yet.  Verify manips
                 * have been applied before reversing them */
                if (info->manips[i].maniptype == IP_NAT_MANIP_SRC) {
                        if (!tuple_src_equal_dst(cttuple, &innertuple))
                                continue;
                } else {
                        if (!tuple_src_equal_dst(&innertuple, cttuple))
                                continue;
                }

                DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
                       info->manips[i].maniptype == IP_NAT_MANIP_SRC
                       ? "DST" : "SRC", NIPQUAD(info->manips[i].manip.ip),
                       ntohs(info->manips[i].manip.u.udp.port));
                manip_pkt(inner->protocol, inner,
                          skb->len - ((void *)inner - (void *)iph),
                          &info->manips[i].manip, !info->manips[i].maniptype,
                          &skb->nfcache);
                /* Outer packet needs to have IP header NATed like
                   it's a reply. */

                /* Use mapping to map outer packet: 0 gives no
                   per-proto mapping */
                DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
                       info->manips[i].maniptype == IP_NAT_MANIP_SRC
                       ? "SRC" : "DST", NIPQUAD(info->manips[i].manip.ip));
                manip_pkt(0, iph, skb->len, &info->manips[i].manip,
                          info->manips[i].maniptype, &skb->nfcache);
        }
        READ_UNLOCK(&ip_nat_lock);

        /* Since we mangled inside ICMP packet, recalculate its
           checksum from scratch.  (Hence the handling of incorrect
           checksums in conntrack, so we don't accidentally fix one.) */
        hdr->checksum = 0;
        hdr->checksum = ip_compute_csum((unsigned char *)hdr,
                                        sizeof(*hdr) + datalen);

        return NF_ACCEPT;
}

int __init ip_nat_init(void)
{
        size_t i;

        /* Leave them the same for the moment. */
        ip_nat_htable_size = ip_conntrack_htable_size;

        /* One vmalloc for both hash tables */
        bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
        if (!bysource) {
                return -ENOMEM;
        }
        byipsproto = bysource + ip_nat_htable_size;

        /* Sew in builtin protocols. */
        WRITE_LOCK(&ip_nat_lock);
        list_append(&protos, &ip_nat_protocol_tcp);
        list_append(&protos, &ip_nat_protocol_udp);
        list_append(&protos, &ip_nat_protocol_icmp);
        WRITE_UNLOCK(&ip_nat_lock);

        for (i = 0; i < ip_nat_htable_size; i++) {
                INIT_LIST_HEAD(&bysource[i]);
                INIT_LIST_HEAD(&byipsproto[i]);
        }

        /* FIXME: Man, this is a hack.  <SIGH> */
        IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
        ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;

        return 0;
}

/* Clear NAT section of all conntracks, in case we're loaded again. */
static int clean_nat(struct ip_conntrack *i, void *data)
{
        memset(&i->nat, 0, sizeof(i->nat));
        return 0;
}

/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
void ip_nat_cleanup(void)
{
        ip_ct_iterate_cleanup(&clean_nat, NULL);
        ip_conntrack_destroyed = NULL;
        vfree(bysource);
}