/* NAT for netfilter; shared with compatibility layer. */

/* (c) 1999 Paul `Rusty' Russell.  Licensed under the GNU General
   Public Licence. */
#include <linux/version.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <linux/brlock.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/tcp.h>  /* For tcp_prot in getorigdst */

#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
#include <linux/netfilter_ipv4/ip_nat_core.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/listhelp.h>

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_nat_lock);
DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

static struct list_head *bysource;
static struct list_head *byipsproto;
LIST_HEAD(protos);
LIST_HEAD(helpers);

extern struct ip_nat_protocol unknown_nat_protocol;

/* We keep extra hashes for each conntrack, for fast searching. */
static inline size_t
hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
{
	/* Modified src and dst, to ensure we don't create two
	   identical streams. */
	return (src + dst + proto) % ip_nat_htable_size;
}

static inline size_t
hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
{
	/* Original src, to ensure we map it consistently if poss. */
	return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
}

/* No one is using the conntrack by the time this is called. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
	struct ip_nat_info *info = &conn->nat.info;
	unsigned int hs, hp;

	if (!info->initialized)
		return;

	IP_NF_ASSERT(info->bysource.conntrack);
	IP_NF_ASSERT(info->byipsproto.conntrack);

	hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
	                 conn->tuplehash[IP_CT_DIR_ORIGINAL]
	                 .tuple.dst.protonum);

	hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
	                      conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
	                      conn->tuplehash[IP_CT_DIR_REPLY]
	                      .tuple.dst.protonum);

	WRITE_LOCK(&ip_nat_lock);
	LIST_DELETE(&bysource[hs], &info->bysource);
	LIST_DELETE(&byipsproto[hp], &info->byipsproto);
	WRITE_UNLOCK(&ip_nat_lock);
}

/* We do checksum mangling, so if they were wrong before they're still
 * wrong.  Also works for incomplete packets (eg. ICMP dest
 * unreachables.) */
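/* (The trick: a 16-bit one's complement checksum can be updated
 * incrementally -- HC' = ~(~HC + ~m + m'), cf. RFC 1624 -- so we only
 * feed in the inverted old value and the new value, and never touch
 * the rest of the packet.) */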
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
	u_int32_t diffs[] = { oldvalinv, newval };
	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
				      oldcheck^0xFFFF));
}

static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
{
	return i->protonum == proto;
}

struct ip_nat_protocol *
find_nat_proto(u_int16_t protonum)
{
	struct ip_nat_protocol *i;

	MUST_BE_READ_LOCKED(&ip_nat_lock);
	i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
	if (!i)
		i = &unknown_nat_protocol;
	return i;
}

/* Is this tuple already taken? (not by us) */
int
ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
		  const struct ip_conntrack *ignored_conntrack)
{
	/* Connection tracking doesn't keep track of outgoing tuples; only
	   incoming ones.  NAT means they don't have a fixed mapping,
	   so we invert the tuple and look for the incoming reply.

	   We could keep a separate hash if this proves too slow. */
	struct ip_conntrack_tuple reply;

	invert_tuplepr(&reply, tuple);
	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}

/* Does tuple + the source manip come within the range mr? */
static int
in_range(const struct ip_conntrack_tuple *tuple,
	 const struct ip_conntrack_manip *manip,
	 const struct ip_nat_multi_range *mr)
{
	struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
	unsigned int i;
	struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };

	for (i = 0; i < mr->rangesize; i++) {
		/* If we are allowed to map IPs, then we must be in the
		   range specified, otherwise we must be unchanged. */
		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
			if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
			    || (ntohl(newtuple.src.ip)
				> ntohl(mr->range[i].max_ip)))
				continue;
		} else {
			if (newtuple.src.ip != tuple->src.ip)
				continue;
		}

		if ((mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
		    && proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
				       &mr->range[i].min, &mr->range[i].max))
			return 1;
	}
	return 0;
}

static inline int
src_cmp(const struct ip_nat_hash *i,
	const struct ip_conntrack_tuple *tuple,
	const struct ip_nat_multi_range *mr)
{
	return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
		== tuple->dst.protonum
		&& i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
		== tuple->src.ip
		&& i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
		== tuple->src.u.all
		&& in_range(tuple,
			    &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			    .tuple.src,
			    mr));
}

/* Only called for SRC manip */
static struct ip_conntrack_manip *
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
		     const struct ip_nat_multi_range *mr)
{
	unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
	struct ip_nat_hash *i;

	MUST_BE_READ_LOCKED(&ip_nat_lock);
	i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
	if (i)
		return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
	else
		return NULL;
}

/* If it's really a local destination manip, it may need to do a
   source manip too. */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
	struct rtable *rt;

	/* FIXME: IPTOS_TOS(iph->tos) --RR */
	if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) {
		DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
		       NIPQUAD(var_ip));
		return 0;
	}

	*other_ipp = rt->rt_src;
	ip_rt_put(rt);
	return 1;
}

/* Simple way to iterate through all. */
static inline int fake_cmp(const struct ip_nat_hash *i,
			   u_int32_t src, u_int32_t dst, u_int16_t protonum,
			   unsigned int *score,
			   const struct ip_conntrack *conntrack)
{
	/* Compare backwards: we're dealing with OUTGOING tuples, and
	   inside the conntrack is the REPLY tuple.  Don't count this
	   conntrack. */
	if (i->conntrack != conntrack
	    && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
	    && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
	    && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
		== protonum))
		(*score)++;
	return 0;
}

static inline unsigned int
count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
	   const struct ip_conntrack *conntrack)
{
	unsigned int score = 0;
	unsigned int h;

	MUST_BE_READ_LOCKED(&ip_nat_lock);
	h = hash_by_ipsproto(src, dst, protonum);
	LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *,
	          src, dst, protonum, &score, conntrack);

	return score;
}

/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.

   If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
   range), we eliminate that and try again.  This is not the most
   efficient approach, but if you're worried about that, don't hand us
   ranges you don't really have.  */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
		    const struct ip_nat_multi_range *mr,
		    const struct ip_conntrack *conntrack,
		    unsigned int hooknum)
{
	unsigned int i;
	struct {
		const struct ip_nat_range *range;
		unsigned int score;
		struct ip_conntrack_tuple tuple;
	} best = { NULL, 0xFFFFFFFF };
	u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
	static unsigned int randomness = 0;

	if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
		var_ipp = &tuple->src.ip;
		saved_ip = tuple->dst.ip;
		other_ipp = &tuple->dst.ip;
	} else {
		var_ipp = &tuple->dst.ip;
		saved_ip = tuple->src.ip;
		other_ipp = &tuple->src.ip;
	}
	/* Don't do do_extra_mangle unless necessary (overrides
	   explicit socket bindings, for example) */
	orig_dstip = tuple->dst.ip;

	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++) {
		/* Host order */
		u_int32_t minip, maxip, j;

		/* Don't do ranges which are already eliminated. */
		if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
			continue;
		}

		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
			minip = ntohl(mr->range[i].min_ip);
			maxip = ntohl(mr->range[i].max_ip);
		} else
			minip = maxip = ntohl(*var_ipp);

		randomness++;
		for (j = 0; j < maxip - minip + 1; j++) {
			unsigned int score;

			*var_ipp = htonl(minip + (randomness + j)
					 % (maxip - minip + 1));

			/* Reset the other ip in case it was mangled by
			 * do_extra_mangle last time. */
			*other_ipp = saved_ip;

			if (hooknum == NF_IP_LOCAL_OUT
			    && *var_ipp != orig_dstip
			    && !do_extra_mangle(*var_ipp, other_ipp)) {
				DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
				       i, NIPQUAD(*var_ipp));
				/* Can't route?  This whole range part is
				 * probably screwed, but keep trying
				 * anyway. */
				continue;
			}

			/* Count how many others map onto this. */
			score = count_maps(tuple->src.ip, tuple->dst.ip,
					   tuple->dst.protonum, conntrack);
			if (score < best.score) {
				/* Optimization: doesn't get any better than
				   this. */
				if (score == 0)
					return (struct ip_nat_range *)
						&mr->range[i];

				best.score = score;
				best.tuple = *tuple;
				best.range = &mr->range[i];
			}
		}
	}
	*tuple = best.tuple;

	/* Discard const. */
	return (struct ip_nat_range *)best.range;
}

/* Fast version doesn't iterate through hash chains, but only handles
   common case of single IP address (null NAT, masquerade) */
static struct ip_nat_range *
find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
			 const struct ip_nat_multi_range *mr,
			 const struct ip_conntrack *conntrack,
			 unsigned int hooknum)
{
	if (mr->rangesize != 1
	    || (mr->range[0].flags & IP_NAT_RANGE_FULL)
	    || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
		&& mr->range[0].min_ip != mr->range[0].max_ip))
		return find_best_ips_proto(tuple, mr, conntrack, hooknum);

	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
		if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
			tuple->src.ip = mr->range[0].min_ip;
		else {
			/* Only do extra mangle when required (breaks
			   socket binding) */
			if (tuple->dst.ip != mr->range[0].min_ip
			    && hooknum == NF_IP_LOCAL_OUT
			    && !do_extra_mangle(mr->range[0].min_ip,
						&tuple->src.ip))
				return NULL;
			tuple->dst.ip = mr->range[0].min_ip;
		}
	}

	/* Discard const. */
	return (struct ip_nat_range *)&mr->range[0];
}

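/* Find a unique tuple for this conntrack within the given range:
 * returns 1 and fills in *tuple on success, 0 if every range part is
 * exhausted.  Caller holds ip_nat_lock for writing (we scribble on
 * the range flags below). */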
static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
		 const struct ip_conntrack_tuple *orig_tuple,
		 const struct ip_nat_multi_range *mrr,
		 struct ip_conntrack *conntrack,
		 unsigned int hooknum)
{
	struct ip_nat_protocol *proto
		= find_nat_proto(orig_tuple->dst.protonum);
	struct ip_nat_range *rptr;
	unsigned int i;
	int ret;

	/* We temporarily use flags for marking full parts, but we
	   always clean up afterwards */
	struct ip_nat_multi_range *mr = (void *)mrr;

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	   and that same mapping gives a unique tuple within the given
	   range, use that.

	   This is only required for source (ie. NAT/masq) mappings.
	   So far, we don't do local source mappings, so multiple
	   manips are not an issue.  */
	if (hooknum == NF_IP_POST_ROUTING) {
		struct ip_conntrack_manip *manip;

		manip = find_appropriate_src(orig_tuple, mr);
		if (manip) {
			/* Apply same source manipulation. */
			*tuple = ((struct ip_conntrack_tuple)
				  { *manip, orig_tuple->dst });
			DEBUGP("get_unique_tuple: Found current src map\n");
			return 1;
		}
	}

	/* 2) Select the least-used IP/proto combination in the
	   given range. */
	*tuple = *orig_tuple;
	while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
	       != NULL) {
		DEBUGP("Found best for "); DUMP_TUPLE(tuple);
		/* 3) The per-protocol part of the manip is made to
		   map into the range to make a unique tuple. */

		/* Only bother mapping if it's not already in range
		   and unique */
		if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
		     || proto->in_range(tuple, HOOK2MANIP(hooknum),
					&rptr->min, &rptr->max))
		    && !ip_nat_used_tuple(tuple, conntrack)) {
			ret = 1;
			goto clear_fulls;
		} else {
			if (proto->unique_tuple(tuple, rptr,
						HOOK2MANIP(hooknum),
						conntrack)) {
				/* Must be unique. */
				IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
								conntrack));
				ret = 1;
				goto clear_fulls;
			} else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
				/* Try implicit source NAT; protocol
				   may be able to play with ports to
				   make it unique. */
				struct ip_nat_range r
					= { IP_NAT_RANGE_MAP_IPS,
					    tuple->src.ip, tuple->src.ip,
					    { 0 }, { 0 } };
				DEBUGP("Trying implicit mapping\n");
				if (proto->unique_tuple(tuple, &r,
							IP_NAT_MANIP_SRC,
							conntrack)) {
					/* Must be unique. */
					IP_NF_ASSERT(!ip_nat_used_tuple
						     (tuple, conntrack));
					ret = 1;
					goto clear_fulls;
				}
			}
			DEBUGP("Protocol can't get unique tuple %u.\n",
			       hooknum);
		}

		/* Eliminate that from range, and try again. */
		rptr->flags |= IP_NAT_RANGE_FULL;
		*tuple = *orig_tuple;
	}

	ret = 0;

 clear_fulls:
	/* Clear full flags. */
	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++)
		mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

	return ret;
}

static inline int
helper_cmp(const struct ip_nat_helper *helper,
	   const struct ip_conntrack_tuple *tuple)
{
	return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
}

/* Where to manip the reply packets (will be reverse manip). */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
};
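
/* E.g. a source manip set up at POST_ROUTING is matched on replies by
 * a destination manip at PRE_ROUTING, and vice versa; the same pairing
 * holds for LOCAL_OUT/LOCAL_IN (see ip_nat_setup_info below). */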

unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
		  const struct ip_nat_multi_range *mr,
		  unsigned int hooknum)
{
	struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
	struct ip_conntrack_tuple orig_tp;
	struct ip_nat_info *info = &conntrack->nat.info;
	int in_hashes = info->initialized;

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
		     || hooknum == NF_IP_POST_ROUTING
		     || hooknum == NF_IP_LOCAL_IN
		     || hooknum == NF_IP_LOCAL_OUT);
	IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
	IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));

	/* What we've got will look like inverse of reply. Normally
	   this is what is in the conntrack, except for prior
	   manipulations (future optimization: if num_manips == 0,
	   orig_tp =
	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
	invert_tuplepr(&orig_tp,
		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

#if 0
	{
	unsigned int i;

	DEBUGP("Hook %u (%s), ", hooknum,
	       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
	DUMP_TUPLE(&orig_tp);
	DEBUGP("Range %p: ", mr);
	for (i = 0; i < mr->rangesize; i++) {
		DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
		       i,
		       (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
		       ? " MAP_IPS" : "",
		       (mr->range[i].flags
			& IP_NAT_RANGE_PROTO_SPECIFIED)
		       ? " PROTO_SPECIFIED" : "",
		       (mr->range[i].flags & IP_NAT_RANGE_FULL)
		       ? " FULL" : "",
		       NIPQUAD(mr->range[i].min_ip),
		       NIPQUAD(mr->range[i].max_ip),
		       mr->range[i].min.all,
		       mr->range[i].max.all);
	}
	}
#endif

	do {
		if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
				      hooknum)) {
			DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
			       conntrack);
			return NF_DROP;
		}

#if 0
		DEBUGP("Hook %u (%s) %p\n", hooknum,
		       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
		       conntrack);
		DEBUGP("Original: ");
		DUMP_TUPLE(&orig_tp);
		DEBUGP("New: ");
		DUMP_TUPLE(&new_tuple);
#endif

		/* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
		   the original (A/B/C/D') and the mangled one (E/F/G/H').

		   We're only allowed to work with the SRC per-proto
		   part, so we create inverses of both to start, then
		   derive the other fields we need.  */

		/* Reply connection: simply invert the new tuple
		   (G/H/E/F') */
		invert_tuplepr(&reply, &new_tuple);

		/* Alter conntrack table so it recognizes replies.
		   If we lose this race (reply tuple now used), repeat. */
	} while (!ip_conntrack_alter_reply(conntrack, &reply));

	/* FIXME: We can simply use the existing conntrack reply tuple
	   here --RR */
	/* Create inverse of original: C/D/A/B' */
	invert_tuplepr(&inv_tuple, &orig_tp);

	/* Has source changed? */
	if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_SRC, new_tuple.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a destination manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_DST, orig_tp.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* Has destination changed? */
	if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a destination manip */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_DST, reply.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_SRC, inv_tuple.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* If there's a helper, assign it, based on the new tuple. */
	if (!conntrack->master)
		info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
					 &reply);

	/* It's done. */
	info->initialized |= (1 << HOOK2MANIP(hooknum));

	if (in_hashes) {
		IP_NF_ASSERT(info->bysource.conntrack);
		replace_in_hashes(conntrack, info);
	} else {
		place_in_hashes(conntrack, info);
	}

	return NF_ACCEPT;
}

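/* The conntrack is already in the NAT hashes (a prior manip was set
 * up); its tuples may have changed, so re-bucket it under the fresh
 * hash values. */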
void replace_in_hashes(struct ip_conntrack *conntrack,
		       struct ip_nat_info *info)
{
	/* Source has changed, so replace in hashes. */
	unsigned int srchash
		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.src,
			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.dst.protonum);
	/* We place packet as seen OUTGOING in byips_proto hash
	   (ie. reverse dst and src of reply packet). */
	unsigned int ipsprotohash
		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.src.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.protonum);

	IP_NF_ASSERT(info->bysource.conntrack == conntrack);
	MUST_BE_WRITE_LOCKED(&ip_nat_lock);

	list_del(&info->bysource.list);
	list_del(&info->byipsproto.list);

	list_prepend(&bysource[srchash], &info->bysource);
	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}

void place_in_hashes(struct ip_conntrack *conntrack,
		     struct ip_nat_info *info)
{
	unsigned int srchash
		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.src,
			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.dst.protonum);
	/* We place packet as seen OUTGOING in byips_proto hash
	   (ie. reverse dst and src of reply packet). */
	unsigned int ipsprotohash
		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.src.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.protonum);

	IP_NF_ASSERT(!info->bysource.conntrack);

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	info->byipsproto.conntrack = conntrack;
	info->bysource.conntrack = conntrack;

	list_prepend(&bysource[srchash], &info->bysource);
	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}

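/* Rewrite one address of the IP header in place: the per-protocol
 * manip_pkt fixes up the transport header first, then we swap in the
 * new address and incrementally update the IP checksum with
 * ip_nat_cheat_check (no full recomputation needed). */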
static void
manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len,
	  const struct ip_conntrack_manip *manip,
	  enum ip_nat_manip_type maniptype,
	  __u32 *nfcache)
{
	*nfcache |= NFC_ALTERED;
	find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype);

	if (maniptype == IP_NAT_MANIP_SRC) {
		iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
						iph->check);
		iph->saddr = manip->ip;
	} else {
		iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
						iph->check);
		iph->daddr = manip->ip;
	}
#if 0
	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
		DEBUGP("IP: checksum on packet bad.\n");

	if (proto == IPPROTO_TCP) {
		void *th = (u_int32_t *)iph + iph->ihl;
		if (tcp_v4_check(th, len - 4*iph->ihl, iph->saddr, iph->daddr,
				 csum_partial((char *)th, len-4*iph->ihl, 0)))
			DEBUGP("TCP: checksum on packet bad\n");
	}
#endif
}

static inline int exp_for_packet(struct ip_conntrack_expect *exp,
			         struct sk_buff **pskb)
{
	struct ip_conntrack_protocol *proto;
	int ret = 1;

	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	proto = __ip_ct_find_proto((*pskb)->nh.iph->protocol);
	if (proto->exp_matches_pkt)
		ret = proto->exp_matches_pkt(exp, pskb);

	return ret;
}

/* Do packet manipulations according to binding. */
unsigned int
do_bindings(struct ip_conntrack *ct,
	    enum ip_conntrack_info ctinfo,
	    struct ip_nat_info *info,
	    unsigned int hooknum,
	    struct sk_buff **pskb)
{
	unsigned int i;
	struct ip_nat_helper *helper;
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	int is_tcp = (*pskb)->nh.iph->protocol == IPPROTO_TCP;

	/* Need the nat lock to protect against modification, but
	   neither the conntrack (referenced) nor the helper (deleted
	   with synchronize_bh()) can vanish. */
	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		/* raw socket (tcpdump) may have clone of incoming
		   skb: don't disturb it --RR */
		if (skb_cloned(*pskb) && !(*pskb)->sk) {
			struct sk_buff *nskb = skb_copy(*pskb, GFP_ATOMIC);
			if (!nskb) {
				READ_UNLOCK(&ip_nat_lock);
				return NF_DROP;
			}
			kfree_skb(*pskb);
			*pskb = nskb;
		}

		if (info->manips[i].direction == dir
		    && info->manips[i].hooknum == hooknum) {
			DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
			       *pskb,
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip),
			       htons(info->manips[i].manip.u.all));
			manip_pkt((*pskb)->nh.iph->protocol,
				  (*pskb)->nh.iph,
				  (*pskb)->len,
				  &info->manips[i].manip,
				  info->manips[i].maniptype,
				  &(*pskb)->nfcache);
		}
	}
	helper = info->helper;
	READ_UNLOCK(&ip_nat_lock);

	if (helper) {
		struct ip_conntrack_expect *exp = NULL;
		struct list_head *cur_item;
		int ret = NF_ACCEPT;
		int helper_called = 0;

		DEBUGP("do_bindings: helper existing for (%p)\n", ct);

		/* Always defragged for helpers */
		IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
			       & htons(IP_MF|IP_OFFSET)));

		/* Have to grab read lock before sibling_list traversal */
		READ_LOCK(&ip_conntrack_lock);
		list_for_each_prev(cur_item, &ct->sibling_list) {
			exp = list_entry(cur_item, struct ip_conntrack_expect,
					 expected_list);

			/* if this expectation is already established, skip */
			if (exp->sibling)
				continue;

			if (exp_for_packet(exp, pskb)) {
				/* FIXME: May be true multiple times in the
				 * case of UDP!! */
				DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
				ret = helper->help(ct, exp, info, ctinfo,
						   hooknum, pskb);
				if (ret != NF_ACCEPT) {
					READ_UNLOCK(&ip_conntrack_lock);
					return ret;
				}
				helper_called = 1;
			}
		}
		/* Helper might want to manip the packet even when there is no
		 * matching expectation for this packet */
		if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
			DEBUGP("calling nat helper for packet without expectation\n");
			ret = helper->help(ct, NULL, info, ctinfo,
					   hooknum, pskb);
			if (ret != NF_ACCEPT) {
				READ_UNLOCK(&ip_conntrack_lock);
				return ret;
			}
		}
		READ_UNLOCK(&ip_conntrack_lock);

		/* Adjust sequence number only once per packet
		 * (helper is called at all hooks) */
		if (is_tcp && (hooknum == NF_IP_POST_ROUTING
			       || hooknum == NF_IP_LOCAL_IN)) {
			DEBUGP("ip_nat_core: adjusting sequence number\n");
			/* future: put this in an l4-proto specific function,
			 * and call this function here. */
			ip_nat_seq_adjust(*pskb, ct, ctinfo);
		}

		return ret;

	} else
		return NF_ACCEPT;

	/* not reached */
}

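/* Does t1's source (ip plus per-proto part) match t2's destination?
 * Used below to check that a manip has actually been applied before
 * we reverse it for the packet embedded in an ICMP error. */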
static inline int tuple_src_equal_dst(const struct ip_conntrack_tuple *t1,
                                      const struct ip_conntrack_tuple *t2)
{
	if (t1->dst.protonum != t2->dst.protonum || t1->src.ip != t2->dst.ip)
		return 0;
	if (t1->dst.protonum != IPPROTO_ICMP)
		return t1->src.u.all == t2->dst.u.all;
	else {
		struct ip_conntrack_tuple inv;

		/* ICMP tuples are asymmetric */
		invert_tuplepr(&inv, t1);
		return inv.src.u.all == t2->src.u.all &&
		       inv.dst.u.all == t2->dst.u.all;
	}
}

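/* Translate an ICMP error in place: the embedded (inner) packet gets
 * each applicable manip applied with src/dst roles swapped (it was
 * never direction-reversed), the outer IP header is NATed like a
 * normal reply, and the ICMP checksum is recomputed at the end. */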
unsigned int
icmp_reply_translation(struct sk_buff *skb,
		       struct ip_conntrack *conntrack,
		       unsigned int hooknum,
		       int dir)
{
	struct iphdr *iph = skb->nh.iph;
	struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
	struct iphdr *inner = (struct iphdr *)(hdr + 1);
	size_t datalen = skb->len - ((void *)inner - (void *)iph);
	unsigned int i;
	struct ip_nat_info *info = &conntrack->nat.info;
	struct ip_conntrack_tuple *cttuple, innertuple;

	IP_NF_ASSERT(skb->len >= iph->ihl*4 + sizeof(struct icmphdr));
	/* Must be RELATED */
	IP_NF_ASSERT(skb->nfct
		     - ((struct ip_conntrack *)skb->nfct->master)->infos
		     == IP_CT_RELATED
		     || skb->nfct
		     - ((struct ip_conntrack *)skb->nfct->master)->infos
		     == IP_CT_RELATED+IP_CT_IS_REPLY);

	/* Redirects on non-null nats must be dropped, else they'll
	   start talking to each other without our translation, and be
	   confused... --RR */
	if (hdr->type == ICMP_REDIRECT) {
		/* Don't care about races here. */
		if (info->initialized
		    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
		    || info->num_manips != 0)
			return NF_DROP;
	}

	DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
	       skb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
	/* Note: May not be from a NAT'd host, but probably safest to
	   do translation always as if it came from the host itself
	   (even though a "host unreachable" coming from the host
	   itself is a bit weird).

	   More explanation: some people use NAT for anonymizing.
	   Also, CERT recommends dropping all packets from private IP
	   addresses (although ICMP errors from internal links with
	   such addresses are not too uncommon, as Alan Cox points
	   out) */

	if (!ip_ct_get_tuple(inner, datalen, &innertuple,
	                     ip_ct_find_proto(inner->protocol)))
		return 0;
	cttuple = &conntrack->tuplehash[dir].tuple;

	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
		       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
		       "ORIG" : "REPLY", info->manips[i].hooknum);

		if (info->manips[i].direction != dir)
			continue;

		/* Mapping the inner packet is just like a normal packet, except
		 * it was never src/dst reversed, so where we would normally
		 * apply a dst manip, we apply a src, and vice versa. */

		/* Only true for forwarded packets, locally generated packets
		 * never hit PRE_ROUTING, we need to apply their PRE_ROUTING
		 * manips in LOCAL_OUT. */
		if (hooknum == NF_IP_LOCAL_OUT &&
		    info->manips[i].hooknum == NF_IP_PRE_ROUTING)
			hooknum = info->manips[i].hooknum;

		if (info->manips[i].hooknum != hooknum)
			continue;

		/* ICMP errors may be generated locally for packets that
		 * don't have all NAT manips applied yet. Verify manips
		 * have been applied before reversing them */
		if (info->manips[i].maniptype == IP_NAT_MANIP_SRC) {
			if (!tuple_src_equal_dst(cttuple, &innertuple))
				continue;
		} else {
			if (!tuple_src_equal_dst(&innertuple, cttuple))
				continue;
		}

		DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
		       info->manips[i].maniptype == IP_NAT_MANIP_SRC
		       ? "DST" : "SRC", NIPQUAD(info->manips[i].manip.ip),
		       ntohs(info->manips[i].manip.u.udp.port));
		manip_pkt(inner->protocol, inner,
			  skb->len - ((void *)inner - (void *)iph),
			  &info->manips[i].manip, !info->manips[i].maniptype,
			  &skb->nfcache);
		/* Outer packet needs to have IP header NATed like
		   it's a reply. */

		/* Use mapping to map outer packet: 0 gives no
		   per-proto mapping */
		DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
		       info->manips[i].maniptype == IP_NAT_MANIP_SRC
		       ? "SRC" : "DST", NIPQUAD(info->manips[i].manip.ip));
		manip_pkt(0, iph, skb->len, &info->manips[i].manip,
			  info->manips[i].maniptype, &skb->nfcache);
	}
	READ_UNLOCK(&ip_nat_lock);

	/* Since we mangled inside the ICMP packet, recalculate its
	   checksum from scratch.  (Hence the handling of incorrect
	   checksums in conntrack, so we don't accidentally fix one.)  */
	hdr->checksum = 0;
	hdr->checksum = ip_compute_csum((unsigned char *)hdr,
					sizeof(*hdr) + datalen);

	return NF_ACCEPT;
}

int __init ip_nat_init(void)
{
	size_t i;

	/* Leave them the same for the moment. */
	ip_nat_htable_size = ip_conntrack_htable_size;

	/* One vmalloc for both hash tables */
	bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
	if (!bysource) {
		return -ENOMEM;
	}
	byipsproto = bysource + ip_nat_htable_size;

	/* Sew in builtin protocols. */
	WRITE_LOCK(&ip_nat_lock);
	list_append(&protos, &ip_nat_protocol_tcp);
	list_append(&protos, &ip_nat_protocol_udp);
	list_append(&protos, &ip_nat_protocol_icmp);
	WRITE_UNLOCK(&ip_nat_lock);

	for (i = 0; i < ip_nat_htable_size; i++) {
		INIT_LIST_HEAD(&bysource[i]);
		INIT_LIST_HEAD(&byipsproto[i]);
	}

	/* FIXME: Man, this is a hack.  <SIGH> */
	IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
	ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;

	return 0;
}

/* Clear NAT section of all conntracks, in case we're loaded again. */
static int clean_nat(struct ip_conntrack *i, void *data)
{
	memset(&i->nat, 0, sizeof(i->nat));
	return 0;
}

/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
void ip_nat_cleanup(void)
{
	ip_ct_iterate_cleanup(&clean_nat, NULL);
	ip_conntrack_destroyed = NULL;
	vfree(bysource);
}