1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_conn.c,v 1.28.2.5 2003/08/09 13:27:08 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
22  *
23  * Changes:
24  *
25  */
26 
27 #include <linux/module.h>
28 #include <linux/kernel.h>
29 #include <linux/vmalloc.h>
30 #include <linux/ip.h>
31 #include <linux/tcp.h>                  /* for tcphdr */
32 #include <linux/in.h>
33 #include <linux/proc_fs.h>              /* for proc_net_* */
34 #include <asm/softirq.h>                /* for local_bh_* */
35 #include <net/ip.h>
36 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
37 #include <net/udp.h>
38 #include <net/icmp.h>                   /* for icmp_send */
39 #include <net/route.h>                  /* for ip_route_output */
40 #include <linux/netfilter.h>
41 #include <linux/netfilter_ipv4.h>
42 #include <linux/jhash.h>
43 #include <linux/random.h>
44 
45 #include <net/ip_vs.h>
46 
47 
/*
 *  Connection hash table: for input and output packets lookups of IPVS
 *  (one list head per bucket, indexed by ip_vs_conn_hashkey();
 *  allocated at module init — not visible in this chunk)
 */
static struct list_head *ip_vs_conn_tab;

/* SLAB cache for IPVS connections */
static kmem_cache_t *ip_vs_conn_cachep;

/* counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);

/* counter for no-client-port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);

/* random value for IPVS connection hash (seed for jhash_3words) */
static unsigned int ip_vs_conn_rnd;

/*
 *  Fine locking granularity for big connection hash table:
 *  buckets share CT_LOCKARRAY_SIZE rwlocks rather than one global lock.
 */
#define CT_LOCKARRAY_BITS  4
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)

/* an rwlock aligned to SMP_CACHE_BYTES so each lock sits in its own line */
struct ip_vs_aligned_lock
{
	rwlock_t	l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table */
struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
80 
ct_read_lock(unsigned key)81 static inline void ct_read_lock(unsigned key)
82 {
83 	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
84 }
85 
ct_read_unlock(unsigned key)86 static inline void ct_read_unlock(unsigned key)
87 {
88 	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
89 }
90 
ct_write_lock(unsigned key)91 static inline void ct_write_lock(unsigned key)
92 {
93 	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
94 }
95 
ct_write_unlock(unsigned key)96 static inline void ct_write_unlock(unsigned key)
97 {
98 	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
99 }
100 
ct_read_lock_bh(unsigned key)101 static inline void ct_read_lock_bh(unsigned key)
102 {
103 	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
104 }
105 
ct_read_unlock_bh(unsigned key)106 static inline void ct_read_unlock_bh(unsigned key)
107 {
108 	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
109 }
110 
ct_write_lock_bh(unsigned key)111 static inline void ct_write_lock_bh(unsigned key)
112 {
113 	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
114 }
115 
ct_write_unlock_bh(unsigned key)116 static inline void ct_write_unlock_bh(unsigned key)
117 {
118 	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
119 }
120 
121 
122 /*
123  *	Returns hash value for IPVS connection entry
124  */
125 static unsigned
ip_vs_conn_hashkey(unsigned proto,__u32 addr,__u16 port)126 ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
127 {
128 	return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
129 		& IP_VS_CONN_TAB_MASK;
130 }
131 
132 
133 /*
134  *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
135  *	returns bool success.
136  */
ip_vs_conn_hash(struct ip_vs_conn * cp)137 static int ip_vs_conn_hash(struct ip_vs_conn *cp)
138 {
139 	unsigned hash;
140 	int ret;
141 
142 	/* Hash by protocol, client address and port */
143 	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
144 
145 	ct_write_lock(hash);
146 
147 	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
148 		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
149 		cp->flags |= IP_VS_CONN_F_HASHED;
150 		atomic_inc(&cp->refcnt);
151 		ret = 1;
152 	} else {
153 		IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
154 			  "called from %p\n", __builtin_return_address(0));
155 		ret = 0;
156 	}
157 
158 	ct_write_unlock(hash);
159 
160 	return ret;
161 }
162 
163 
164 /*
165  *	UNhashes ip_vs_conn from ip_vs_conn_tab.
166  *	returns bool success.
167  */
ip_vs_conn_unhash(struct ip_vs_conn * cp)168 static int ip_vs_conn_unhash(struct ip_vs_conn *cp)
169 {
170 	unsigned hash;
171 	int ret;
172 
173 	/* unhash it and decrease its reference counter */
174 	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
175 	ct_write_lock(hash);
176 
177 	if (cp->flags & IP_VS_CONN_F_HASHED) {
178 		list_del(&cp->c_list);
179 		cp->flags &= ~IP_VS_CONN_F_HASHED;
180 		atomic_dec(&cp->refcnt);
181 		ret = 1;
182 	} else
183 		ret = 0;
184 
185 	ct_write_unlock(hash);
186 
187 	return ret;
188 }
189 
190 
191 /*
192  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
193  *  Called for pkts coming from OUTside-to-INside.
194  *	s_addr, s_port: pkt source address (foreign host)
195  *	d_addr, d_port: pkt dest address (load balancer)
196  */
__ip_vs_conn_in_get(int protocol,__u32 s_addr,__u16 s_port,__u32 d_addr,__u16 d_port)197 static inline struct ip_vs_conn *__ip_vs_conn_in_get
198 (int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
199 {
200 	unsigned hash;
201 	struct ip_vs_conn *cp;
202 	struct list_head *l,*e;
203 
204 	hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
205 	l = &ip_vs_conn_tab[hash];
206 
207 	ct_read_lock(hash);
208 
209 	for (e=l->next; e!=l; e=e->next) {
210 		cp = list_entry(e, struct ip_vs_conn, c_list);
211 		if (s_addr==cp->caddr && s_port==cp->cport &&
212 		    d_port==cp->vport && d_addr==cp->vaddr &&
213 		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
214 		    protocol==cp->protocol) {
215 			/* HIT */
216 			atomic_inc(&cp->refcnt);
217 			ct_read_unlock(hash);
218 			return cp;
219 		}
220 	}
221 
222 	ct_read_unlock(hash);
223 
224 	return NULL;
225 }
226 
/*
 *	Public outside-to-inside lookup: try an exact match first, then
 *	fall back to a wildcard client-port lookup when any no-cport
 *	connections exist.  Returns a referenced connection or NULL.
 */
struct ip_vs_conn *ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	struct ip_vs_conn *cp;

	cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
	if (cp == NULL && atomic_read(&ip_vs_conn_no_cport_cnt) != 0)
		cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);

	IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
		  ip_vs_proto_name(protocol),
		  NIPQUAD(s_addr), ntohs(s_port),
		  NIPQUAD(d_addr), ntohs(d_port),
		  cp ? "hit" : "not hit");

	return cp;
}
244 
245 /* Get reference to connection template */
ip_vs_ct_in_get(int protocol,__u32 s_addr,__u16 s_port,__u32 d_addr,__u16 d_port)246 struct ip_vs_conn *ip_vs_ct_in_get
247 (int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
248 {
249 	unsigned hash;
250 	struct ip_vs_conn *cp;
251 
252 	hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
253 
254 	ct_read_lock(hash);
255 
256 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
257 		if (s_addr==cp->caddr && s_port==cp->cport &&
258 		    d_port==cp->vport && d_addr==cp->vaddr &&
259 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
260 		    protocol==cp->protocol) {
261 			/* HIT */
262 			atomic_inc(&cp->refcnt);
263 			goto out;
264 		}
265 	}
266 	cp = NULL;
267 
268   out:
269 	ct_read_unlock(hash);
270 
271 	IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
272 		  ip_vs_proto_name(protocol),
273 		  NIPQUAD(s_addr), ntohs(s_port),
274 		  NIPQUAD(d_addr), ntohs(d_port),
275 		  cp?"hit":"not hit");
276 
277 	return cp;
278 }
279 
280 /*
281  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
282  *  Called for pkts coming from inside-to-OUTside.
283  *	s_addr, s_port: pkt source address (inside host)
284  *	d_addr, d_port: pkt dest address (foreign host)
285  */
ip_vs_conn_out_get(int protocol,__u32 s_addr,__u16 s_port,__u32 d_addr,__u16 d_port)286 struct ip_vs_conn *ip_vs_conn_out_get
287 (int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
288 {
289 	unsigned hash;
290 	struct ip_vs_conn *cp, *ret=NULL;
291 	struct list_head *l,*e;
292 
293 	/*
294 	 *	Check for "full" addressed entries
295 	 */
296 	hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
297 	l = &ip_vs_conn_tab[hash];
298 
299 	ct_read_lock(hash);
300 
301 	for (e=l->next; e!=l; e=e->next) {
302 		cp = list_entry(e, struct ip_vs_conn, c_list);
303 		if (d_addr == cp->caddr && d_port == cp->cport &&
304 		    s_port == cp->dport && s_addr == cp->daddr &&
305 		    protocol == cp->protocol) {
306 			/* HIT */
307 			atomic_inc(&cp->refcnt);
308 			ret = cp;
309 			break;
310 		}
311 	}
312 
313 	ct_read_unlock(hash);
314 
315 	IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
316 		  ip_vs_proto_name(protocol),
317 		  NIPQUAD(s_addr), ntohs(s_port),
318 		  NIPQUAD(d_addr), ntohs(d_port),
319 		  ret?"hit":"not hit");
320 
321 	return ret;
322 }
323 
324 
/*
 *      Put back the conn and restart its timer with its timeout
 *
 *      Re-arms cp->timer to expire cp->timeout jiffies from now,
 *      then drops the caller's reference via __ip_vs_conn_put().
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* reset it expire in its timeout */
	mod_timer(&cp->timer, jiffies+cp->timeout);

	__ip_vs_conn_put(cp);
}
335 
336 
/*
 *	Timeout table[state]
 *	Default per-state connection timeouts, in jiffies.
 */
struct ip_vs_timeout_table vs_timeout_table = {
	ATOMIC_INIT(0),	/* refcnt */
	0,		/* scale  */
	{
		[IP_VS_S_NONE]          =	30*60*HZ,
		[IP_VS_S_ESTABLISHED]	=	15*60*HZ,
		[IP_VS_S_SYN_SENT]	=	2*60*HZ,
		[IP_VS_S_SYN_RECV]	=	1*60*HZ,
		[IP_VS_S_FIN_WAIT]	=	2*60*HZ,
		[IP_VS_S_TIME_WAIT]	=	2*60*HZ,
		[IP_VS_S_CLOSE]         =	10*HZ,
		[IP_VS_S_CLOSE_WAIT]	=	60*HZ,
		[IP_VS_S_LAST_ACK]	=	30*HZ,
		[IP_VS_S_LISTEN]	=	2*60*HZ,
		[IP_VS_S_SYNACK]	=	120*HZ,
		[IP_VS_S_UDP]		=	5*60*HZ,
		[IP_VS_S_ICMP]          =	1*60*HZ,
		[IP_VS_S_LAST]          =	2*HZ,
	},	/* timeout */
};
360 
361 
/*
 *	Tightened per-state timeouts (jiffies) selected by
 *	ip_vs_secure_tcp_set() while under flood attack.
 */
struct ip_vs_timeout_table vs_timeout_table_dos = {
	ATOMIC_INIT(0),	/* refcnt */
	0,		/* scale  */
	{
		[IP_VS_S_NONE]          =	15*60*HZ,
		[IP_VS_S_ESTABLISHED]	=	8*60*HZ,
		[IP_VS_S_SYN_SENT]	=	60*HZ,
		[IP_VS_S_SYN_RECV]	=	10*HZ,
		[IP_VS_S_FIN_WAIT]	=	60*HZ,
		[IP_VS_S_TIME_WAIT]	=	60*HZ,
		[IP_VS_S_CLOSE]         =	10*HZ,
		[IP_VS_S_CLOSE_WAIT]	=	60*HZ,
		[IP_VS_S_LAST_ACK]	=	30*HZ,
		[IP_VS_S_LISTEN]	=	2*60*HZ,
		[IP_VS_S_SYNACK]	=	100*HZ,
		[IP_VS_S_UDP]		=	3*60*HZ,
		[IP_VS_S_ICMP]          =	1*60*HZ,
		[IP_VS_S_LAST]          =	2*HZ,
	},	/* timeout */
};
382 
383 
/*
 *	Timeout table to use for the VS entries
 *	If NULL we use the default table (vs_timeout_table).
 *	Under flood attack we switch to vs_timeout_table_dos
 */

static struct ip_vs_timeout_table *ip_vs_timeout_table = &vs_timeout_table;

/* Printable name for each IPVS connection state (see ip_vs_state_name) */
static const char * state_name_table[IP_VS_S_LAST+1] = {
	[IP_VS_S_NONE]          =	"NONE",
	[IP_VS_S_ESTABLISHED]	=	"ESTABLISHED",
	[IP_VS_S_SYN_SENT]	=	"SYN_SENT",
	[IP_VS_S_SYN_RECV]	=	"SYN_RECV",
	[IP_VS_S_FIN_WAIT]	=	"FIN_WAIT",
	[IP_VS_S_TIME_WAIT]	=	"TIME_WAIT",
	[IP_VS_S_CLOSE]         =	"CLOSE",
	[IP_VS_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
	[IP_VS_S_LAST_ACK]	=	"LAST_ACK",
	[IP_VS_S_LISTEN]	=	"LISTEN",
	[IP_VS_S_SYNACK]	=	"SYNACK",
	[IP_VS_S_UDP]		=	"UDP",
	[IP_VS_S_ICMP]          =	"ICMP",
	[IP_VS_S_LAST]          =	"BUG!",
};
408 
/* Short aliases for the TCP states, used in the transition tables below */
#define sNO IP_VS_S_NONE
#define sES IP_VS_S_ESTABLISHED
#define sSS IP_VS_S_SYN_SENT
#define sSR IP_VS_S_SYN_RECV
#define sFW IP_VS_S_FIN_WAIT
#define sTW IP_VS_S_TIME_WAIT
#define sCL IP_VS_S_CLOSE
#define sCW IP_VS_S_CLOSE_WAIT
#define sLA IP_VS_S_LAST_ACK
#define sLI IP_VS_S_LISTEN
#define sSA IP_VS_S_SYNACK

/* One row of a transition table: next state indexed by current state */
struct vs_tcp_states_t {
	int next_state[IP_VS_S_LAST];	/* should be _LAST_TCP */
};
424 
ip_vs_state_name(int state)425 const char * ip_vs_state_name(int state)
426 {
427 	if (state >= IP_VS_S_LAST)
428 		return "ERR!";
429 	return state_name_table[state] ? state_name_table[state] : "?";
430 }
431 
/*
 * Default TCP transition tables: three groups of four rows
 * (INPUT, OUTPUT, INPUT-ONLY) x (syn, fin, ack, rst); columns are
 * indexed by the connection's current state.
 */
static struct vs_tcp_states_t vs_tcp_states [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
454 
/*
 * More conservative transition tables selected by ip_vs_secure_tcp_set()
 * while under flood attack; same layout as vs_tcp_states.
 */
static struct vs_tcp_states_t vs_tcp_states_dos [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
477 
/* Active transition table; switched to the DoS table by ip_vs_secure_tcp_set() */
static struct vs_tcp_states_t *ip_vs_state_table = vs_tcp_states;
479 
ip_vs_secure_tcp_set(int on)480 void ip_vs_secure_tcp_set(int on)
481 {
482 	if (on) {
483 		ip_vs_state_table = vs_tcp_states_dos;
484 		ip_vs_timeout_table = &vs_timeout_table_dos;
485 	} else {
486 		ip_vs_state_table = vs_tcp_states;
487 		ip_vs_timeout_table = &vs_timeout_table;
488 	}
489 }
490 
491 
vs_tcp_state_idx(struct tcphdr * th,int state_off)492 static inline int vs_tcp_state_idx(struct tcphdr *th, int state_off)
493 {
494 	/*
495 	 *	[0-3]: input states, [4-7]: output, [8-11] input only states.
496 	 */
497 	if (th->rst)
498 		return state_off+3;
499 	if (th->syn)
500 		return state_off+0;
501 	if (th->fin)
502 		return state_off+1;
503 	if (th->ack)
504 		return state_off+2;
505 	return -1;
506 }
507 
508 
vs_set_state_timeout(struct ip_vs_conn * cp,int state)509 static inline int vs_set_state_timeout(struct ip_vs_conn *cp, int state)
510 {
511 	struct ip_vs_timeout_table *vstim = cp->timeout_table;
512 
513 	/*
514 	 *	Use default timeout table if no specific for this entry
515 	 */
516 	if (!vstim)
517 		vstim = &vs_timeout_table;
518 
519 	cp->timeout = vstim->timeout[cp->state=state];
520 
521 	if (vstim->scale) {
522 		int scale = vstim->scale;
523 
524 		if (scale<0)
525 			cp->timeout >>= -scale;
526 		else if (scale > 0)
527 			cp->timeout <<= scale;
528 	}
529 
530 	return state;
531 }
532 
533 
/*
 *	Compute the next TCP state for cp from the flags of header th,
 *	maintain the destination's active/inactive connection counters
 *	when ESTABLISHED status changes, and install the new state's
 *	timeout.  Returns the new state (via vs_set_state_timeout).
 */
static inline int
vs_tcp_state(struct ip_vs_conn *cp, int state_off, struct tcphdr *th)
{
	int state_idx;
	int new_state = IP_VS_S_CLOSE;

	/*
	 *    Update state offset to INPUT_ONLY if necessary
	 *    or delete NO_OUTPUT flag if output packet detected
	 */
	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
		if (state_off == VS_STATE_OUTPUT)
			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
		else
			state_off = VS_STATE_INPUT_ONLY;
	}

	/* no recognized TCP flag: keep the IP_VS_S_CLOSE default */
	if ((state_idx = vs_tcp_state_idx(th, state_off)) < 0) {
		IP_VS_DBG(8, "vs_tcp_state_idx(%d)=%d!!!\n",
			  state_off, state_idx);
		goto tcp_state_out;
	}

	new_state = ip_vs_state_table[state_idx].next_state[cp->state];

  tcp_state_out:
	if (new_state != cp->state) {
		struct ip_vs_dest *dest = cp->dest;

		IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
			  "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
			  ip_vs_proto_name(cp->protocol),
			  (state_off==VS_STATE_OUTPUT)?"output ":"input ",
			  th->syn? 'S' : '.',
			  th->fin? 'F' : '.',
			  th->ack? 'A' : '.',
			  th->rst? 'R' : '.',
			  NIPQUAD(cp->daddr), ntohs(cp->dport),
			  NIPQUAD(cp->caddr), ntohs(cp->cport),
			  ip_vs_state_name(cp->state),
			  ip_vs_state_name(new_state),
			  atomic_read(&cp->refcnt));
		if (dest) {
			/* leaving ESTABLISHED: move active -> inactive */
			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
			    (new_state != IP_VS_S_ESTABLISHED)) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
				cp->flags |= IP_VS_CONN_F_INACTIVE;
			/* entering ESTABLISHED: move inactive -> active */
			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
				   (new_state == IP_VS_S_ESTABLISHED)) {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
			}
		}
	}

	return vs_set_state_timeout(cp, new_state);
}
593 
594 
595 /*
596  *	Handle state transitions
597  */
ip_vs_set_state(struct ip_vs_conn * cp,int state_off,struct iphdr * iph,void * tp)598 int ip_vs_set_state(struct ip_vs_conn *cp,
599 		    int state_off, struct iphdr *iph, void *tp)
600 {
601 	int ret;
602 
603 	spin_lock(&cp->lock);
604 	switch (iph->protocol) {
605 	case IPPROTO_TCP:
606 		ret = vs_tcp_state(cp, state_off, tp);
607 		break;
608 	case IPPROTO_UDP:
609 		ret = vs_set_state_timeout(cp, IP_VS_S_UDP);
610 		break;
611 	case IPPROTO_ICMP:
612 		ret = vs_set_state_timeout(cp, IP_VS_S_ICMP);
613 		break;
614 	default:
615 		ret = -1;
616 	}
617 	spin_unlock(&cp->lock);
618 
619 	return ret;
620 }
621 
622 
/*
 *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 *
 *	Returns the new timeout value (jiffies).
 */
int ip_vs_conn_listen(struct ip_vs_conn *cp)
{
	vs_set_state_timeout(cp, IP_VS_S_LISTEN);
	return cp->timeout;
}
631 
632 
/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available, it may be only used in transparent cache cluster.
 *
 *      Routes the packet to its original destination and re-sends it.
 *      Always returns NF_STOLEN: the skb is either transmitted or freed.
 */
static int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;			/* Route to the other host */
	struct iphdr  *iph = skb->nh.iph;
	u8     tos = iph->tos;
	int    mtu;

	EnterFunction(10);

	/* route by the packet's own destination address */
	if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(tos), 0)) {
		IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
			     "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
		goto tx_error_icmp;
	}

	/* MTU checking */
	mtu = rt->u.dst.pmtu;
	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
		ip_rt_put(rt);
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
		goto tx_error;
	}

	/* update checksum because skb might be defragmented */
	ip_send_check(iph);

	/* make room for the output device's link-layer header if needed */
	if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
		if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
			ip_rt_put(rt);
			IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
			goto tx_error;
		}
	}

	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);

	LeaveFunction(10);
	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
692 
693 
/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 *
 *      Bound for the LOCALNODE forwarding method (see ip_vs_bind_xmit):
 *      the packet continues through the stack unmodified.
 */
static int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	return NF_ACCEPT;
}
701 
702 
703 /*
704  *      NAT transmitter (only for outside-to-inside nat forwarding)
705  */
ip_vs_nat_xmit(struct sk_buff * skb,struct ip_vs_conn * cp)706 static int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
707 {
708 	struct rtable *rt;		/* Route to the other host */
709 	struct iphdr  *iph;
710 	union ip_vs_tphdr h;
711 	int ihl;
712 	unsigned short size;
713 	int mtu;
714 
715 	EnterFunction(10);
716 
717 	/*
718 	 * If it has ip_vs_app helper, the helper may change the payload,
719 	 * so it needs full checksum checking and checksum calculation.
720 	 * If not, only the header (such as IP address and port number)
721 	 * will be changed, so it is fast to do incremental checksum update,
722 	 * and let the destination host  do final checksum checking.
723 	 */
724 
725 	if (cp->app && skb_is_nonlinear(skb)
726 	    && skb_linearize(skb, GFP_ATOMIC) != 0)
727 		return NF_DROP;
728 
729 	iph = skb->nh.iph;
730 	ihl = iph->ihl << 2;
731 	h.raw = (char*) iph + ihl;
732 	size = ntohs(iph->tot_len) - ihl;
733 
734 	/* do TCP/UDP checksum checking if it has application helper */
735 	if (cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
736 		switch (skb->ip_summed) {
737 		case CHECKSUM_NONE:
738 			skb->csum = csum_partial(h.raw, size, 0);
739 
740 		case CHECKSUM_HW:
741 			if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
742 					      iph->protocol, skb->csum)) {
743 				IP_VS_DBG_RL("Incoming failed %s checksum "
744 					     "from %d.%d.%d.%d (size=%d)!\n",
745 					     ip_vs_proto_name(iph->protocol),
746 					     NIPQUAD(iph->saddr),
747 					     size);
748 				goto tx_error;
749 			}
750 			break;
751 		default:
752 			/* CHECKSUM_UNNECESSARY */
753 			break;
754 		}
755 	}
756 
757 	/*
758 	 *  Check if it is no_cport connection ...
759 	 */
760 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
761 		if (ip_vs_conn_unhash(cp)) {
762 			spin_lock(&cp->lock);
763 			if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
764 				atomic_dec(&ip_vs_conn_no_cport_cnt);
765 				cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
766 				cp->cport = h.portp[0];
767 				IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->dport));
768 			}
769 			spin_unlock(&cp->lock);
770 
771 			/* hash on new dport */
772 			ip_vs_conn_hash(cp);
773 		}
774 	}
775 
776 	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
777 		goto tx_error_icmp;
778 
779 	/* MTU checking */
780 	mtu = rt->u.dst.pmtu;
781 	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
782 		ip_rt_put(rt);
783 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
784 		IP_VS_DBG_RL("ip_vs_nat_xmit(): frag needed\n");
785 		goto tx_error;
786 	}
787 
788 	/* drop old route */
789 	dst_release(skb->dst);
790 	skb->dst = &rt->u.dst;
791 
792 	/* copy-on-write the packet before mangling it */
793 	if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
794 		return NF_DROP;
795 
796 	/* mangle the packet */
797 	iph->daddr = cp->daddr;
798 	h.portp[1] = cp->dport;
799 
800 	/*
801 	 *	Attempt ip_vs_app call.
802 	 *	will fix ip_vs_conn and iph ack_seq stuff
803 	 */
804 	if (ip_vs_app_pkt_in(cp, skb) != 0) {
805 		/* skb data has probably changed, update pointers */
806 		iph = skb->nh.iph;
807 		h.raw = (char*) iph + ihl;
808 		size = skb->len - ihl;
809 	}
810 
811 	/*
812 	 *	Adjust TCP/UDP checksums
813 	 */
814 	if (!cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
815 		/* Only port and addr are changed, do fast csum update */
816 		ip_vs_fast_check_update(&h, cp->vaddr, cp->daddr,
817 					cp->vport, cp->dport, iph->protocol);
818 		if (skb->ip_summed == CHECKSUM_HW)
819 			skb->ip_summed = CHECKSUM_NONE;
820 	} else {
821 		/* full checksum calculation */
822 		switch (iph->protocol) {
823 		case IPPROTO_TCP:
824 			h.th->check = 0;
825 			h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
826 							size, iph->protocol,
827 							csum_partial(h.raw, size, 0));
828 			break;
829 		case IPPROTO_UDP:
830 			h.uh->check = 0;
831 			h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
832 							size, iph->protocol,
833 							csum_partial(h.raw, size, 0));
834 			if (h.uh->check == 0)
835 				h.uh->check = 0xFFFF;
836 			break;
837 		}
838 		skb->ip_summed = CHECKSUM_UNNECESSARY;
839 	}
840 	ip_send_check(iph);
841 
842 	IP_VS_DBG(10, "NAT to %u.%u.%u.%u:%d\n",
843 		  NIPQUAD(iph->daddr), ntohs(h.portp[1]));
844 
845 	/* FIXME: when application helper enlarges the packet and the length
846 	   is larger than the MTU of outgoing device, there will be still
847 	   MTU problem. */
848 
849 #ifdef CONFIG_NETFILTER_DEBUG
850 	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
851 #endif /* CONFIG_NETFILTER_DEBUG */
852 	skb->nfcache |= NFC_IPVS_PROPERTY;
853 	ip_send(skb);
854 
855 	LeaveFunction(10);
856 	return NF_STOLEN;
857 
858   tx_error_icmp:
859 	dst_link_failure(skb);
860   tx_error:
861 	kfree_skb(skb);
862 	return NF_STOLEN;
863 }
864 
865 
/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing the load balancer. This can greatly increase the
 *   scalability of virtual server.
 *
 *   Returns NF_STOLEN when the skb has been consumed (sent or freed),
 *   NF_DROP when headroom reallocation fails.
 */
static int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *old_iph = skb->nh.iph;
	u8     tos = old_iph->tos;
	u16    df = old_iph->frag_off;
	struct iphdr  *iph;			/* Our new IP header */
	int    max_headroom;			/* The extra header space needed */
	int    mtu;

	EnterFunction(10);

	/* only IPv4 payloads can be IPIP-encapsulated here */
	if (skb->protocol != __constant_htons(ETH_P_IP)) {
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
			     "ETH_P_IP: %d, skb protocol: %d\n",
			     __constant_htons(ETH_P_IP), skb->protocol);
		goto tx_error;
	}

	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
		goto tx_error_icmp;

	tdev = rt->u.dst.dev;

	/* account for the outer IP header we are about to add */
	mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
	if (mtu < 68) {
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
		goto tx_error;
	}
	if (skb->dst && mtu < skb->dst->pmtu)
		skb->dst->pmtu = mtu;

	df |= (old_iph->frag_off&__constant_htons(IP_DF));

	if ((old_iph->frag_off&__constant_htons(IP_DF))
	    && mtu < ntohs(old_iph->tot_len)) {
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
		goto tx_error;
	}

	/* update checksum because skb might be defragmented */
	ip_send_check(old_iph);

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));

	if (skb_headroom(skb) < max_headroom
	    || skb_cloned(skb) || skb_shared(skb)) {
		struct sk_buff *new_skb =
			skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
			return NF_DROP;
		}
		kfree_skb(skb);
		skb = new_skb;
		old_iph = skb->nh.iph;
	}

	/* old network header becomes the transport header of the tunnel */
	skb->h.raw = skb->nh.raw;
	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the IPIP header.
	 */
	iph			=	skb->nh.iph;
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr)>>2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_IPIP;
	iph->tos		=	tos;
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;
	iph->ttl		=	old_iph->ttl;
	iph->tot_len		=	htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, NULL);
	ip_send_check(iph);

	skb->ip_summed = CHECKSUM_NONE;
#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);

	LeaveFunction(10);

	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
990 
991 
/*
 *      Direct Routing transmitter
 *
 *      Forwards the packet unmodified (IP header untouched) out the
 *      route to the real server.  Always returns NF_STOLEN: the skb is
 *      either transmitted or freed.
 */
static int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;			/* Route to the other host */
	struct iphdr  *iph = skb->nh.iph;
	int    mtu;

	EnterFunction(10);

	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
		goto tx_error_icmp;

	/* MTU checking */
	mtu = rt->u.dst.pmtu;
	if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
		goto tx_error;
	}

	/* update checksum because skb might be defragmented */
	ip_send_check(iph);

	/* make room for the output device's link-layer header if needed */
	if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
		if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
			ip_rt_put(rt);
			IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
			goto tx_error;
		}
	}

	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);

#if 0000
	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		do_ip_send);
#endif
	LeaveFunction(10);
	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
1049 
1050 
1051 /*
1052  *  Bind a connection entry with the corresponding packet_xmit.
1053  *  Called by ip_vs_conn_new.
1054  */
ip_vs_bind_xmit(struct ip_vs_conn * cp)1055 static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
1056 {
1057 	switch (IP_VS_FWD_METHOD(cp)) {
1058 	case IP_VS_CONN_F_MASQ:
1059 		cp->packet_xmit = ip_vs_nat_xmit;
1060 		break;
1061 
1062 	case IP_VS_CONN_F_TUNNEL:
1063 		cp->packet_xmit = ip_vs_tunnel_xmit;
1064 		break;
1065 
1066 	case IP_VS_CONN_F_DROUTE:
1067 		cp->packet_xmit = ip_vs_dr_xmit;
1068 		break;
1069 
1070 	case IP_VS_CONN_F_LOCALNODE:
1071 		cp->packet_xmit = ip_vs_null_xmit;
1072 		break;
1073 
1074 	case IP_VS_CONN_F_BYPASS:
1075 		cp->packet_xmit = ip_vs_bypass_xmit;
1076 		break;
1077 	}
1078 }
1079 
1080 
1081 /*
1082  *  Bind a connection entry with a virtual service destination
1083  *  Called just after a new connection entry is created.
1084  */
1085 static inline void
ip_vs_bind_dest(struct ip_vs_conn * cp,struct ip_vs_dest * dest)1086 ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
1087 {
1088 	/* if dest is NULL, then return directly */
1089 	if (!dest)
1090 		return;
1091 
1092 	/* Increase the refcnt counter of the dest */
1093 	atomic_inc(&dest->refcnt);
1094 
1095 	/* Bind with the destination and its corresponding transmitter */
1096 	cp->flags |= atomic_read(&dest->conn_flags);
1097 	cp->dest = dest;
1098 
1099 	IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
1100 		  "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
1101 		  ip_vs_proto_name(cp->protocol),
1102 		  NIPQUAD(cp->caddr), ntohs(cp->cport),
1103 		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
1104 		  NIPQUAD(cp->daddr), ntohs(cp->dport),
1105 		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
1106 		  cp->flags, atomic_read(&cp->refcnt),
1107 		  atomic_read(&dest->refcnt));
1108 }
1109 
1110 
1111 /*
1112  *  Unbind a connection entry with its VS destination
1113  *  Called by the ip_vs_conn_expire function.
1114  */
ip_vs_unbind_dest(struct ip_vs_conn * cp)1115 static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
1116 {
1117 	struct ip_vs_dest *dest = cp->dest;
1118 
1119 	/* if dest is NULL, then return directly */
1120 	if (!dest)
1121 		return;
1122 
1123 	IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d "
1124 		  "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c "
1125 		  "s:%s flg:%X cnt:%d destcnt:%d\n",
1126 		  ip_vs_proto_name(cp->protocol),
1127 		  NIPQUAD(cp->caddr), ntohs(cp->cport),
1128 		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
1129 		  NIPQUAD(cp->daddr), ntohs(cp->dport),
1130 		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
1131 		  cp->flags, atomic_read(&cp->refcnt),
1132 		  atomic_read(&dest->refcnt));
1133 
1134 	/*
1135 	 * Decrease the inactconns or activeconns counter
1136 	 * if it is not a connection template
1137 	 */
1138 	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
1139 		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
1140 			atomic_dec(&dest->inactconns);
1141 		} else {
1142 			atomic_dec(&dest->activeconns);
1143 		}
1144 	}
1145 
1146 	/*
1147 	 * Simply decrease the refcnt of the dest, because the
1148 	 * dest will be either in service's destination list
1149 	 * or in the trash.
1150 	 */
1151 	atomic_dec(&dest->refcnt);
1152 }
1153 
1154 
/*
 *  Checking if the destination of a connection template is available.
 *  If available, return 1, otherwise invalidate this connection
 *  template and return 0 (the caller must then drop the template and
 *  schedule the client afresh).
 */
int ip_vs_check_template(struct ip_vs_conn *ct)
{
	struct ip_vs_dest *dest = ct->dest;

	/*
	 * Checking the dest server status: unusable when unbound,
	 * marked unavailable, or quiescent (weight 0) while the
	 * expire_quiescent_template sysctl is set.
	 */
	if ((dest == NULL) ||
	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
	    (sysctl_ip_vs_expire_quiescent_template &&
	     (atomic_read(&dest->weight) == 0))) {
		IP_VS_DBG(9, "check_template: dest not available for "
			  "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
			  "-> d:%u.%u.%u.%u:%d\n",
			  ip_vs_proto_name(ct->protocol),
			  NIPQUAD(ct->caddr), ntohs(ct->cport),
			  NIPQUAD(ct->vaddr), ntohs(ct->vport),
			  NIPQUAD(ct->daddr), ntohs(ct->dport));

		/*
		 * Invalidate the connection template: rewrite its ports
		 * to values no lookup can match.  65535 reads the same
		 * in host and network byte order, so no htons() needed.
		 * Rehash only if the unhash succeeds (someone else may
		 * hold it unhashed at this moment).
		 */
		if (ct->vport != 65535) {
			if (ip_vs_conn_unhash(ct)) {
				ct->dport = 65535;
				ct->vport = 65535;
				ct->cport = 0;
				ip_vs_conn_hash(ct);
			}
		}

		/*
		 * Simply decrease the refcnt of the template,
		 * don't restart its timer.
		 */
		atomic_dec(&ct->refcnt);
		return 0;
	}
	return 1;
}
1200 
1201 
/*
 *  Attach a timeout table to a connection, taking a reference so the
 *  table cannot go away while the connection still points at it.
 */
static inline void
ip_vs_timeout_attach(struct ip_vs_conn *cp, struct ip_vs_timeout_table *vstim)
{
	/* take the reference before publishing the pointer in cp */
	atomic_inc(&vstim->refcnt);
	cp->timeout_table = vstim;
}
1208 
ip_vs_timeout_detach(struct ip_vs_conn * cp)1209 static inline void ip_vs_timeout_detach(struct ip_vs_conn *cp)
1210 {
1211 	struct ip_vs_timeout_table *vstim = cp->timeout_table;
1212 
1213 	if (!vstim)
1214 		return;
1215 	cp->timeout_table = NULL;
1216 	atomic_dec(&vstim->refcnt);
1217 }
1218 
1219 
/*
 *  Timer handler for connection entries: tear the connection down if
 *  nothing references it any more, otherwise re-arm the timer for one
 *  more TIME_WAIT period.  `data' is the ip_vs_conn pointer cast to
 *  unsigned long (2.4 timer callback convention).
 */
static void ip_vs_conn_expire(unsigned long data)
{
	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;

	/* preset the retry interval in case we must expire later */
	if (cp->timeout_table)
		cp->timeout = cp->timeout_table->timeout[IP_VS_S_TIME_WAIT];
	else
		cp->timeout = vs_timeout_table.timeout[IP_VS_S_TIME_WAIT];

	/*
	 *	hey, I'm using it
	 */
	atomic_inc(&cp->refcnt);

	/*
	 *	do I control anybody?  (controlled connections still
	 *	point at this template, so it cannot be freed yet)
	 */
	if (atomic_read(&cp->n_control))
		goto expire_later;

	/*
	 *	unhash it if it is hashed in the conn table
	 *	(failure means someone else holds it unhashed)
	 */
	if (!ip_vs_conn_unhash(cp))
		goto expire_later;

	/*
	 *	refcnt==1 implies I'm the only one referrer
	 */
	if (likely(atomic_read(&cp->refcnt) == 1)) {
		/* make sure that there is no timer on it now */
		if (timer_pending(&cp->timer))
			del_timer(&cp->timer);

		/* does anybody control me? */
		if (cp->control)
			ip_vs_control_del(cp);

		/* release dest, app helper and timeout table in turn,
		   then the global counters, before freeing the entry */
		ip_vs_unbind_dest(cp);
		ip_vs_unbind_app(cp);
		ip_vs_timeout_detach(cp);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
			atomic_dec(&ip_vs_conn_no_cport_cnt);
		atomic_dec(&ip_vs_conn_count);

		kmem_cache_free(ip_vs_conn_cachep, cp);
		return;
	}

	/* still referenced elsewhere: hash it back to the table */
	ip_vs_conn_hash(cp);

  expire_later:
	IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
		  atomic_read(&cp->refcnt)-1,
		  atomic_read(&cp->n_control));

	/* drops our reference and restarts the timer with cp->timeout */
	ip_vs_conn_put(cp);
}
1279 
1280 
/*
 *  Schedule a connection for immediate expiration: zero the remaining
 *  timeout, then fire the timer right away.  The actual teardown (and
 *  all its refcnt/control checks) happens in ip_vs_conn_expire.
 */
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
	cp->timeout = 0;
	mod_timer(&cp->timer, jiffies);
}
1286 
/*
 *  Create a new connection entry and hash it into the ip_vs_conn_tab.
 *
 *  proto:        IP protocol (TCP/UDP) of the connection
 *  caddr/cport:  client address and port (network byte order)
 *  vaddr/vport:  virtual service address and port
 *  daddr/dport:  chosen destination (real server) address and port
 *  flags:        IP_VS_CONN_F_* connection flags
 *  dest:         destination to bind to, or NULL for an unbound entry
 *
 *  Returns the new entry with refcnt 1 held by the caller, or NULL
 *  when the slab allocation fails.  Safe in softirq context
 *  (GFP_ATOMIC allocation).
 */
struct ip_vs_conn *
ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
	       __u32 daddr, __u16 dport, unsigned flags,
	       struct ip_vs_dest *dest)
{
	struct ip_vs_conn *cp;

	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (cp == NULL) {
		IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
		return NULL;
	}

	/* zero everything, then fill in the identity fields and
	   install the expiry timer (not yet armed) */
	memset(cp, 0, sizeof(*cp));
	INIT_LIST_HEAD(&cp->c_list);
	init_timer(&cp->timer);
	cp->timer.data     = (unsigned long)cp;
	cp->timer.function = ip_vs_conn_expire;
	ip_vs_timeout_attach(cp, ip_vs_timeout_table);
	cp->protocol	   = proto;
	cp->caddr	   = caddr;
	cp->cport	   = cport;
	cp->vaddr	   = vaddr;
	cp->vport	   = vport;
	cp->daddr          = daddr;
	cp->dport          = dport;
	cp->flags	   = flags;
	cp->app_data	   = NULL;
	cp->control	   = NULL;
	cp->lock           = SPIN_LOCK_UNLOCKED;

	atomic_set(&cp->n_control, 0);
	atomic_set(&cp->in_pkts, 0);

	atomic_inc(&ip_vs_conn_count);
	if (flags & IP_VS_CONN_F_NO_CPORT)
		atomic_inc(&ip_vs_conn_no_cport_cnt);

	/* Bind its application helper (only for VS/NAT) if any */
	ip_vs_bind_app(cp);

	/* Bind the connection with a destination server */
	ip_vs_bind_dest(cp, dest);

	/* Set its state and timeout */
	vs_set_state_timeout(cp, IP_VS_S_NONE);

	/* Bind its packet transmitter (depends on flags set above) */
	ip_vs_bind_xmit(cp);

	/*
	 * Set the entry is referenced by the current thread before hashing
	 * it in the table, so that other thread run ip_vs_random_dropentry
	 * but cannot drop this entry.
	 */
	atomic_set(&cp->refcnt, 1);

	/* Hash it in the ip_vs_conn_tab finally */
	ip_vs_conn_hash(cp);

	return cp;
}
1352 
1353 
/*
 *	/proc/net/ip_vs_conn entries
 *
 *	Old-style proc get_info handler: produce the window of output
 *	starting at `offset', at most `length' bytes, into `buffer',
 *	and report via *start where the wanted data begins.  Every
 *	record (header line included) is padded to exactly 128 bytes
 *	so the offset arithmetic below stays simple.
 */
static int
ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length)
{
	off_t pos=0;
	int idx, len=0;
	char temp[70];
	struct ip_vs_conn *cp;
	struct list_head *l, *e;

	/* header line occupies the first 128-byte record */
	pos = 128;
	if (pos > offset) {
		len += sprintf(buffer+len, "%-127s\n",
			       "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires");
	}

	for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
		/*
		 *	Lock is actually only need in next loop
		 *	we are called from uspace: must stop bh.
		 */
		ct_read_lock_bh(idx);

		l = &ip_vs_conn_tab[idx];
		for (e=l->next; e!=l; e=e->next) {
			cp = list_entry(e, struct ip_vs_conn, c_list);
			pos += 128;
			/* skip records entirely before the window */
			if (pos <= offset)
				continue;
			sprintf(temp,
				"%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu",
				ip_vs_proto_name(cp->protocol),
				ntohl(cp->caddr), ntohs(cp->cport),
				ntohl(cp->vaddr), ntohs(cp->vport),
				ntohl(cp->daddr), ntohs(cp->dport),
				ip_vs_state_name(cp->state),
				(cp->timer.expires-jiffies)/HZ);
			len += sprintf(buffer+len, "%-127s\n", temp);
			/* stop once the requested window is filled */
			if (pos >= offset+length) {
				ct_read_unlock_bh(idx);
				goto done;
			}
		}
		ct_read_unlock_bh(idx);
	}

  done:
	*start = buffer+len-(pos-offset);       /* Start of wanted data */
	len = pos-offset;
	if (len > length)
		len = length;
	if (len < 0)
		len = 0;
	return len;
}
1411 
1412 
1413 /*
1414  *      Randomly drop connection entries before running out of memory
1415  */
todrop_entry(struct ip_vs_conn * cp)1416 static inline int todrop_entry(struct ip_vs_conn *cp)
1417 {
1418 	/*
1419 	 * The drop rate array needs tuning for real environments.
1420 	 * Called from timer bh only => no locking
1421 	 */
1422 	static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
1423 	static char todrop_counter[9] = {0};
1424 	int i;
1425 
1426 	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
1427 	   This will leave enough time for normal connection to get
1428 	   through. */
1429 	if (cp->timeout+jiffies-cp->timer.expires < 60*HZ)
1430 		return 0;
1431 
1432 	/* Don't drop the entry if its number of incoming packets is not
1433 	   located in [0, 8] */
1434 	i = atomic_read(&cp->in_pkts);
1435 	if (i > 8 || i < 0) return 0;
1436 
1437 	if (!todrop_rate[i]) return 0;
1438 	if (--todrop_counter[i] > 0) return 0;
1439 
1440 	todrop_counter[i] = todrop_rate[i];
1441 	return 1;
1442 }
1443 
1444 
ip_vs_random_dropentry(void)1445 void ip_vs_random_dropentry(void)
1446 {
1447 	int idx;
1448 	struct ip_vs_conn *cp;
1449 	struct list_head *l,*e;
1450 
1451 	/*
1452 	 * Randomly scan 1/32 of the whole table every second
1453 	 */
1454 	for (idx=0; idx<(IP_VS_CONN_TAB_SIZE>>5); idx++) {
1455 		unsigned hash = net_random()&IP_VS_CONN_TAB_MASK;
1456 
1457 		/*
1458 		 *  Lock is actually needed in this loop.
1459 		 */
1460 		ct_write_lock(hash);
1461 
1462 		l = &ip_vs_conn_tab[hash];
1463 		for (e=l->next; e!=l; e=e->next) {
1464 			cp = list_entry(e, struct ip_vs_conn, c_list);
1465 			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
1466 				/* connection template */
1467 				continue;
1468 			switch(cp->state) {
1469 			case IP_VS_S_SYN_RECV:
1470 			case IP_VS_S_SYNACK:
1471 				break;
1472 
1473 			case IP_VS_S_ESTABLISHED:
1474 			case IP_VS_S_UDP:
1475 				if (todrop_entry(cp))
1476 					break;
1477 				continue;
1478 
1479 			default:
1480 				continue;
1481 			}
1482 
1483 			IP_VS_DBG(4, "del connection\n");
1484 			ip_vs_conn_expire_now(cp);
1485 			if (cp->control) {
1486 				IP_VS_DBG(4, "del conn template\n");
1487 				ip_vs_conn_expire_now(cp->control);
1488 			}
1489 		}
1490 		ct_write_unlock(hash);
1491 	}
1492 }
1493 
1494 
1495 /*
1496  *      Flush all the connection entries in the ip_vs_conn_tab
1497  */
ip_vs_conn_flush(void)1498 static void ip_vs_conn_flush(void)
1499 {
1500 	int idx;
1501 	struct ip_vs_conn *cp;
1502 	struct list_head *l,*e;
1503 
1504   flush_again:
1505 	for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
1506 		/*
1507 		 *  Lock is actually needed in this loop.
1508 		 */
1509 		ct_write_lock_bh(idx);
1510 
1511 		l = &ip_vs_conn_tab[idx];
1512 		for (e=l->next; e!=l; e=e->next) {
1513 			cp = list_entry(e, struct ip_vs_conn, c_list);
1514 
1515 			IP_VS_DBG(4, "del connection\n");
1516 			ip_vs_conn_expire_now(cp);
1517 			if (cp->control) {
1518 				IP_VS_DBG(4, "del conn template\n");
1519 				ip_vs_conn_expire_now(cp->control);
1520 			}
1521 		}
1522 		ct_write_unlock_bh(idx);
1523 	}
1524 
1525 	/* the counter may be not NULL, because maybe some conn entries
1526 	   are run by slow timer handler or unhashed but still referred */
1527 	if (atomic_read(&ip_vs_conn_count) != 0) {
1528 		schedule();
1529 		goto flush_again;
1530 	}
1531 }
1532 
1533 
ip_vs_conn_init(void)1534 int ip_vs_conn_init(void)
1535 {
1536 	int idx;
1537 
1538 	/*
1539 	 * Allocate the connection hash table and initialize its list heads
1540 	 */
1541 	ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
1542 	if (!ip_vs_conn_tab)
1543 		return -ENOMEM;
1544 
1545 	IP_VS_INFO("Connection hash table configured "
1546 		   "(size=%d, memory=%ldKbytes)\n",
1547 		   IP_VS_CONN_TAB_SIZE,
1548 		   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
1549 	IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n",
1550 		  sizeof(struct ip_vs_conn));
1551 
1552 	for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
1553 		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
1554 	}
1555 
1556 	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
1557 		__ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED;
1558 	}
1559 
1560 	/* Allocate ip_vs_conn slab cache */
1561 	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
1562 					      sizeof(struct ip_vs_conn), 0,
1563 					      SLAB_HWCACHE_ALIGN, NULL, NULL);
1564 	if (!ip_vs_conn_cachep) {
1565 		vfree(ip_vs_conn_tab);
1566 		return -ENOMEM;
1567 	}
1568 
1569 	proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo);
1570 
1571 	/* calculate the random value for connection hash */
1572 	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
1573 
1574 	return 0;
1575 }
1576 
/*
 *  Release everything acquired by ip_vs_conn_init.  Must only run at
 *  module unload, after packet processing has stopped; the flush below
 *  blocks until every connection entry has been freed.
 */
void ip_vs_conn_cleanup(void)
{
	/* flush all the connection entries first */
	ip_vs_conn_flush();

	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
	proc_net_remove("ip_vs_conn");
	vfree(ip_vs_conn_tab);
}
1587