/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the Netfilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version:     $Id: ip_vs_conn.c,v 1.28.2.5 2003/08/09 13:27:08 wensong Exp $
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Much of the code here was taken from the IP MASQ code of
 * kernel 2.2.
 *
 * Changes:
 *
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/ip.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <linux/in.h>
#include <linux/proc_fs.h>              /* for proc_net_* */
#include <asm/softirq.h>                /* for local_bh_* */
#include <net/ip.h>
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>                  /* for ip_route_output */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/jhash.h>
#include <linux/random.h>

#include <net/ip_vs.h>


/*
 *  Connection hash table: for input and output packets lookups of IPVS
 */
static struct list_head *ip_vs_conn_tab;

/* SLAB cache for IPVS connections */
static kmem_cache_t *ip_vs_conn_cachep;

/* counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);

/* counter for no-client-port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);

/* random value for IPVS connection hash */
static unsigned int ip_vs_conn_rnd;

/*
 *  Fine locking granularity for big connection hash table
 */
#define CT_LOCKARRAY_BITS  4
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)

struct ip_vs_aligned_lock
{
	rwlock_t	l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table */
struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;

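/*
 * Each hash key maps onto one of CT_LOCKARRAY_SIZE stripe locks, so
 * several hash buckets share one lock; this keeps the lock array small
 * while each lock still sits on its own cache line.
 */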
static inline void ct_read_lock(unsigned key)
{
	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock(unsigned key)
{
	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock(unsigned key)
{
	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock(unsigned key)
{
	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_lock_bh(unsigned key)
{
	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock_bh(unsigned key)
{
	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock_bh(unsigned key)
{
	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock_bh(unsigned key)
{
	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}


/*
 *	Returns hash value for IPVS connection entry
 */
static unsigned
ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
{
	return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
		& IP_VS_CONN_TAB_MASK;
}
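
/*
 * Note that connections are always hashed by their client end
 * (caddr/cport): the input lookup below hashes the packet source and
 * the output lookup hashes the packet destination, both of which are
 * the client side of the connection.
 */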


/*
 *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
 *	returns bool success.
 */
static int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
	unsigned hash;
	int ret;

	/* Hash by protocol, client address and port */
	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);

	ct_write_lock(hash);

	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
		cp->flags |= IP_VS_CONN_F_HASHED;
		atomic_inc(&cp->refcnt);
		ret = 1;
	} else {
		IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
			  "called from %p\n", __builtin_return_address(0));
		ret = 0;
	}

	ct_write_unlock(hash);

	return ret;
}


/*
 *	UNhashes ip_vs_conn from ip_vs_conn_tab.
 *	returns bool success.
 */
static int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
	unsigned hash;
	int ret;

	/* unhash it and decrease its reference counter */
	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
	ct_write_lock(hash);

	if (cp->flags & IP_VS_CONN_F_HASHED) {
		list_del(&cp->c_list);
		cp->flags &= ~IP_VS_CONN_F_HASHED;
		atomic_dec(&cp->refcnt);
		ret = 1;
	} else
		ret = 0;

	ct_write_unlock(hash);

	return ret;
}


/*
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 *  Called for pkts coming from OUTside-to-INside.
 *	s_addr, s_port: pkt source address (foreign host)
 *	d_addr, d_port: pkt dest address (load balancer)
 */
static inline struct ip_vs_conn *__ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	unsigned hash;
	struct ip_vs_conn *cp;
	struct list_head *l,*e;

	hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
	l = &ip_vs_conn_tab[hash];

	ct_read_lock(hash);

	for (e=l->next; e!=l; e=e->next) {
		cp = list_entry(e, struct ip_vs_conn, c_list);
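		/* The XOR term lets a zero s_port match only entries
		 * flagged NO_CPORT, and a non-zero s_port match only
		 * entries holding a real client port. */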
		if (s_addr==cp->caddr && s_port==cp->cport &&
		    d_port==cp->vport && d_addr==cp->vaddr &&
		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
		    protocol==cp->protocol) {
			/* HIT */
			atomic_inc(&cp->refcnt);
			ct_read_unlock(hash);
			return cp;
		}
	}

	ct_read_unlock(hash);

	return NULL;
}

struct ip_vs_conn *ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	struct ip_vs_conn *cp;

	cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
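	/* Retry with a wildcard client port, but only when no-client-port
	 * entries actually exist, to avoid a useless second lookup. */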
	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
		cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);

	IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
		  ip_vs_proto_name(protocol),
		  NIPQUAD(s_addr), ntohs(s_port),
		  NIPQUAD(d_addr), ntohs(d_port),
		  cp?"hit":"not hit");

	return cp;
}

/* Get reference to connection template */
struct ip_vs_conn *ip_vs_ct_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	unsigned hash;
	struct ip_vs_conn *cp;

	hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);

	ct_read_lock(hash);

	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
		if (s_addr==cp->caddr && s_port==cp->cport &&
		    d_port==cp->vport && d_addr==cp->vaddr &&
		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
		    protocol==cp->protocol) {
			/* HIT */
			atomic_inc(&cp->refcnt);
			goto out;
		}
	}
	cp = NULL;

  out:
	ct_read_unlock(hash);

	IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
		  ip_vs_proto_name(protocol),
		  NIPQUAD(s_addr), ntohs(s_port),
		  NIPQUAD(d_addr), ntohs(d_port),
		  cp?"hit":"not hit");

	return cp;
}

/*
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 *  Called for pkts coming from inside-to-OUTside.
 *	s_addr, s_port: pkt source address (inside host)
 *	d_addr, d_port: pkt dest address (foreign host)
 */
struct ip_vs_conn *ip_vs_conn_out_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	unsigned hash;
	struct ip_vs_conn *cp, *ret=NULL;
	struct list_head *l,*e;

	/*
	 *	Check for "full" addressed entries
	 */
	hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
	l = &ip_vs_conn_tab[hash];

	ct_read_lock(hash);

	for (e=l->next; e!=l; e=e->next) {
		cp = list_entry(e, struct ip_vs_conn, c_list);
		if (d_addr == cp->caddr && d_port == cp->cport &&
		    s_port == cp->dport && s_addr == cp->daddr &&
		    protocol == cp->protocol) {
			/* HIT */
			atomic_inc(&cp->refcnt);
			ret = cp;
			break;
		}
	}

	ct_read_unlock(hash);

	IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
		  ip_vs_proto_name(protocol),
		  NIPQUAD(s_addr), ntohs(s_port),
		  NIPQUAD(d_addr), ntohs(d_port),
		  ret?"hit":"not hit");

	return ret;
}


/*
 *	Put back the conn and restart its timer with its timeout
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* reset the timer so the conn expires after its timeout */
	mod_timer(&cp->timer, jiffies+cp->timeout);

	__ip_vs_conn_put(cp);
}


/*
 *	Timeout table[state]
 */
struct ip_vs_timeout_table vs_timeout_table = {
	ATOMIC_INIT(0),	/* refcnt */
	0,		/* scale  */
	{
		[IP_VS_S_NONE]		= 30*60*HZ,
		[IP_VS_S_ESTABLISHED]	= 15*60*HZ,
		[IP_VS_S_SYN_SENT]	= 2*60*HZ,
		[IP_VS_S_SYN_RECV]	= 1*60*HZ,
		[IP_VS_S_FIN_WAIT]	= 2*60*HZ,
		[IP_VS_S_TIME_WAIT]	= 2*60*HZ,
		[IP_VS_S_CLOSE]		= 10*HZ,
		[IP_VS_S_CLOSE_WAIT]	= 60*HZ,
		[IP_VS_S_LAST_ACK]	= 30*HZ,
		[IP_VS_S_LISTEN]	= 2*60*HZ,
		[IP_VS_S_SYNACK]	= 120*HZ,
		[IP_VS_S_UDP]		= 5*60*HZ,
		[IP_VS_S_ICMP]		= 1*60*HZ,
		[IP_VS_S_LAST]		= 2*HZ,
	},	/* timeout */
};


struct ip_vs_timeout_table vs_timeout_table_dos = {
	ATOMIC_INIT(0),	/* refcnt */
	0,		/* scale  */
	{
		[IP_VS_S_NONE]		= 15*60*HZ,
		[IP_VS_S_ESTABLISHED]	= 8*60*HZ,
		[IP_VS_S_SYN_SENT]	= 60*HZ,
		[IP_VS_S_SYN_RECV]	= 10*HZ,
		[IP_VS_S_FIN_WAIT]	= 60*HZ,
		[IP_VS_S_TIME_WAIT]	= 60*HZ,
		[IP_VS_S_CLOSE]		= 10*HZ,
		[IP_VS_S_CLOSE_WAIT]	= 60*HZ,
		[IP_VS_S_LAST_ACK]	= 30*HZ,
		[IP_VS_S_LISTEN]	= 2*60*HZ,
		[IP_VS_S_SYNACK]	= 100*HZ,
		[IP_VS_S_UDP]		= 3*60*HZ,
		[IP_VS_S_ICMP]		= 1*60*HZ,
		[IP_VS_S_LAST]		= 2*HZ,
	},	/* timeout */
};


/*
 *	Timeout table to use for the VS entries
 *	If NULL we use the default table (vs_timeout_table).
 *	Under flood attack we switch to vs_timeout_table_dos
 */

static struct ip_vs_timeout_table *ip_vs_timeout_table = &vs_timeout_table;

static const char * state_name_table[IP_VS_S_LAST+1] = {
	[IP_VS_S_NONE]		= "NONE",
	[IP_VS_S_ESTABLISHED]	= "ESTABLISHED",
	[IP_VS_S_SYN_SENT]	= "SYN_SENT",
	[IP_VS_S_SYN_RECV]	= "SYN_RECV",
	[IP_VS_S_FIN_WAIT]	= "FIN_WAIT",
	[IP_VS_S_TIME_WAIT]	= "TIME_WAIT",
	[IP_VS_S_CLOSE]		= "CLOSE",
	[IP_VS_S_CLOSE_WAIT]	= "CLOSE_WAIT",
	[IP_VS_S_LAST_ACK]	= "LAST_ACK",
	[IP_VS_S_LISTEN]	= "LISTEN",
	[IP_VS_S_SYNACK]	= "SYNACK",
	[IP_VS_S_UDP]		= "UDP",
	[IP_VS_S_ICMP]		= "ICMP",
	[IP_VS_S_LAST]		= "BUG!",
};

#define sNO IP_VS_S_NONE
#define sES IP_VS_S_ESTABLISHED
#define sSS IP_VS_S_SYN_SENT
#define sSR IP_VS_S_SYN_RECV
#define sFW IP_VS_S_FIN_WAIT
#define sTW IP_VS_S_TIME_WAIT
#define sCL IP_VS_S_CLOSE
#define sCW IP_VS_S_CLOSE_WAIT
#define sLA IP_VS_S_LAST_ACK
#define sLI IP_VS_S_LISTEN
#define sSA IP_VS_S_SYNACK

struct vs_tcp_states_t {
	int next_state[IP_VS_S_LAST];	/* should be _LAST_TCP */
};

const char * ip_vs_state_name(int state)
{
	if (state >= IP_VS_S_LAST)
		return "ERR!";
	return state_name_table[state] ? state_name_table[state] : "?";
}

static struct vs_tcp_states_t vs_tcp_states [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

static struct vs_tcp_states_t vs_tcp_states_dos [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

static struct vs_tcp_states_t *ip_vs_state_table = vs_tcp_states;

void ip_vs_secure_tcp_set(int on)
{
	if (on) {
		ip_vs_state_table = vs_tcp_states_dos;
		ip_vs_timeout_table = &vs_timeout_table_dos;
	} else {
		ip_vs_state_table = vs_tcp_states;
		ip_vs_timeout_table = &vs_timeout_table;
	}
}


static inline int vs_tcp_state_idx(struct tcphdr *th, int state_off)
{
	/*
	 *	[0-3]: input states, [4-7]: output, [8-11] input only states.
	 */
	if (th->rst)
		return state_off+3;
	if (th->syn)
		return state_off+0;
	if (th->fin)
		return state_off+1;
	if (th->ack)
		return state_off+2;
	return -1;
}


static inline int vs_set_state_timeout(struct ip_vs_conn *cp, int state)
{
	struct ip_vs_timeout_table *vstim = cp->timeout_table;

	/*
	 *	Use default timeout table if no specific for this entry
	 */
	if (!vstim)
		vstim = &vs_timeout_table;

	cp->timeout = vstim->timeout[cp->state=state];

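	/* A non-zero scale shifts every timeout by a power of two:
	 * a negative scale shortens the timeouts, a positive one
	 * lengthens them. */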
	if (vstim->scale) {
		int scale = vstim->scale;

		if (scale<0)
			cp->timeout >>= -scale;
		else if (scale > 0)
			cp->timeout <<= scale;
	}

	return state;
}


static inline int
vs_tcp_state(struct ip_vs_conn *cp, int state_off, struct tcphdr *th)
{
	int state_idx;
	int new_state = IP_VS_S_CLOSE;

	/*
	 *    Update state offset to INPUT_ONLY if necessary
	 *    or delete NO_OUTPUT flag if output packet detected
	 */
	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
		if (state_off == VS_STATE_OUTPUT)
			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
		else
			state_off = VS_STATE_INPUT_ONLY;
	}

	if ((state_idx = vs_tcp_state_idx(th, state_off)) < 0) {
		IP_VS_DBG(8, "vs_tcp_state_idx(%d)=%d!!!\n",
			  state_off, state_idx);
		goto tcp_state_out;
	}

	new_state = ip_vs_state_table[state_idx].next_state[cp->state];

  tcp_state_out:
	if (new_state != cp->state) {
		struct ip_vs_dest *dest = cp->dest;

		IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
			  "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
			  ip_vs_proto_name(cp->protocol),
			  (state_off==VS_STATE_OUTPUT)?"output ":"input ",
			  th->syn? 'S' : '.',
			  th->fin? 'F' : '.',
			  th->ack? 'A' : '.',
			  th->rst? 'R' : '.',
			  NIPQUAD(cp->daddr), ntohs(cp->dport),
			  NIPQUAD(cp->caddr), ntohs(cp->cport),
			  ip_vs_state_name(cp->state),
			  ip_vs_state_name(new_state),
			  atomic_read(&cp->refcnt));
		if (dest) {
			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
			    (new_state != IP_VS_S_ESTABLISHED)) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
				cp->flags |= IP_VS_CONN_F_INACTIVE;
			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
				   (new_state == IP_VS_S_ESTABLISHED)) {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
			}
		}
	}

	return vs_set_state_timeout(cp, new_state);
}


/*
 *	Handle state transitions
 */
int ip_vs_set_state(struct ip_vs_conn *cp,
		    int state_off, struct iphdr *iph, void *tp)
{
	int ret;

	spin_lock(&cp->lock);
	switch (iph->protocol) {
	case IPPROTO_TCP:
		ret = vs_tcp_state(cp, state_off, tp);
		break;
	case IPPROTO_UDP:
		ret = vs_set_state_timeout(cp, IP_VS_S_UDP);
		break;
	case IPPROTO_ICMP:
		ret = vs_set_state_timeout(cp, IP_VS_S_ICMP);
		break;
	default:
		ret = -1;
	}
	spin_unlock(&cp->lock);

	return ret;
}


/*
 *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 */
int ip_vs_conn_listen(struct ip_vs_conn *cp)
{
	vs_set_state_timeout(cp, IP_VS_S_LISTEN);
	return cp->timeout;
}


/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available; this may only be used in a transparent cache cluster.
 */
static int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;			/* Route to the other host */
	struct iphdr  *iph = skb->nh.iph;
	u8     tos = iph->tos;
	int    mtu;

	EnterFunction(10);

	if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(tos), 0)) {
		IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
			     "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
		goto tx_error_icmp;
	}

	/* MTU checking */
	mtu = rt->u.dst.pmtu;
	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
		ip_rt_put(rt);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
		IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
		goto tx_error;
	}

	/* update checksum because skb might be defragmented */
	ip_send_check(iph);

	if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
		if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
			ip_rt_put(rt);
			IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
			goto tx_error;
		}
	}

	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);

	LeaveFunction(10);
	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}


/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
static int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	return NF_ACCEPT;
}


/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 */
static int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;		/* Route to the other host */
	struct iphdr  *iph;
	union ip_vs_tphdr h;
	int ihl;
	unsigned short size;
	int mtu;

	EnterFunction(10);

	/*
	 *	If it has ip_vs_app helper, the helper may change the payload,
	 *	so it needs full checksum checking and checksum calculation.
	 *	If not, only the header (such as IP address and port number)
	 *	will be changed, so it is fast to do incremental checksum
	 *	update, and let the destination host do final checksum
	 *	checking.
	 */

	if (cp->app && skb_is_nonlinear(skb)
	    && skb_linearize(skb, GFP_ATOMIC) != 0)
		return NF_DROP;

	iph = skb->nh.iph;
	ihl = iph->ihl << 2;
	h.raw = (char*) iph + ihl;
	size = ntohs(iph->tot_len) - ihl;

	/* do TCP/UDP checksum checking if it has application helper */
	if (cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
		switch (skb->ip_summed) {
		case CHECKSUM_NONE:
			skb->csum = csum_partial(h.raw, size, 0);
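			/* fall through: verify the checksum we have
			 * just computed, same as the CHECKSUM_HW case */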
		case CHECKSUM_HW:
			if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
					      iph->protocol, skb->csum)) {
				IP_VS_DBG_RL("Incoming failed %s checksum "
					     "from %d.%d.%d.%d (size=%d)!\n",
					     ip_vs_proto_name(iph->protocol),
					     NIPQUAD(iph->saddr),
					     size);
				goto tx_error;
			}
			break;
		default:
			/* CHECKSUM_UNNECESSARY */
			break;
		}
	}

	/*
	 *	Check if it is a no-client-port connection ...
	 */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		if (ip_vs_conn_unhash(cp)) {
			spin_lock(&cp->lock);
			if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
				atomic_dec(&ip_vs_conn_no_cport_cnt);
				cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
				cp->cport = h.portp[0];
				IP_VS_DBG(10, "filled cport=%d\n",
					  ntohs(cp->cport));
			}
			spin_unlock(&cp->lock);

			/* hash on new cport */
			ip_vs_conn_hash(cp);
		}
	}

	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
		goto tx_error_icmp;

	/* MTU checking */
	mtu = rt->u.dst.pmtu;
	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
		ip_rt_put(rt);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
		IP_VS_DBG_RL("ip_vs_nat_xmit(): frag needed\n");
		goto tx_error;
	}

	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/* copy-on-write the packet before mangling it */
	if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
		return NF_DROP;

	/* mangle the packet */
	iph->daddr = cp->daddr;
	h.portp[1] = cp->dport;

	/*
	 *	Attempt ip_vs_app call.
	 *	will fix ip_vs_conn and iph ack_seq stuff
	 */
	if (ip_vs_app_pkt_in(cp, skb) != 0) {
		/* skb data has probably changed, update pointers */
		iph = skb->nh.iph;
		h.raw = (char*) iph + ihl;
		size = skb->len - ihl;
	}

	/*
	 *	Adjust TCP/UDP checksums
	 */
	if (!cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
		/* Only port and addr are changed, do fast csum update */
		ip_vs_fast_check_update(&h, cp->vaddr, cp->daddr,
					cp->vport, cp->dport, iph->protocol);
		if (skb->ip_summed == CHECKSUM_HW)
			skb->ip_summed = CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		switch (iph->protocol) {
		case IPPROTO_TCP:
			h.th->check = 0;
			h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
							size, iph->protocol,
							csum_partial(h.raw, size, 0));
			break;
		case IPPROTO_UDP:
			h.uh->check = 0;
			h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
							size, iph->protocol,
							csum_partial(h.raw, size, 0));
			if (h.uh->check == 0)
				h.uh->check = 0xFFFF;
			break;
		}
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	ip_send_check(iph);

	IP_VS_DBG(10, "NAT to %u.%u.%u.%u:%d\n",
		  NIPQUAD(iph->daddr), ntohs(h.portp[1]));

	/* FIXME: when application helper enlarges the packet and the length
	   is larger than the MTU of outgoing device, there will be still
	   MTU problem. */

#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);

	LeaveFunction(10);
	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}


/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly increase
 *   the scalability of the virtual server.
 */
static int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *old_iph = skb->nh.iph;
	u8     tos = old_iph->tos;
	u16    df = old_iph->frag_off;
	struct iphdr  *iph;			/* Our new IP header */
	int    max_headroom;			/* The extra header space needed */
	int    mtu;

	EnterFunction(10);

	if (skb->protocol != __constant_htons(ETH_P_IP)) {
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
			     "ETH_P_IP: %d, skb protocol: %d\n",
			     __constant_htons(ETH_P_IP), skb->protocol);
		goto tx_error;
	}

	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
		goto tx_error_icmp;

	tdev = rt->u.dst.dev;

	mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
	if (mtu < 68) {
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
		goto tx_error;
	}
	if (skb->dst && mtu < skb->dst->pmtu)
		skb->dst->pmtu = mtu;

	df |= (old_iph->frag_off&__constant_htons(IP_DF));

	if ((old_iph->frag_off&__constant_htons(IP_DF))
	    && mtu < ntohs(old_iph->tot_len)) {
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
		goto tx_error;
	}

	/* update checksum because skb might be defragmented */
	ip_send_check(old_iph);

	/*
	 *	Okay, now see if we can stuff it in the buffer as-is.
	 */
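	/* Round the link-layer header length up to a 16-byte boundary
	 * and add room for the outer IP header we are about to push. */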
	max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));

	if (skb_headroom(skb) < max_headroom
	    || skb_cloned(skb) || skb_shared(skb)) {
		struct sk_buff *new_skb =
			skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
			return NF_DROP;
		}
		kfree_skb(skb);
		skb = new_skb;
		old_iph = skb->nh.iph;
	}

	skb->h.raw = skb->nh.raw;
	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the IPIP header.
	 */
	iph = skb->nh.iph;
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr)>>2;
	iph->frag_off = df;
	iph->protocol = IPPROTO_IPIP;
	iph->tos = tos;
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;
	iph->ttl = old_iph->ttl;
	iph->tot_len = htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, NULL);
	ip_send_check(iph);

	skb->ip_summed = CHECKSUM_NONE;
#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);

	LeaveFunction(10);

	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}


/*
 *      Direct Routing transmitter
 */
static int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;			/* Route to the other host */
	struct iphdr  *iph = skb->nh.iph;
	int    mtu;

	EnterFunction(10);

	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
		goto tx_error_icmp;

	/* MTU checking */
	mtu = rt->u.dst.pmtu;
	if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
		goto tx_error;
	}

	/* update checksum because skb might be defragmented */
	ip_send_check(iph);

	if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
		if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
			ip_rt_put(rt);
			IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
			goto tx_error;
		}
	}

	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);

#if 0000
	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		do_ip_send);
#endif
	LeaveFunction(10);
	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}


/*
 *  Bind a connection entry with the corresponding packet_xmit.
 *  Called by ip_vs_conn_new.
 */
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit;
		break;

	case IP_VS_CONN_F_TUNNEL:
		cp->packet_xmit = ip_vs_tunnel_xmit;
		break;

	case IP_VS_CONN_F_DROUTE:
		cp->packet_xmit = ip_vs_dr_xmit;
		break;

	case IP_VS_CONN_F_LOCALNODE:
		cp->packet_xmit = ip_vs_null_xmit;
		break;

	case IP_VS_CONN_F_BYPASS:
		cp->packet_xmit = ip_vs_bypass_xmit;
		break;
	}
}


/*
 *  Bind a connection entry with a virtual service destination
 *  Called just after a new connection entry is created.
 */
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
	/* if dest is NULL, then return directly */
	if (!dest)
		return;

	/* Increase the refcnt counter of the dest */
	atomic_inc(&dest->refcnt);

	/* Bind with the destination and its corresponding transmitter */
	cp->flags |= atomic_read(&dest->conn_flags);
	cp->dest = dest;

	IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
		  "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
		  ip_vs_proto_name(cp->protocol),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
		  cp->flags, atomic_read(&cp->refcnt),
		  atomic_read(&dest->refcnt));
}


/*
 *  Unbind a connection entry with its VS destination
 *  Called by the ip_vs_conn_expire function.
 */
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest = cp->dest;

	/* if dest is NULL, then return directly */
	if (!dest)
		return;

	IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d "
		  "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c "
		  "s:%s flg:%X cnt:%d destcnt:%d\n",
		  ip_vs_proto_name(cp->protocol),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
		  cp->flags, atomic_read(&cp->refcnt),
		  atomic_read(&dest->refcnt));

	/*
	 * Decrease the inactconns or activeconns counter
	 * if it is not a connection template
	 */
	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
			atomic_dec(&dest->inactconns);
		} else {
			atomic_dec(&dest->activeconns);
		}
	}

	/*
	 * Simply decrease the refcnt of the dest, because the
	 * dest will be either in service's destination list
	 * or in the trash.
	 */
	atomic_dec(&dest->refcnt);
}


/*
 *  Checking if the destination of a connection template is available.
 *  If available, return 1, otherwise invalidate this connection
 *  template and return 0.
 */
int ip_vs_check_template(struct ip_vs_conn *ct)
{
	struct ip_vs_dest *dest = ct->dest;

	/*
	 * Checking the dest server status.
	 */
	if ((dest == NULL) ||
	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
	    (sysctl_ip_vs_expire_quiescent_template &&
	     (atomic_read(&dest->weight) == 0))) {
		IP_VS_DBG(9, "check_template: dest not available for "
			  "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
			  "-> d:%u.%u.%u.%u:%d\n",
			  ip_vs_proto_name(ct->protocol),
			  NIPQUAD(ct->caddr), ntohs(ct->cport),
			  NIPQUAD(ct->vaddr), ntohs(ct->vport),
			  NIPQUAD(ct->daddr), ntohs(ct->dport));

		/*
		 * Invalidate the connection template by rehashing it
		 * under reserved port numbers, so that normal lookups
		 * can no longer find it
		 */
		if (ct->vport != 65535) {
			if (ip_vs_conn_unhash(ct)) {
				ct->dport = 65535;
				ct->vport = 65535;
				ct->cport = 0;
				ip_vs_conn_hash(ct);
			}
		}

		/*
		 * Simply decrease the refcnt of the template,
		 * don't restart its timer.
		 */
		atomic_dec(&ct->refcnt);
		return 0;
	}
	return 1;
}


static inline void
ip_vs_timeout_attach(struct ip_vs_conn *cp, struct ip_vs_timeout_table *vstim)
{
	atomic_inc(&vstim->refcnt);
	cp->timeout_table = vstim;
}

static inline void ip_vs_timeout_detach(struct ip_vs_conn *cp)
{
	struct ip_vs_timeout_table *vstim = cp->timeout_table;

	if (!vstim)
		return;
	cp->timeout_table = NULL;
	atomic_dec(&vstim->refcnt);
}


static void ip_vs_conn_expire(unsigned long data)
{
	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;

	if (cp->timeout_table)
		cp->timeout = cp->timeout_table->timeout[IP_VS_S_TIME_WAIT];
	else
		cp->timeout = vs_timeout_table.timeout[IP_VS_S_TIME_WAIT];

	/*
	 *	hey, I'm using it
	 */
	atomic_inc(&cp->refcnt);

	/*
	 *	do I control anybody?
	 */
	if (atomic_read(&cp->n_control))
		goto expire_later;

	/*
	 *	unhash it if it is hashed in the conn table
	 */
	if (!ip_vs_conn_unhash(cp))
		goto expire_later;

	/*
	 *	refcnt==1 implies I'm the only referrer
	 */
	if (likely(atomic_read(&cp->refcnt) == 1)) {
		/* make sure that there is no timer on it now */
		if (timer_pending(&cp->timer))
			del_timer(&cp->timer);

		/* does anybody control me? */
		if (cp->control)
			ip_vs_control_del(cp);

		ip_vs_unbind_dest(cp);
		ip_vs_unbind_app(cp);
		ip_vs_timeout_detach(cp);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
			atomic_dec(&ip_vs_conn_no_cport_cnt);
		atomic_dec(&ip_vs_conn_count);

		kmem_cache_free(ip_vs_conn_cachep, cp);
		return;
	}

	/* hash it back to the table */
	ip_vs_conn_hash(cp);

  expire_later:
	IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
		  atomic_read(&cp->refcnt)-1,
		  atomic_read(&cp->n_control));

	ip_vs_conn_put(cp);
}


void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
	cp->timeout = 0;
	mod_timer(&cp->timer, jiffies);
}

/*
 *  Create a new connection entry and hash it into the ip_vs_conn_tab.
 */
struct ip_vs_conn *
ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
	       __u32 daddr, __u16 dport, unsigned flags,
	       struct ip_vs_dest *dest)
{
	struct ip_vs_conn *cp;

	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (cp == NULL) {
		IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
		return NULL;
	}

	memset(cp, 0, sizeof(*cp));
	INIT_LIST_HEAD(&cp->c_list);
	init_timer(&cp->timer);
	cp->timer.data = (unsigned long)cp;
	cp->timer.function = ip_vs_conn_expire;
	ip_vs_timeout_attach(cp, ip_vs_timeout_table);
	cp->protocol = proto;
	cp->caddr = caddr;
	cp->cport = cport;
	cp->vaddr = vaddr;
	cp->vport = vport;
	cp->daddr = daddr;
	cp->dport = dport;
	cp->flags = flags;
	cp->app_data = NULL;
	cp->control = NULL;
	cp->lock = SPIN_LOCK_UNLOCKED;

	atomic_set(&cp->n_control, 0);
	atomic_set(&cp->in_pkts, 0);

	atomic_inc(&ip_vs_conn_count);
	if (flags & IP_VS_CONN_F_NO_CPORT)
		atomic_inc(&ip_vs_conn_no_cport_cnt);

	/* Bind its application helper (only for VS/NAT) if any */
	ip_vs_bind_app(cp);

	/* Bind the connection with a destination server */
	ip_vs_bind_dest(cp, dest);

	/* Set its state and timeout */
	vs_set_state_timeout(cp, IP_VS_S_NONE);

	/* Bind its packet transmitter */
	ip_vs_bind_xmit(cp);

	/*
	 * Mark the entry as referenced by the current thread before
	 * hashing it into the table, so that another thread running
	 * ip_vs_random_dropentry cannot drop this entry.
	 */
	atomic_set(&cp->refcnt, 1);

	/* Hash it in the ip_vs_conn_tab finally */
	ip_vs_conn_hash(cp);

	return cp;
}


/*
 *	/proc/net/ip_vs_conn entries
 */
static int
ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length)
{
	off_t pos=0;
	int idx, len=0;
	char temp[70];
	struct ip_vs_conn *cp;
	struct list_head *l, *e;

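	/* every line of output, including the header, is padded to
	 * exactly 128 bytes ("%-127s\n"), which keeps the offset and
	 * length arithmetic below simple */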
	pos = 128;
	if (pos > offset) {
		len += sprintf(buffer+len, "%-127s\n",
			       "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires");
	}

	for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
		/*
		 * Lock is actually only needed in the next loop;
		 * we are called from userspace and must stop bottom halves.
		 */
		ct_read_lock_bh(idx);

		l = &ip_vs_conn_tab[idx];
		for (e=l->next; e!=l; e=e->next) {
			cp = list_entry(e, struct ip_vs_conn, c_list);
			pos += 128;
			if (pos <= offset)
				continue;
			sprintf(temp,
				"%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu",
				ip_vs_proto_name(cp->protocol),
				ntohl(cp->caddr), ntohs(cp->cport),
				ntohl(cp->vaddr), ntohs(cp->vport),
				ntohl(cp->daddr), ntohs(cp->dport),
				ip_vs_state_name(cp->state),
				(cp->timer.expires-jiffies)/HZ);
			len += sprintf(buffer+len, "%-127s\n", temp);
			if (pos >= offset+length) {
				ct_read_unlock_bh(idx);
				goto done;
			}
		}
		ct_read_unlock_bh(idx);
	}

  done:
	*start = buffer+len-(pos-offset);	/* Start of wanted data */
	len = pos-offset;
	if (len > length)
		len = length;
	if (len < 0)
		len = 0;
	return len;
}


/*
 *      Randomly drop connection entries before running out of memory
 */
static inline int todrop_entry(struct ip_vs_conn *cp)
{
	/*
	 * The drop rate array needs tuning for real environments.
	 * Called from timer bh only => no locking
	 */
	static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
	static char todrop_counter[9] = {0};
	int i;

	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
	   This will leave enough time for normal connections to get
	   through. */
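	/* cp->timeout + jiffies - cp->timer.expires is the time elapsed
	 * since the timer was last armed, i.e. the age of the entry */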
	if (cp->timeout+jiffies-cp->timer.expires < 60*HZ)
		return 0;

	/* Don't drop the entry if its number of incoming packets is not
	   located in [0, 8] */
	i = atomic_read(&cp->in_pkts);
	if (i > 8 || i < 0) return 0;

	if (!todrop_rate[i]) return 0;
	if (--todrop_counter[i] > 0) return 0;

	todrop_counter[i] = todrop_rate[i];
	return 1;
}


void ip_vs_random_dropentry(void)
{
	int idx;
	struct ip_vs_conn *cp;
	struct list_head *l,*e;

	/*
	 * Randomly scan 1/32 of the whole table every second
	 */
	for (idx=0; idx<(IP_VS_CONN_TAB_SIZE>>5); idx++) {
		unsigned hash = net_random()&IP_VS_CONN_TAB_MASK;

		/*
		 *  Lock is actually needed in this loop.
		 */
		ct_write_lock(hash);

		l = &ip_vs_conn_tab[hash];
		for (e=l->next; e!=l; e=e->next) {
			cp = list_entry(e, struct ip_vs_conn, c_list);
			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
				/* connection template */
				continue;
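			/* half-open entries are always drop candidates;
			 * established TCP and UDP entries are additionally
			 * rate-limited through todrop_entry() */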
			switch(cp->state) {
			case IP_VS_S_SYN_RECV:
			case IP_VS_S_SYNACK:
				break;

			case IP_VS_S_ESTABLISHED:
			case IP_VS_S_UDP:
				if (todrop_entry(cp))
					break;
				continue;

			default:
				continue;
			}

			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_expire_now(cp);
			if (cp->control) {
				IP_VS_DBG(4, "del conn template\n");
				ip_vs_conn_expire_now(cp->control);
			}
		}
		ct_write_unlock(hash);
	}
}


/*
 *      Flush all the connection entries in the ip_vs_conn_tab
 */
static void ip_vs_conn_flush(void)
{
	int idx;
	struct ip_vs_conn *cp;
	struct list_head *l,*e;

  flush_again:
	for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
		/*
		 *  Lock is actually needed in this loop.
		 */
		ct_write_lock_bh(idx);

		l = &ip_vs_conn_tab[idx];
		for (e=l->next; e!=l; e=e->next) {
			cp = list_entry(e, struct ip_vs_conn, c_list);

			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_expire_now(cp);
			if (cp->control) {
				IP_VS_DBG(4, "del conn template\n");
				ip_vs_conn_expire_now(cp->control);
			}
		}
		ct_write_unlock_bh(idx);
	}

	/* the counter may not be zero, because some conn entries may
	   still be run by slow timer handlers, or may be unhashed but
	   still referenced */
	if (atomic_read(&ip_vs_conn_count) != 0) {
		schedule();
		goto flush_again;
	}
}


int ip_vs_conn_init(void)
{
	int idx;

	/*
	 * Allocate the connection hash table and initialize its list heads
	 */
	ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
	if (!ip_vs_conn_tab)
		return -ENOMEM;

	IP_VS_INFO("Connection hash table configured "
		   "(size=%d, memory=%ldKbytes)\n",
		   IP_VS_CONN_TAB_SIZE,
		   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
	IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n",
		  sizeof(struct ip_vs_conn));

	for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
	}

	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
		__ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED;
	}

	/* Allocate ip_vs_conn slab cache */
	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
					      sizeof(struct ip_vs_conn), 0,
					      SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!ip_vs_conn_cachep) {
		vfree(ip_vs_conn_tab);
		return -ENOMEM;
	}

	proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo);

	/* calculate the random value for connection hash */
	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));

	return 0;
}

void ip_vs_conn_cleanup(void)
{
	/* flush all the connection entries first */
	ip_vs_conn_flush();

	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
	proc_net_remove("ip_vs_conn");
	vfree(ip_vs_conn_tab);
}