1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
20  *
21  * Changes:
22  *
23  */
24 
25 #define KMSG_COMPONENT "IPVS"
26 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
27 
28 #include <linux/interrupt.h>
29 #include <linux/in.h>
30 #include <linux/net.h>
31 #include <linux/kernel.h>
32 #include <linux/module.h>
33 #include <linux/vmalloc.h>
34 #include <linux/proc_fs.h>		/* for proc_net_* */
35 #include <linux/slab.h>
36 #include <linux/seq_file.h>
37 #include <linux/jhash.h>
38 #include <linux/random.h>
39 
40 #include <net/net_namespace.h>
41 #include <net/ip_vs.h>
42 
43 
44 #ifndef CONFIG_IP_VS_TAB_BITS
45 #define CONFIG_IP_VS_TAB_BITS	12
46 #endif
47 
48 /*
49  * Connection hash size. Default is what was selected at compile time.
50 */
51 static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
52 module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
53 MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
54 
55 /* size and mask values */
56 int ip_vs_conn_tab_size __read_mostly;
57 static int ip_vs_conn_tab_mask __read_mostly;
58 
59 /*
60  *  Connection hash table: for input and output packets lookups of IPVS
61  */
62 static struct hlist_head *ip_vs_conn_tab __read_mostly;
63 
64 /*  SLAB cache for IPVS connections */
65 static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
66 
67 /*  counter for no client port connections */
68 static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
69 
70 /* random value for IPVS connection hash */
71 static unsigned int ip_vs_conn_rnd __read_mostly;
72 
73 /*
74  *  Fine locking granularity for big connection hash table
75  */
76 #define CT_LOCKARRAY_BITS  5
77 #define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
78 #define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
79 
80 struct ip_vs_aligned_lock
81 {
82 	rwlock_t	l;
83 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
84 
85 /* lock array for conn table */
86 static struct ip_vs_aligned_lock
87 __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
88 
ct_read_lock(unsigned key)89 static inline void ct_read_lock(unsigned key)
90 {
91 	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
92 }
93 
ct_read_unlock(unsigned key)94 static inline void ct_read_unlock(unsigned key)
95 {
96 	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
97 }
98 
ct_write_lock(unsigned key)99 static inline void ct_write_lock(unsigned key)
100 {
101 	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
102 }
103 
ct_write_unlock(unsigned key)104 static inline void ct_write_unlock(unsigned key)
105 {
106 	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
107 }
108 
ct_read_lock_bh(unsigned key)109 static inline void ct_read_lock_bh(unsigned key)
110 {
111 	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
112 }
113 
ct_read_unlock_bh(unsigned key)114 static inline void ct_read_unlock_bh(unsigned key)
115 {
116 	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
117 }
118 
ct_write_lock_bh(unsigned key)119 static inline void ct_write_lock_bh(unsigned key)
120 {
121 	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
122 }
123 
ct_write_unlock_bh(unsigned key)124 static inline void ct_write_unlock_bh(unsigned key)
125 {
126 	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
127 }
128 
129 
130 /*
131  *	Returns hash value for IPVS connection entry
132  */
ip_vs_conn_hashkey(struct net * net,int af,unsigned proto,const union nf_inet_addr * addr,__be16 port)133 static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto,
134 				       const union nf_inet_addr *addr,
135 				       __be16 port)
136 {
137 #ifdef CONFIG_IP_VS_IPV6
138 	if (af == AF_INET6)
139 		return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
140 				    (__force u32)port, proto, ip_vs_conn_rnd) ^
141 			((size_t)net>>8)) & ip_vs_conn_tab_mask;
142 #endif
143 	return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
144 			    ip_vs_conn_rnd) ^
145 		((size_t)net>>8)) & ip_vs_conn_tab_mask;
146 }
147 
/*
 *	Hash a lookup parameter set.  With @inverse false the client
 *	address/port are hashed, with @inverse true the virtual ones.
 *	When persistence engine data is present and the engine supplies
 *	hashkey_raw(), that callback's value (masked) is used instead.
 */
static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
					     bool inverse)
{
	if (p->pe_data && p->pe->hashkey_raw)
		return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
			ip_vs_conn_tab_mask;

	if (unlikely(inverse))
		return ip_vs_conn_hashkey(p->net, p->af, p->protocol,
					  p->vaddr, p->vport);

	return ip_vs_conn_hashkey(p->net, p->af, p->protocol,
				  p->caddr, p->cport);
}
168 
ip_vs_conn_hashkey_conn(const struct ip_vs_conn * cp)169 static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
170 {
171 	struct ip_vs_conn_param p;
172 
173 	ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
174 			      &cp->caddr, cp->cport, NULL, 0, &p);
175 
176 	if (cp->pe) {
177 		p.pe = cp->pe;
178 		p.pe_data = cp->pe_data;
179 		p.pe_data_len = cp->pe_data_len;
180 	}
181 
182 	return ip_vs_conn_hashkey_param(&p, false);
183 }
184 
185 /*
186  *	Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
187  *	returns bool success.
188  */
ip_vs_conn_hash(struct ip_vs_conn * cp)189 static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
190 {
191 	unsigned hash;
192 	int ret;
193 
194 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
195 		return 0;
196 
197 	/* Hash by protocol, client address and port */
198 	hash = ip_vs_conn_hashkey_conn(cp);
199 
200 	ct_write_lock(hash);
201 	spin_lock(&cp->lock);
202 
203 	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
204 		hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
205 		cp->flags |= IP_VS_CONN_F_HASHED;
206 		atomic_inc(&cp->refcnt);
207 		ret = 1;
208 	} else {
209 		pr_err("%s(): request for already hashed, called from %pF\n",
210 		       __func__, __builtin_return_address(0));
211 		ret = 0;
212 	}
213 
214 	spin_unlock(&cp->lock);
215 	ct_write_unlock(hash);
216 
217 	return ret;
218 }
219 
220 
221 /*
222  *	UNhashes ip_vs_conn from ip_vs_conn_tab.
223  *	returns bool success.
224  */
ip_vs_conn_unhash(struct ip_vs_conn * cp)225 static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
226 {
227 	unsigned hash;
228 	int ret;
229 
230 	/* unhash it and decrease its reference counter */
231 	hash = ip_vs_conn_hashkey_conn(cp);
232 
233 	ct_write_lock(hash);
234 	spin_lock(&cp->lock);
235 
236 	if (cp->flags & IP_VS_CONN_F_HASHED) {
237 		hlist_del(&cp->c_list);
238 		cp->flags &= ~IP_VS_CONN_F_HASHED;
239 		atomic_dec(&cp->refcnt);
240 		ret = 1;
241 	} else
242 		ret = 0;
243 
244 	spin_unlock(&cp->lock);
245 	ct_write_unlock(hash);
246 
247 	return ret;
248 }
249 
250 
251 /*
252  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
253  *  Called for pkts coming from OUTside-to-INside.
254  *	p->caddr, p->cport: pkt source address (foreign host)
255  *	p->vaddr, p->vport: pkt dest address (load balancer)
256  */
257 static inline struct ip_vs_conn *
__ip_vs_conn_in_get(const struct ip_vs_conn_param * p)258 __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
259 {
260 	unsigned hash;
261 	struct ip_vs_conn *cp;
262 	struct hlist_node *n;
263 
264 	hash = ip_vs_conn_hashkey_param(p, false);
265 
266 	ct_read_lock(hash);
267 
268 	hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
269 		if (cp->af == p->af &&
270 		    p->cport == cp->cport && p->vport == cp->vport &&
271 		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
272 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
273 		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
274 		    p->protocol == cp->protocol &&
275 		    ip_vs_conn_net_eq(cp, p->net)) {
276 			/* HIT */
277 			atomic_inc(&cp->refcnt);
278 			ct_read_unlock(hash);
279 			return cp;
280 		}
281 	}
282 
283 	ct_read_unlock(hash);
284 
285 	return NULL;
286 }
287 
ip_vs_conn_in_get(const struct ip_vs_conn_param * p)288 struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
289 {
290 	struct ip_vs_conn *cp;
291 
292 	cp = __ip_vs_conn_in_get(p);
293 	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
294 		struct ip_vs_conn_param cport_zero_p = *p;
295 		cport_zero_p.cport = 0;
296 		cp = __ip_vs_conn_in_get(&cport_zero_p);
297 	}
298 
299 	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
300 		      ip_vs_proto_name(p->protocol),
301 		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
302 		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
303 		      cp ? "hit" : "not hit");
304 
305 	return cp;
306 }
307 
308 static int
ip_vs_conn_fill_param_proto(int af,const struct sk_buff * skb,const struct ip_vs_iphdr * iph,unsigned int proto_off,int inverse,struct ip_vs_conn_param * p)309 ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
310 			    const struct ip_vs_iphdr *iph,
311 			    unsigned int proto_off, int inverse,
312 			    struct ip_vs_conn_param *p)
313 {
314 	__be16 _ports[2], *pptr;
315 	struct net *net = skb_net(skb);
316 
317 	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
318 	if (pptr == NULL)
319 		return 1;
320 
321 	if (likely(!inverse))
322 		ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
323 				      pptr[0], &iph->daddr, pptr[1], p);
324 	else
325 		ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
326 				      pptr[1], &iph->saddr, pptr[0], p);
327 	return 0;
328 }
329 
330 struct ip_vs_conn *
ip_vs_conn_in_get_proto(int af,const struct sk_buff * skb,const struct ip_vs_iphdr * iph,unsigned int proto_off,int inverse)331 ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
332 			const struct ip_vs_iphdr *iph,
333 			unsigned int proto_off, int inverse)
334 {
335 	struct ip_vs_conn_param p;
336 
337 	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
338 		return NULL;
339 
340 	return ip_vs_conn_in_get(&p);
341 }
342 EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
343 
344 /* Get reference to connection template */
ip_vs_ct_in_get(const struct ip_vs_conn_param * p)345 struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
346 {
347 	unsigned hash;
348 	struct ip_vs_conn *cp;
349 	struct hlist_node *n;
350 
351 	hash = ip_vs_conn_hashkey_param(p, false);
352 
353 	ct_read_lock(hash);
354 
355 	hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
356 		if (!ip_vs_conn_net_eq(cp, p->net))
357 			continue;
358 		if (p->pe_data && p->pe->ct_match) {
359 			if (p->pe == cp->pe && p->pe->ct_match(p, cp))
360 				goto out;
361 			continue;
362 		}
363 
364 		if (cp->af == p->af &&
365 		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
366 		    /* protocol should only be IPPROTO_IP if
367 		     * p->vaddr is a fwmark */
368 		    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
369 				     p->af, p->vaddr, &cp->vaddr) &&
370 		    p->cport == cp->cport && p->vport == cp->vport &&
371 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
372 		    p->protocol == cp->protocol)
373 			goto out;
374 	}
375 	cp = NULL;
376 
377   out:
378 	if (cp)
379 		atomic_inc(&cp->refcnt);
380 	ct_read_unlock(hash);
381 
382 	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
383 		      ip_vs_proto_name(p->protocol),
384 		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
385 		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
386 		      cp ? "hit" : "not hit");
387 
388 	return cp;
389 }
390 
391 /* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
392  * Called for pkts coming from inside-to-OUTside.
393  *	p->caddr, p->cport: pkt source address (inside host)
394  *	p->vaddr, p->vport: pkt dest address (foreign host) */
ip_vs_conn_out_get(const struct ip_vs_conn_param * p)395 struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
396 {
397 	unsigned hash;
398 	struct ip_vs_conn *cp, *ret=NULL;
399 	struct hlist_node *n;
400 
401 	/*
402 	 *	Check for "full" addressed entries
403 	 */
404 	hash = ip_vs_conn_hashkey_param(p, true);
405 
406 	ct_read_lock(hash);
407 
408 	hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
409 		if (cp->af == p->af &&
410 		    p->vport == cp->cport && p->cport == cp->dport &&
411 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
412 		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
413 		    p->protocol == cp->protocol &&
414 		    ip_vs_conn_net_eq(cp, p->net)) {
415 			/* HIT */
416 			atomic_inc(&cp->refcnt);
417 			ret = cp;
418 			break;
419 		}
420 	}
421 
422 	ct_read_unlock(hash);
423 
424 	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
425 		      ip_vs_proto_name(p->protocol),
426 		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
427 		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
428 		      ret ? "hit" : "not hit");
429 
430 	return ret;
431 }
432 
433 struct ip_vs_conn *
ip_vs_conn_out_get_proto(int af,const struct sk_buff * skb,const struct ip_vs_iphdr * iph,unsigned int proto_off,int inverse)434 ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
435 			 const struct ip_vs_iphdr *iph,
436 			 unsigned int proto_off, int inverse)
437 {
438 	struct ip_vs_conn_param p;
439 
440 	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
441 		return NULL;
442 
443 	return ip_vs_conn_out_get(&p);
444 }
445 EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
446 
447 /*
448  *      Put back the conn and restart its timer with its timeout
449  */
ip_vs_conn_put(struct ip_vs_conn * cp)450 void ip_vs_conn_put(struct ip_vs_conn *cp)
451 {
452 	unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
453 		0 : cp->timeout;
454 	mod_timer(&cp->timer, jiffies+t);
455 
456 	__ip_vs_conn_put(cp);
457 }
458 
459 
460 /*
461  *	Fill a no_client_port connection with a client port number
462  */
ip_vs_conn_fill_cport(struct ip_vs_conn * cp,__be16 cport)463 void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
464 {
465 	if (ip_vs_conn_unhash(cp)) {
466 		spin_lock(&cp->lock);
467 		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
468 			atomic_dec(&ip_vs_conn_no_cport_cnt);
469 			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
470 			cp->cport = cport;
471 		}
472 		spin_unlock(&cp->lock);
473 
474 		/* hash on new dport */
475 		ip_vs_conn_hash(cp);
476 	}
477 }
478 
479 
480 /*
481  *	Bind a connection entry with the corresponding packet_xmit.
482  *	Called by ip_vs_conn_new.
483  */
ip_vs_bind_xmit(struct ip_vs_conn * cp)484 static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
485 {
486 	switch (IP_VS_FWD_METHOD(cp)) {
487 	case IP_VS_CONN_F_MASQ:
488 		cp->packet_xmit = ip_vs_nat_xmit;
489 		break;
490 
491 	case IP_VS_CONN_F_TUNNEL:
492 		cp->packet_xmit = ip_vs_tunnel_xmit;
493 		break;
494 
495 	case IP_VS_CONN_F_DROUTE:
496 		cp->packet_xmit = ip_vs_dr_xmit;
497 		break;
498 
499 	case IP_VS_CONN_F_LOCALNODE:
500 		cp->packet_xmit = ip_vs_null_xmit;
501 		break;
502 
503 	case IP_VS_CONN_F_BYPASS:
504 		cp->packet_xmit = ip_vs_bypass_xmit;
505 		break;
506 	}
507 }
508 
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 counterpart of ip_vs_bind_xmit(): select the packet_xmit
 *	handler from the forwarding method.  An unlisted method leaves
 *	packet_xmit as is.
 */
static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
{
	unsigned int method = IP_VS_FWD_METHOD(cp);

	if (method == IP_VS_CONN_F_MASQ)
		cp->packet_xmit = ip_vs_nat_xmit_v6;
	else if (method == IP_VS_CONN_F_TUNNEL)
		cp->packet_xmit = ip_vs_tunnel_xmit_v6;
	else if (method == IP_VS_CONN_F_DROUTE)
		cp->packet_xmit = ip_vs_dr_xmit_v6;
	else if (method == IP_VS_CONN_F_LOCALNODE)
		cp->packet_xmit = ip_vs_null_xmit;
	else if (method == IP_VS_CONN_F_BYPASS)
		cp->packet_xmit = ip_vs_bypass_xmit_v6;
}
#endif
535 
536 
ip_vs_dest_totalconns(struct ip_vs_dest * dest)537 static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
538 {
539 	return atomic_read(&dest->activeconns)
540 		+ atomic_read(&dest->inactconns);
541 }
542 
543 /*
544  *	Bind a connection entry with a virtual service destination
545  *	Called just after a new connection entry is created.
546  */
547 static inline void
ip_vs_bind_dest(struct ip_vs_conn * cp,struct ip_vs_dest * dest)548 ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
549 {
550 	unsigned int conn_flags;
551 
552 	/* if dest is NULL, then return directly */
553 	if (!dest)
554 		return;
555 
556 	/* Increase the refcnt counter of the dest */
557 	atomic_inc(&dest->refcnt);
558 
559 	conn_flags = atomic_read(&dest->conn_flags);
560 	if (cp->protocol != IPPROTO_UDP)
561 		conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
562 	/* Bind with the destination and its corresponding transmitter */
563 	if (cp->flags & IP_VS_CONN_F_SYNC) {
564 		/* if the connection is not template and is created
565 		 * by sync, preserve the activity flag.
566 		 */
567 		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
568 			conn_flags &= ~IP_VS_CONN_F_INACTIVE;
569 		/* connections inherit forwarding method from dest */
570 		cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
571 	}
572 	cp->flags |= conn_flags;
573 	cp->dest = dest;
574 
575 	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
576 		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
577 		      "dest->refcnt:%d\n",
578 		      ip_vs_proto_name(cp->protocol),
579 		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
580 		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
581 		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
582 		      ip_vs_fwd_tag(cp), cp->state,
583 		      cp->flags, atomic_read(&cp->refcnt),
584 		      atomic_read(&dest->refcnt));
585 
586 	/* Update the connection counters */
587 	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
588 		/* It is a normal connection, so increase the inactive
589 		   connection counter because it is in TCP SYNRECV
590 		   state (inactive) or other protocol inacive state */
591 		if ((cp->flags & IP_VS_CONN_F_SYNC) &&
592 		    (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
593 			atomic_inc(&dest->activeconns);
594 		else
595 			atomic_inc(&dest->inactconns);
596 	} else {
597 		/* It is a persistent connection/template, so increase
598 		   the persistent connection counter */
599 		atomic_inc(&dest->persistconns);
600 	}
601 
602 	if (dest->u_threshold != 0 &&
603 	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
604 		dest->flags |= IP_VS_DEST_F_OVERLOAD;
605 }
606 
607 
608 /*
609  * Check if there is a destination for the connection, if so
610  * bind the connection to the destination.
611  */
ip_vs_try_bind_dest(struct ip_vs_conn * cp)612 struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
613 {
614 	struct ip_vs_dest *dest;
615 
616 	if ((cp) && (!cp->dest)) {
617 		dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
618 				       cp->dport, &cp->vaddr, cp->vport,
619 				       cp->protocol, cp->fwmark, cp->flags);
620 		ip_vs_bind_dest(cp, dest);
621 		return dest;
622 	} else
623 		return NULL;
624 }
625 
626 
627 /*
628  *	Unbind a connection entry with its VS destination
629  *	Called by the ip_vs_conn_expire function.
630  */
ip_vs_unbind_dest(struct ip_vs_conn * cp)631 static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
632 {
633 	struct ip_vs_dest *dest = cp->dest;
634 
635 	if (!dest)
636 		return;
637 
638 	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
639 		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
640 		      "dest->refcnt:%d\n",
641 		      ip_vs_proto_name(cp->protocol),
642 		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
643 		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
644 		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
645 		      ip_vs_fwd_tag(cp), cp->state,
646 		      cp->flags, atomic_read(&cp->refcnt),
647 		      atomic_read(&dest->refcnt));
648 
649 	/* Update the connection counters */
650 	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
651 		/* It is a normal connection, so decrease the inactconns
652 		   or activeconns counter */
653 		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
654 			atomic_dec(&dest->inactconns);
655 		} else {
656 			atomic_dec(&dest->activeconns);
657 		}
658 	} else {
659 		/* It is a persistent connection/template, so decrease
660 		   the persistent connection counter */
661 		atomic_dec(&dest->persistconns);
662 	}
663 
664 	if (dest->l_threshold != 0) {
665 		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
666 			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
667 	} else if (dest->u_threshold != 0) {
668 		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
669 			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
670 	} else {
671 		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
672 			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
673 	}
674 
675 	/*
676 	 * Simply decrease the refcnt of the dest, because the
677 	 * dest will be either in service's destination list
678 	 * or in the trash.
679 	 */
680 	atomic_dec(&dest->refcnt);
681 }
682 
/*
 * Returns non-zero when the expire_quiescent_template sysctl is set and
 * the weight of @dest is 0.  Always 0 without CONFIG_SYSCTL.
 */
static int expire_quiescent_template(struct netns_ipvs *ipvs,
				     struct ip_vs_dest *dest)
{
#ifdef CONFIG_SYSCTL
	if (!ipvs->sysctl_expire_quiescent_template)
		return 0;
	return atomic_read(&dest->weight) == 0;
#else
	return 0;
#endif
}
693 
694 /*
695  *	Checking if the destination of a connection template is available.
696  *	If available, return 1, otherwise invalidate this connection
697  *	template and return 0.
698  */
ip_vs_check_template(struct ip_vs_conn * ct)699 int ip_vs_check_template(struct ip_vs_conn *ct)
700 {
701 	struct ip_vs_dest *dest = ct->dest;
702 	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
703 
704 	/*
705 	 * Checking the dest server status.
706 	 */
707 	if ((dest == NULL) ||
708 	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
709 	    expire_quiescent_template(ipvs, dest)) {
710 		IP_VS_DBG_BUF(9, "check_template: dest not available for "
711 			      "protocol %s s:%s:%d v:%s:%d "
712 			      "-> d:%s:%d\n",
713 			      ip_vs_proto_name(ct->protocol),
714 			      IP_VS_DBG_ADDR(ct->af, &ct->caddr),
715 			      ntohs(ct->cport),
716 			      IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
717 			      ntohs(ct->vport),
718 			      IP_VS_DBG_ADDR(ct->af, &ct->daddr),
719 			      ntohs(ct->dport));
720 
721 		/*
722 		 * Invalidate the connection template
723 		 */
724 		if (ct->vport != htons(0xffff)) {
725 			if (ip_vs_conn_unhash(ct)) {
726 				ct->dport = htons(0xffff);
727 				ct->vport = htons(0xffff);
728 				ct->cport = 0;
729 				ip_vs_conn_hash(ct);
730 			}
731 		}
732 
733 		/*
734 		 * Simply decrease the refcnt of the template,
735 		 * don't restart its timer.
736 		 */
737 		atomic_dec(&ct->refcnt);
738 		return 0;
739 	}
740 	return 1;
741 }
742 
ip_vs_conn_expire(unsigned long data)743 static void ip_vs_conn_expire(unsigned long data)
744 {
745 	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
746 	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
747 
748 	cp->timeout = 60*HZ;
749 
750 	/*
751 	 *	hey, I'm using it
752 	 */
753 	atomic_inc(&cp->refcnt);
754 
755 	/*
756 	 *	do I control anybody?
757 	 */
758 	if (atomic_read(&cp->n_control))
759 		goto expire_later;
760 
761 	/*
762 	 *	unhash it if it is hashed in the conn table
763 	 */
764 	if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
765 		goto expire_later;
766 
767 	/*
768 	 *	refcnt==1 implies I'm the only one referrer
769 	 */
770 	if (likely(atomic_read(&cp->refcnt) == 1)) {
771 		/* delete the timer if it is activated by other users */
772 		if (timer_pending(&cp->timer))
773 			del_timer(&cp->timer);
774 
775 		/* does anybody control me? */
776 		if (cp->control)
777 			ip_vs_control_del(cp);
778 
779 		if (cp->flags & IP_VS_CONN_F_NFCT) {
780 			ip_vs_conn_drop_conntrack(cp);
781 			/* Do not access conntracks during subsys cleanup
782 			 * because nf_conntrack_find_get can not be used after
783 			 * conntrack cleanup for the net.
784 			 */
785 			smp_rmb();
786 			if (ipvs->enable)
787 				ip_vs_conn_drop_conntrack(cp);
788 		}
789 
790 		ip_vs_pe_put(cp->pe);
791 		kfree(cp->pe_data);
792 		if (unlikely(cp->app != NULL))
793 			ip_vs_unbind_app(cp);
794 		ip_vs_unbind_dest(cp);
795 		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
796 			atomic_dec(&ip_vs_conn_no_cport_cnt);
797 		atomic_dec(&ipvs->conn_count);
798 
799 		kmem_cache_free(ip_vs_conn_cachep, cp);
800 		return;
801 	}
802 
803 	/* hash it back to the table */
804 	ip_vs_conn_hash(cp);
805 
806   expire_later:
807 	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
808 		  atomic_read(&cp->refcnt)-1,
809 		  atomic_read(&cp->n_control));
810 
811 	ip_vs_conn_put(cp);
812 }
813 
814 
ip_vs_conn_expire_now(struct ip_vs_conn * cp)815 void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
816 {
817 	if (del_timer(&cp->timer))
818 		mod_timer(&cp->timer, jiffies);
819 }
820 
821 
822 /*
823  *	Create a new connection entry and hash it into the ip_vs_conn_tab
824  */
825 struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param * p,const union nf_inet_addr * daddr,__be16 dport,unsigned flags,struct ip_vs_dest * dest,__u32 fwmark)826 ip_vs_conn_new(const struct ip_vs_conn_param *p,
827 	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
828 	       struct ip_vs_dest *dest, __u32 fwmark)
829 {
830 	struct ip_vs_conn *cp;
831 	struct netns_ipvs *ipvs = net_ipvs(p->net);
832 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
833 							   p->protocol);
834 
835 	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
836 	if (cp == NULL) {
837 		IP_VS_ERR_RL("%s(): no memory\n", __func__);
838 		return NULL;
839 	}
840 
841 	INIT_HLIST_NODE(&cp->c_list);
842 	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
843 	ip_vs_conn_net_set(cp, p->net);
844 	cp->af		   = p->af;
845 	cp->protocol	   = p->protocol;
846 	ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
847 	cp->cport	   = p->cport;
848 	ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
849 	cp->vport	   = p->vport;
850 	/* proto should only be IPPROTO_IP if d_addr is a fwmark */
851 	ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
852 			&cp->daddr, daddr);
853 	cp->dport          = dport;
854 	cp->flags	   = flags;
855 	cp->fwmark         = fwmark;
856 	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
857 		ip_vs_pe_get(p->pe);
858 		cp->pe = p->pe;
859 		cp->pe_data = p->pe_data;
860 		cp->pe_data_len = p->pe_data_len;
861 	}
862 	spin_lock_init(&cp->lock);
863 
864 	/*
865 	 * Set the entry is referenced by the current thread before hashing
866 	 * it in the table, so that other thread run ip_vs_random_dropentry
867 	 * but cannot drop this entry.
868 	 */
869 	atomic_set(&cp->refcnt, 1);
870 
871 	atomic_set(&cp->n_control, 0);
872 	atomic_set(&cp->in_pkts, 0);
873 
874 	atomic_inc(&ipvs->conn_count);
875 	if (flags & IP_VS_CONN_F_NO_CPORT)
876 		atomic_inc(&ip_vs_conn_no_cport_cnt);
877 
878 	/* Bind the connection with a destination server */
879 	ip_vs_bind_dest(cp, dest);
880 
881 	/* Set its state and timeout */
882 	cp->state = 0;
883 	cp->timeout = 3*HZ;
884 
885 	/* Bind its packet transmitter */
886 #ifdef CONFIG_IP_VS_IPV6
887 	if (p->af == AF_INET6)
888 		ip_vs_bind_xmit_v6(cp);
889 	else
890 #endif
891 		ip_vs_bind_xmit(cp);
892 
893 	if (unlikely(pd && atomic_read(&pd->appcnt)))
894 		ip_vs_bind_app(cp, pd->pp);
895 
896 	/*
897 	 * Allow conntrack to be preserved. By default, conntrack
898 	 * is created and destroyed for every packet.
899 	 * Sometimes keeping conntrack can be useful for
900 	 * IP_VS_CONN_F_ONE_PACKET too.
901 	 */
902 
903 	if (ip_vs_conntrack_enabled(ipvs))
904 		cp->flags |= IP_VS_CONN_F_NFCT;
905 
906 	/* Hash it in the ip_vs_conn_tab finally */
907 	ip_vs_conn_hash(cp);
908 
909 	return cp;
910 }
911 
912 /*
913  *	/proc/net/ip_vs_conn entries
914  */
915 #ifdef CONFIG_PROC_FS
916 struct ip_vs_iter_state {
917 	struct seq_net_private	p;
918 	struct hlist_head	*l;
919 };
920 
/*
 * Walk the whole table and return the entry at position @pos (counted
 * from 0 across all buckets), or NULL if there are fewer entries.
 * On a hit the bucket's read lock is kept and the bucket is recorded
 * in iter->l.
 */
static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
{
	struct ip_vs_iter_state *iter = seq->private;
	struct ip_vs_conn *cp;
	struct hlist_node *node;
	int bucket = 0;

	while (bucket < ip_vs_conn_tab_size) {
		ct_read_lock_bh(bucket);
		hlist_for_each_entry(cp, node, &ip_vs_conn_tab[bucket], c_list) {
			if (pos == 0) {
				/* returning with the bucket still locked */
				iter->l = &ip_vs_conn_tab[bucket];
				return cp;
			}
			pos--;
		}
		ct_read_unlock_bh(bucket);
		bucket++;
	}

	return NULL;
}
941 
/*
 * seq_file start: position 0 yields SEQ_START_TOKEN, any other
 * position the entry at index *pos - 1.
 */
static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct ip_vs_iter_state *iter = seq->private;

	iter->l = NULL;
	if (*pos == 0)
		return SEQ_START_TOKEN;

	return ip_vs_conn_array(seq, *pos - 1);
}
949 
/*
 * seq_file next: advance *pos and return the entry following @v, or
 * NULL at the end of the table.  While staying within one hash chain
 * the bucket lock is kept; when the chain ends it is released and the
 * lock of the next non-empty bucket is taken instead.
 */
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_iter_state *iter = seq->private;
	struct hlist_head *cur_bucket = iter->l;
	struct ip_vs_conn *cp = v;
	struct hlist_node *node;
	int idx;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_conn_array(seq, 0);

	/* more on same hash chain? */
	node = cp->c_list.next;
	if (node)
		return hlist_entry(node, struct ip_vs_conn, c_list);

	/* chain finished: give up its lock before moving on */
	idx = cur_bucket - ip_vs_conn_tab;
	ct_read_unlock_bh(idx);

	for (idx++; idx < ip_vs_conn_tab_size; idx++) {
		ct_read_lock_bh(idx);
		hlist_for_each_entry(cp, node, &ip_vs_conn_tab[idx], c_list) {
			/* first entry of the bucket; its lock stays held */
			iter->l = &ip_vs_conn_tab[idx];
			return cp;
		}
		ct_read_unlock_bh(idx);
	}

	iter->l = NULL;
	return NULL;
}
980 
ip_vs_conn_seq_stop(struct seq_file * seq,void * v)981 static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
982 {
983 	struct ip_vs_iter_state *iter = seq->private;
984 	struct hlist_head *l = iter->l;
985 
986 	if (l)
987 		ct_read_unlock_bh(l - ip_vs_conn_tab);
988 }
989 
ip_vs_conn_seq_show(struct seq_file * seq,void * v)990 static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
991 {
992 
993 	if (v == SEQ_START_TOKEN)
994 		seq_puts(seq,
995    "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
996 	else {
997 		const struct ip_vs_conn *cp = v;
998 		struct net *net = seq_file_net(seq);
999 		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
1000 		size_t len = 0;
1001 
1002 		if (!ip_vs_conn_net_eq(cp, net))
1003 			return 0;
1004 		if (cp->pe_data) {
1005 			pe_data[0] = ' ';
1006 			len = strlen(cp->pe->name);
1007 			memcpy(pe_data + 1, cp->pe->name, len);
1008 			pe_data[len + 1] = ' ';
1009 			len += 2;
1010 			len += cp->pe->show_pe_data(cp, pe_data + len);
1011 		}
1012 		pe_data[len] = '\0';
1013 
1014 #ifdef CONFIG_IP_VS_IPV6
1015 		if (cp->af == AF_INET6)
1016 			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
1017 				"%pI6 %04X %-11s %7lu%s\n",
1018 				ip_vs_proto_name(cp->protocol),
1019 				&cp->caddr.in6, ntohs(cp->cport),
1020 				&cp->vaddr.in6, ntohs(cp->vport),
1021 				&cp->daddr.in6, ntohs(cp->dport),
1022 				ip_vs_state_name(cp->protocol, cp->state),
1023 				(cp->timer.expires-jiffies)/HZ, pe_data);
1024 		else
1025 #endif
1026 			seq_printf(seq,
1027 				"%-3s %08X %04X %08X %04X"
1028 				" %08X %04X %-11s %7lu%s\n",
1029 				ip_vs_proto_name(cp->protocol),
1030 				ntohl(cp->caddr.ip), ntohs(cp->cport),
1031 				ntohl(cp->vaddr.ip), ntohs(cp->vport),
1032 				ntohl(cp->daddr.ip), ntohs(cp->dport),
1033 				ip_vs_state_name(cp->protocol, cp->state),
1034 				(cp->timer.expires-jiffies)/HZ, pe_data);
1035 	}
1036 	return 0;
1037 }
1038 
1039 static const struct seq_operations ip_vs_conn_seq_ops = {
1040 	.start = ip_vs_conn_seq_start,
1041 	.next  = ip_vs_conn_seq_next,
1042 	.stop  = ip_vs_conn_seq_stop,
1043 	.show  = ip_vs_conn_seq_show,
1044 };
1045 
ip_vs_conn_open(struct inode * inode,struct file * file)1046 static int ip_vs_conn_open(struct inode *inode, struct file *file)
1047 {
1048 	return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
1049 			    sizeof(struct ip_vs_iter_state));
1050 }
1051 
1052 static const struct file_operations ip_vs_conn_fops = {
1053 	.owner	 = THIS_MODULE,
1054 	.open    = ip_vs_conn_open,
1055 	.read    = seq_read,
1056 	.llseek  = seq_lseek,
1057 	.release = seq_release_net,
1058 };
1059 
ip_vs_origin_name(unsigned flags)1060 static const char *ip_vs_origin_name(unsigned flags)
1061 {
1062 	if (flags & IP_VS_CONN_F_SYNC)
1063 		return "SYNC";
1064 	else
1065 		return "LOCAL";
1066 }
1067 
ip_vs_conn_sync_seq_show(struct seq_file * seq,void * v)1068 static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
1069 {
1070 
1071 	if (v == SEQ_START_TOKEN)
1072 		seq_puts(seq,
1073    "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
1074 	else {
1075 		const struct ip_vs_conn *cp = v;
1076 		struct net *net = seq_file_net(seq);
1077 
1078 		if (!ip_vs_conn_net_eq(cp, net))
1079 			return 0;
1080 
1081 #ifdef CONFIG_IP_VS_IPV6
1082 		if (cp->af == AF_INET6)
1083 			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n",
1084 				ip_vs_proto_name(cp->protocol),
1085 				&cp->caddr.in6, ntohs(cp->cport),
1086 				&cp->vaddr.in6, ntohs(cp->vport),
1087 				&cp->daddr.in6, ntohs(cp->dport),
1088 				ip_vs_state_name(cp->protocol, cp->state),
1089 				ip_vs_origin_name(cp->flags),
1090 				(cp->timer.expires-jiffies)/HZ);
1091 		else
1092 #endif
1093 			seq_printf(seq,
1094 				"%-3s %08X %04X %08X %04X "
1095 				"%08X %04X %-11s %-6s %7lu\n",
1096 				ip_vs_proto_name(cp->protocol),
1097 				ntohl(cp->caddr.ip), ntohs(cp->cport),
1098 				ntohl(cp->vaddr.ip), ntohs(cp->vport),
1099 				ntohl(cp->daddr.ip), ntohs(cp->dport),
1100 				ip_vs_state_name(cp->protocol, cp->state),
1101 				ip_vs_origin_name(cp->flags),
1102 				(cp->timer.expires-jiffies)/HZ);
1103 	}
1104 	return 0;
1105 }
1106 
1107 static const struct seq_operations ip_vs_conn_sync_seq_ops = {
1108 	.start = ip_vs_conn_seq_start,
1109 	.next  = ip_vs_conn_seq_next,
1110 	.stop  = ip_vs_conn_seq_stop,
1111 	.show  = ip_vs_conn_sync_seq_show,
1112 };
1113 
ip_vs_conn_sync_open(struct inode * inode,struct file * file)1114 static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
1115 {
1116 	return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
1117 			    sizeof(struct ip_vs_iter_state));
1118 }
1119 
1120 static const struct file_operations ip_vs_conn_sync_fops = {
1121 	.owner	 = THIS_MODULE,
1122 	.open    = ip_vs_conn_sync_open,
1123 	.read    = seq_read,
1124 	.llseek  = seq_lseek,
1125 	.release = seq_release_net,
1126 };
1127 
1128 #endif
1129 
1130 
1131 /*
1132  *      Randomly drop connection entries before running out of memory
1133  */
todrop_entry(struct ip_vs_conn * cp)1134 static inline int todrop_entry(struct ip_vs_conn *cp)
1135 {
1136 	/*
1137 	 * The drop rate array needs tuning for real environments.
1138 	 * Called from timer bh only => no locking
1139 	 */
1140 	static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
1141 	static char todrop_counter[9] = {0};
1142 	int i;
1143 
1144 	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
1145 	   This will leave enough time for normal connection to get
1146 	   through. */
1147 	if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
1148 		return 0;
1149 
1150 	/* Don't drop the entry if its number of incoming packets is not
1151 	   located in [0, 8] */
1152 	i = atomic_read(&cp->in_pkts);
1153 	if (i > 8 || i < 0) return 0;
1154 
1155 	if (!todrop_rate[i]) return 0;
1156 	if (--todrop_counter[i] > 0) return 0;
1157 
1158 	todrop_counter[i] = todrop_rate[i];
1159 	return 1;
1160 }
1161 
1162 /* Called from keventd and must protect itself from softirqs */
ip_vs_random_dropentry(struct net * net)1163 void ip_vs_random_dropentry(struct net *net)
1164 {
1165 	int idx;
1166 	struct ip_vs_conn *cp;
1167 
1168 	/*
1169 	 * Randomly scan 1/32 of the whole table every second
1170 	 */
1171 	for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
1172 		unsigned hash = net_random() & ip_vs_conn_tab_mask;
1173 		struct hlist_node *n;
1174 
1175 		/*
1176 		 *  Lock is actually needed in this loop.
1177 		 */
1178 		ct_write_lock_bh(hash);
1179 
1180 		hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
1181 			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
1182 				/* connection template */
1183 				continue;
1184 			if (!ip_vs_conn_net_eq(cp, net))
1185 				continue;
1186 			if (cp->protocol == IPPROTO_TCP) {
1187 				switch(cp->state) {
1188 				case IP_VS_TCP_S_SYN_RECV:
1189 				case IP_VS_TCP_S_SYNACK:
1190 					break;
1191 
1192 				case IP_VS_TCP_S_ESTABLISHED:
1193 					if (todrop_entry(cp))
1194 						break;
1195 					continue;
1196 
1197 				default:
1198 					continue;
1199 				}
1200 			} else {
1201 				if (!todrop_entry(cp))
1202 					continue;
1203 			}
1204 
1205 			IP_VS_DBG(4, "del connection\n");
1206 			ip_vs_conn_expire_now(cp);
1207 			if (cp->control) {
1208 				IP_VS_DBG(4, "del conn template\n");
1209 				ip_vs_conn_expire_now(cp->control);
1210 			}
1211 		}
1212 		ct_write_unlock_bh(hash);
1213 	}
1214 }
1215 
1216 
1217 /*
1218  *      Flush all the connection entries in the ip_vs_conn_tab
1219  */
ip_vs_conn_flush(struct net * net)1220 static void ip_vs_conn_flush(struct net *net)
1221 {
1222 	int idx;
1223 	struct ip_vs_conn *cp;
1224 	struct netns_ipvs *ipvs = net_ipvs(net);
1225 
1226 flush_again:
1227 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
1228 		struct hlist_node *n;
1229 
1230 		/*
1231 		 *  Lock is actually needed in this loop.
1232 		 */
1233 		ct_write_lock_bh(idx);
1234 
1235 		hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
1236 			if (!ip_vs_conn_net_eq(cp, net))
1237 				continue;
1238 			IP_VS_DBG(4, "del connection\n");
1239 			ip_vs_conn_expire_now(cp);
1240 			if (cp->control) {
1241 				IP_VS_DBG(4, "del conn template\n");
1242 				ip_vs_conn_expire_now(cp->control);
1243 			}
1244 		}
1245 		ct_write_unlock_bh(idx);
1246 	}
1247 
1248 	/* the counter may be not NULL, because maybe some conn entries
1249 	   are run by slow timer handler or unhashed but still referred */
1250 	if (atomic_read(&ipvs->conn_count) != 0) {
1251 		schedule();
1252 		goto flush_again;
1253 	}
1254 }
1255 /*
1256  * per netns init and exit
1257  */
ip_vs_conn_net_init(struct net * net)1258 int __net_init ip_vs_conn_net_init(struct net *net)
1259 {
1260 	struct netns_ipvs *ipvs = net_ipvs(net);
1261 
1262 	atomic_set(&ipvs->conn_count, 0);
1263 
1264 	proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
1265 	proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
1266 	return 0;
1267 }
1268 
ip_vs_conn_net_cleanup(struct net * net)1269 void __net_exit ip_vs_conn_net_cleanup(struct net *net)
1270 {
1271 	/* flush all the connection entries first */
1272 	ip_vs_conn_flush(net);
1273 	proc_net_remove(net, "ip_vs_conn");
1274 	proc_net_remove(net, "ip_vs_conn_sync");
1275 }
1276 
ip_vs_conn_init(void)1277 int __init ip_vs_conn_init(void)
1278 {
1279 	int idx;
1280 
1281 	/* Compute size and mask */
1282 	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
1283 	ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
1284 
1285 	/*
1286 	 * Allocate the connection hash table and initialize its list heads
1287 	 */
1288 	ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
1289 	if (!ip_vs_conn_tab)
1290 		return -ENOMEM;
1291 
1292 	/* Allocate ip_vs_conn slab cache */
1293 	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
1294 					      sizeof(struct ip_vs_conn), 0,
1295 					      SLAB_HWCACHE_ALIGN, NULL);
1296 	if (!ip_vs_conn_cachep) {
1297 		vfree(ip_vs_conn_tab);
1298 		return -ENOMEM;
1299 	}
1300 
1301 	pr_info("Connection hash table configured "
1302 		"(size=%d, memory=%ldKbytes)\n",
1303 		ip_vs_conn_tab_size,
1304 		(long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
1305 	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
1306 		  sizeof(struct ip_vs_conn));
1307 
1308 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
1309 		INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
1310 
1311 	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
1312 		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
1313 	}
1314 
1315 	/* calculate the random value for connection hash */
1316 	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
1317 
1318 	return 0;
1319 }
1320 
ip_vs_conn_cleanup(void)1321 void ip_vs_conn_cleanup(void)
1322 {
1323 	/* Release the empty cache */
1324 	kmem_cache_destroy(ip_vs_conn_cachep);
1325 	vfree(ip_vs_conn_tab);
1326 }
1327