1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version 1,   is capable of handling both version 0 and 1 messages.
9  *              Version 0 is the plain old format.
10  *              Note Version 0 receivers will just drop Ver 1 messages.
 *              Version 1 is capable of handling IPv6, Persistence data,
12  *              time-outs, and firewall marks.
13  *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
14  *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
15  *
16  * Definitions  Message: is a complete datagram
17  *              Sync_conn: is a part of a Message
18  *              Param Data is an option to a Sync_conn.
19  *
20  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
21  *
22  * ip_vs_sync:  sync connection info from master load balancer to backups
23  *              through multicast
24  *
25  * Changes:
26  *	Alexandre Cassen	:	Added master & backup support at a time.
27  *	Alexandre Cassen	:	Added SyncID support for incoming sync
28  *					messages filtering.
29  *	Justin Ossevoort	:	Fix endian problem on sync message size.
30  *	Hans Schillstrom	:	Added Version 1: i.e. IPv6,
31  *					Persistence support, fwmark and time-out.
32  */
33 
34 #define KMSG_COMPONENT "IPVS"
35 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
36 
37 #include <linux/module.h>
38 #include <linux/slab.h>
39 #include <linux/inetdevice.h>
40 #include <linux/net.h>
41 #include <linux/completion.h>
42 #include <linux/delay.h>
43 #include <linux/skbuff.h>
44 #include <linux/in.h>
45 #include <linux/igmp.h>                 /* for ip_mc_join_group */
46 #include <linux/udp.h>
47 #include <linux/err.h>
48 #include <linux/kthread.h>
49 #include <linux/wait.h>
50 #include <linux/kernel.h>
51 
52 #include <asm/unaligned.h>		/* Used for ntoh_seq and hton_seq */
53 
54 #include <net/ip.h>
55 #include <net/sock.h>
56 
57 #include <net/ip_vs.h>
58 
59 #define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
60 #define IP_VS_SYNC_PORT  8848          /* multicast port */
61 
62 #define SYNC_PROTO_VER  1		/* Protocol version in header */
63 
64 /*
65  *	IPVS sync connection entry
66  *	Version 0, i.e. original version.
67  */
68 struct ip_vs_sync_conn_v0 {
69 	__u8			reserved;
70 
71 	/* Protocol, addresses and port numbers */
72 	__u8			protocol;       /* Which protocol (TCP/UDP) */
73 	__be16			cport;
74 	__be16                  vport;
75 	__be16                  dport;
76 	__be32                  caddr;          /* client address */
77 	__be32                  vaddr;          /* virtual address */
78 	__be32                  daddr;          /* destination address */
79 
80 	/* Flags and state transition */
81 	__be16                  flags;          /* status flags */
82 	__be16                  state;          /* state info */
83 
84 	/* The sequence options start here */
85 };
86 
87 struct ip_vs_sync_conn_options {
88 	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
89 	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
90 };
91 
92 /*
93      Sync Connection format (sync_conn)
94 
95        0                   1                   2                   3
96        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
97       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
98       |    Type       |    Protocol   | Ver.  |        Size           |
99       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100       |                             Flags                             |
101       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
102       |            State              |         cport                 |
103       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104       |            vport              |         dport                 |
105       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106       |                             fwmark                            |
107       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
108       |                             timeout  (in sec.)                |
109       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
110       |                              ...                              |
111       |                        IP-Addresses  (v4 or v6)               |
112       |                              ...                              |
113       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
114   Optional Parameters.
115       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
116       | Param. Type    | Param. Length |   Param. data                |
117       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               |
118       |                              ...                              |
119       |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
120       |                               | Param Type    | Param. Length |
121       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
122       |                           Param  data                         |
123       |         Last Param data should be padded for 32 bit alignment |
124       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
125 */
126 
127 /*
128  *  Type 0, IPv4 sync connection format
129  */
130 struct ip_vs_sync_v4 {
131 	__u8			type;
132 	__u8			protocol;	/* Which protocol (TCP/UDP) */
133 	__be16			ver_size;	/* Version msb 4 bits */
134 	/* Flags and state transition */
135 	__be32			flags;		/* status flags */
136 	__be16			state;		/* state info 	*/
137 	/* Protocol, addresses and port numbers */
138 	__be16			cport;
139 	__be16			vport;
140 	__be16			dport;
141 	__be32			fwmark;		/* Firewall mark from skb */
142 	__be32			timeout;	/* cp timeout */
143 	__be32			caddr;		/* client address */
144 	__be32			vaddr;		/* virtual address */
145 	__be32			daddr;		/* destination address */
146 	/* The sequence options start here */
147 	/* PE data padded to 32bit alignment after seq. options */
148 };
149 /*
150  * Type 2 messages IPv6
151  */
152 struct ip_vs_sync_v6 {
153 	__u8			type;
154 	__u8			protocol;	/* Which protocol (TCP/UDP) */
155 	__be16			ver_size;	/* Version msb 4 bits */
156 	/* Flags and state transition */
157 	__be32			flags;		/* status flags */
158 	__be16			state;		/* state info 	*/
159 	/* Protocol, addresses and port numbers */
160 	__be16			cport;
161 	__be16			vport;
162 	__be16			dport;
163 	__be32			fwmark;		/* Firewall mark from skb */
164 	__be32			timeout;	/* cp timeout */
165 	struct in6_addr		caddr;		/* client address */
166 	struct in6_addr		vaddr;		/* virtual address */
167 	struct in6_addr		daddr;		/* destination address */
168 	/* The sequence options start here */
169 	/* PE data padded to 32bit alignment after seq. options */
170 };
171 
/*
 * One Version 1 sync connection, viewed either as the IPv4 or as the IPv6
 * layout.  Both layouts share their leading fields, so code in this file
 * accesses type/protocol/ports through either member before it knows the
 * address family.
 */
union ip_vs_sync_conn {
	struct ip_vs_sync_v4	v4;
	struct ip_vs_sync_v6	v6;
};
176 
/* Bits in Type field in above */
#define STYPE_INET6		0	/* bit number of the IPv6 flag */
#define STYPE_F_INET6		(1 << STYPE_INET6)	/* mask of the IPv6 flag */

/* ver_size layout: version in the top 4 bits, size in the low 12 bits */
#define SVER_SHIFT		12		/* Shift to get version */
#define SVER_MASK		0x0fff		/* Mask to strip version */

/* "Param. Type" codes of the optional parameters following a sync_conn */
#define IPVS_OPT_SEQ_DATA	1
#define IPVS_OPT_PE_DATA	2
#define IPVS_OPT_PE_NAME	3
#define IPVS_OPT_PARAM		7

/* One flag bit per parameter type (bit number = type code - 1) */
#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1))
#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1))
#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1))
#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))
193 
/*
 * Bundles a network namespace, a socket and a buffer pointer.
 * NOTE(review): presumably the argument handed to the sync threads; its
 * users are outside this part of the file - confirm.
 */
struct ip_vs_sync_thread_data {
	struct net *net;
	struct socket *sock;
	char *buf;
};
199 
/* Version 0 definition of packet sizes */
/* SIMPLE: a v0 entry alone; FULL: a v0 entry plus its sequence options */
#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE  \
(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
204 
205 
206 /*
  The master multicasts messages (Datagrams) to the backup load balancers
208   in the following format.
209 
210  Version 1:
211   Note, first byte should be Zero, so ver 0 receivers will drop the packet.
212 
213        0                   1                   2                   3
214        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
215       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
216       |      0        |    SyncID     |            Size               |
217       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
218       |  Count Conns  |    Version    |    Reserved, set to Zero      |
219       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
220       |                                                               |
221       |                    IPVS Sync Connection (1)                   |
222       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
223       |                            .                                  |
224       ~                            .                                  ~
225       |                            .                                  |
226       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
227       |                                                               |
228       |                    IPVS Sync Connection (n)                   |
229       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
230 
231  Version 0 Header
232        0                   1                   2                   3
233        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
234       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
235       |  Count Conns  |    SyncID     |            Size               |
236       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
237       |                    IPVS Sync Connection (1)                   |
238 */
239 
/* NOTE(review): 4 equals the size of the Version 0 header; users are not visible here - confirm */
#define SYNC_MESG_HEADER_LEN	4
#define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
242 
243 /* Version 0 header */
244 struct ip_vs_sync_mesg_v0 {
245 	__u8                    nr_conns;
246 	__u8                    syncid;
247 	__u16                   size;
248 
249 	/* ip_vs_sync_conn entries start here */
250 };
251 
252 /* Version 1 header */
253 struct ip_vs_sync_mesg {
254 	__u8			reserved;	/* must be zero */
255 	__u8			syncid;
256 	__u16			size;
257 	__u8			nr_conns;
258 	__s8			version;	/* SYNC_PROTO_VER  */
259 	__u16			spare;
260 	/* ip_vs_sync_conn entries start here */
261 };
262 
263 struct ip_vs_sync_buff {
264 	struct list_head        list;
265 	unsigned long           firstuse;
266 
267 	/* pointers for the message data */
268 	struct ip_vs_sync_mesg  *mesg;
269 	unsigned char           *head;
270 	unsigned char           *end;
271 };
272 
273 /* multicast addr */
274 static struct sockaddr_in mcast_addr = {
275 	.sin_family		= AF_INET,
276 	.sin_port		= cpu_to_be16(IP_VS_SYNC_PORT),
277 	.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP),
278 };
279 
280 /*
281  * Copy of struct ip_vs_seq
282  * From unaligned network order to aligned host order
283  */
ntoh_seq(struct ip_vs_seq * no,struct ip_vs_seq * ho)284 static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
285 {
286 	ho->init_seq       = get_unaligned_be32(&no->init_seq);
287 	ho->delta          = get_unaligned_be32(&no->delta);
288 	ho->previous_delta = get_unaligned_be32(&no->previous_delta);
289 }
290 
291 /*
292  * Copy of struct ip_vs_seq
293  * From Aligned host order to unaligned network order
294  */
hton_seq(struct ip_vs_seq * ho,struct ip_vs_seq * no)295 static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
296 {
297 	put_unaligned_be32(ho->init_seq, &no->init_seq);
298 	put_unaligned_be32(ho->delta, &no->delta);
299 	put_unaligned_be32(ho->previous_delta, &no->previous_delta);
300 }
301 
sb_dequeue(struct netns_ipvs * ipvs)302 static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
303 {
304 	struct ip_vs_sync_buff *sb;
305 
306 	spin_lock_bh(&ipvs->sync_lock);
307 	if (list_empty(&ipvs->sync_queue)) {
308 		sb = NULL;
309 	} else {
310 		sb = list_entry(ipvs->sync_queue.next,
311 				struct ip_vs_sync_buff,
312 				list);
313 		list_del(&sb->list);
314 	}
315 	spin_unlock_bh(&ipvs->sync_lock);
316 
317 	return sb;
318 }
319 
320 /*
321  * Create a new sync buffer for Version 1 proto.
322  */
323 static inline struct ip_vs_sync_buff *
ip_vs_sync_buff_create(struct netns_ipvs * ipvs)324 ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
325 {
326 	struct ip_vs_sync_buff *sb;
327 
328 	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
329 		return NULL;
330 
331 	sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
332 	if (!sb->mesg) {
333 		kfree(sb);
334 		return NULL;
335 	}
336 	sb->mesg->reserved = 0;  /* old nr_conns i.e. must be zeo now */
337 	sb->mesg->version = SYNC_PROTO_VER;
338 	sb->mesg->syncid = ipvs->master_syncid;
339 	sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
340 	sb->mesg->nr_conns = 0;
341 	sb->mesg->spare = 0;
342 	sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
343 	sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
344 
345 	sb->firstuse = jiffies;
346 	return sb;
347 }
348 
ip_vs_sync_buff_release(struct ip_vs_sync_buff * sb)349 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
350 {
351 	kfree(sb->mesg);
352 	kfree(sb);
353 }
354 
sb_queue_tail(struct netns_ipvs * ipvs)355 static inline void sb_queue_tail(struct netns_ipvs *ipvs)
356 {
357 	struct ip_vs_sync_buff *sb = ipvs->sync_buff;
358 
359 	spin_lock(&ipvs->sync_lock);
360 	if (ipvs->sync_state & IP_VS_STATE_MASTER)
361 		list_add_tail(&sb->list, &ipvs->sync_queue);
362 	else
363 		ip_vs_sync_buff_release(sb);
364 	spin_unlock(&ipvs->sync_lock);
365 }
366 
367 /*
368  *	Get the current sync buffer if it has been created for more
369  *	than the specified time or the specified time is zero.
370  */
371 static inline struct ip_vs_sync_buff *
get_curr_sync_buff(struct netns_ipvs * ipvs,unsigned long time)372 get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
373 {
374 	struct ip_vs_sync_buff *sb;
375 
376 	spin_lock_bh(&ipvs->sync_buff_lock);
377 	if (ipvs->sync_buff &&
378 	    time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) {
379 		sb = ipvs->sync_buff;
380 		ipvs->sync_buff = NULL;
381 	} else
382 		sb = NULL;
383 	spin_unlock_bh(&ipvs->sync_buff_lock);
384 	return sb;
385 }
386 
387 /*
388  * Switch mode from sending version 0 or 1
389  *  - must handle sync_buf
390  */
ip_vs_sync_switch_mode(struct net * net,int mode)391 void ip_vs_sync_switch_mode(struct net *net, int mode)
392 {
393 	struct netns_ipvs *ipvs = net_ipvs(net);
394 
395 	if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
396 		return;
397 	if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff)
398 		return;
399 
400 	spin_lock_bh(&ipvs->sync_buff_lock);
401 	/* Buffer empty ? then let buf_create do the job  */
402 	if (ipvs->sync_buff->mesg->size <=  sizeof(struct ip_vs_sync_mesg)) {
403 		kfree(ipvs->sync_buff);
404 		ipvs->sync_buff = NULL;
405 	} else {
406 		spin_lock_bh(&ipvs->sync_lock);
407 		if (ipvs->sync_state & IP_VS_STATE_MASTER)
408 			list_add_tail(&ipvs->sync_buff->list,
409 				      &ipvs->sync_queue);
410 		else
411 			ip_vs_sync_buff_release(ipvs->sync_buff);
412 		spin_unlock_bh(&ipvs->sync_lock);
413 	}
414 	spin_unlock_bh(&ipvs->sync_buff_lock);
415 }
416 
417 /*
418  * Create a new sync buffer for Version 0 proto.
419  */
420 static inline struct ip_vs_sync_buff *
ip_vs_sync_buff_create_v0(struct netns_ipvs * ipvs)421 ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
422 {
423 	struct ip_vs_sync_buff *sb;
424 	struct ip_vs_sync_mesg_v0 *mesg;
425 
426 	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
427 		return NULL;
428 
429 	sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
430 	if (!sb->mesg) {
431 		kfree(sb);
432 		return NULL;
433 	}
434 	mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
435 	mesg->nr_conns = 0;
436 	mesg->syncid = ipvs->master_syncid;
437 	mesg->size = sizeof(struct ip_vs_sync_mesg_v0);
438 	sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
439 	sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
440 	sb->firstuse = jiffies;
441 	return sb;
442 }
443 
444 /*
445  *      Version 0 , could be switched in by sys_ctl.
446  *      Add an ip_vs_conn information into the current sync_buff.
447  */
ip_vs_sync_conn_v0(struct net * net,struct ip_vs_conn * cp)448 void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
449 {
450 	struct netns_ipvs *ipvs = net_ipvs(net);
451 	struct ip_vs_sync_mesg_v0 *m;
452 	struct ip_vs_sync_conn_v0 *s;
453 	int len;
454 
455 	if (unlikely(cp->af != AF_INET))
456 		return;
457 	/* Do not sync ONE PACKET */
458 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
459 		return;
460 
461 	spin_lock(&ipvs->sync_buff_lock);
462 	if (!ipvs->sync_buff) {
463 		ipvs->sync_buff =
464 			ip_vs_sync_buff_create_v0(ipvs);
465 		if (!ipvs->sync_buff) {
466 			spin_unlock(&ipvs->sync_buff_lock);
467 			pr_err("ip_vs_sync_buff_create failed.\n");
468 			return;
469 		}
470 	}
471 
472 	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
473 		SIMPLE_CONN_SIZE;
474 	m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg;
475 	s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head;
476 
477 	/* copy members */
478 	s->reserved = 0;
479 	s->protocol = cp->protocol;
480 	s->cport = cp->cport;
481 	s->vport = cp->vport;
482 	s->dport = cp->dport;
483 	s->caddr = cp->caddr.ip;
484 	s->vaddr = cp->vaddr.ip;
485 	s->daddr = cp->daddr.ip;
486 	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
487 	s->state = htons(cp->state);
488 	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
489 		struct ip_vs_sync_conn_options *opt =
490 			(struct ip_vs_sync_conn_options *)&s[1];
491 		memcpy(opt, &cp->in_seq, sizeof(*opt));
492 	}
493 
494 	m->nr_conns++;
495 	m->size += len;
496 	ipvs->sync_buff->head += len;
497 
498 	/* check if there is a space for next one */
499 	if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) {
500 		sb_queue_tail(ipvs);
501 		ipvs->sync_buff = NULL;
502 	}
503 	spin_unlock(&ipvs->sync_buff_lock);
504 
505 	/* synchronize its controller if it has */
506 	if (cp->control)
507 		ip_vs_sync_conn(net, cp->control);
508 }
509 
510 /*
511  *      Add an ip_vs_conn information into the current sync_buff.
512  *      Called by ip_vs_in.
513  *      Sending Version 1 messages
514  */
ip_vs_sync_conn(struct net * net,struct ip_vs_conn * cp)515 void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
516 {
517 	struct netns_ipvs *ipvs = net_ipvs(net);
518 	struct ip_vs_sync_mesg *m;
519 	union ip_vs_sync_conn *s;
520 	__u8 *p;
521 	unsigned int len, pe_name_len, pad;
522 
523 	/* Handle old version of the protocol */
524 	if (sysctl_sync_ver(ipvs) == 0) {
525 		ip_vs_sync_conn_v0(net, cp);
526 		return;
527 	}
528 	/* Do not sync ONE PACKET */
529 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
530 		goto control;
531 sloop:
532 	/* Sanity checks */
533 	pe_name_len = 0;
534 	if (cp->pe_data_len) {
535 		if (!cp->pe_data || !cp->dest) {
536 			IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
537 			return;
538 		}
539 		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
540 	}
541 
542 	spin_lock(&ipvs->sync_buff_lock);
543 
544 #ifdef CONFIG_IP_VS_IPV6
545 	if (cp->af == AF_INET6)
546 		len = sizeof(struct ip_vs_sync_v6);
547 	else
548 #endif
549 		len = sizeof(struct ip_vs_sync_v4);
550 
551 	if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
552 		len += sizeof(struct ip_vs_sync_conn_options) + 2;
553 
554 	if (cp->pe_data_len)
555 		len += cp->pe_data_len + 2;	/* + Param hdr field */
556 	if (pe_name_len)
557 		len += pe_name_len + 2;
558 
559 	/* check if there is a space for this one  */
560 	pad = 0;
561 	if (ipvs->sync_buff) {
562 		pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
563 		if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
564 			sb_queue_tail(ipvs);
565 			ipvs->sync_buff = NULL;
566 			pad = 0;
567 		}
568 	}
569 
570 	if (!ipvs->sync_buff) {
571 		ipvs->sync_buff = ip_vs_sync_buff_create(ipvs);
572 		if (!ipvs->sync_buff) {
573 			spin_unlock(&ipvs->sync_buff_lock);
574 			pr_err("ip_vs_sync_buff_create failed.\n");
575 			return;
576 		}
577 	}
578 
579 	m = ipvs->sync_buff->mesg;
580 	p = ipvs->sync_buff->head;
581 	ipvs->sync_buff->head += pad + len;
582 	m->size += pad + len;
583 	/* Add ev. padding from prev. sync_conn */
584 	while (pad--)
585 		*(p++) = 0;
586 
587 	s = (union ip_vs_sync_conn *)p;
588 
589 	/* Set message type  & copy members */
590 	s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
591 	s->v4.ver_size = htons(len & SVER_MASK);	/* Version 0 */
592 	s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
593 	s->v4.state = htons(cp->state);
594 	s->v4.protocol = cp->protocol;
595 	s->v4.cport = cp->cport;
596 	s->v4.vport = cp->vport;
597 	s->v4.dport = cp->dport;
598 	s->v4.fwmark = htonl(cp->fwmark);
599 	s->v4.timeout = htonl(cp->timeout / HZ);
600 	m->nr_conns++;
601 
602 #ifdef CONFIG_IP_VS_IPV6
603 	if (cp->af == AF_INET6) {
604 		p += sizeof(struct ip_vs_sync_v6);
605 		ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6);
606 		ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6);
607 		ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6);
608 	} else
609 #endif
610 	{
611 		p += sizeof(struct ip_vs_sync_v4);	/* options ptr */
612 		s->v4.caddr = cp->caddr.ip;
613 		s->v4.vaddr = cp->vaddr.ip;
614 		s->v4.daddr = cp->daddr.ip;
615 	}
616 	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
617 		*(p++) = IPVS_OPT_SEQ_DATA;
618 		*(p++) = sizeof(struct ip_vs_sync_conn_options);
619 		hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
620 		p += sizeof(struct ip_vs_seq);
621 		hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
622 		p += sizeof(struct ip_vs_seq);
623 	}
624 	/* Handle pe data */
625 	if (cp->pe_data_len && cp->pe_data) {
626 		*(p++) = IPVS_OPT_PE_DATA;
627 		*(p++) = cp->pe_data_len;
628 		memcpy(p, cp->pe_data, cp->pe_data_len);
629 		p += cp->pe_data_len;
630 		if (pe_name_len) {
631 			/* Add PE_NAME */
632 			*(p++) = IPVS_OPT_PE_NAME;
633 			*(p++) = pe_name_len;
634 			memcpy(p, cp->pe->name, pe_name_len);
635 			p += pe_name_len;
636 		}
637 	}
638 
639 	spin_unlock(&ipvs->sync_buff_lock);
640 
641 control:
642 	/* synchronize its controller if it has */
643 	cp = cp->control;
644 	if (!cp)
645 		return;
646 	/*
647 	 * Reduce sync rate for templates
648 	 * i.e only increment in_pkts for Templates.
649 	 */
650 	if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
651 		int pkts = atomic_add_return(1, &cp->in_pkts);
652 
653 		if (pkts % sysctl_sync_period(ipvs) != 1)
654 			return;
655 	}
656 	goto sloop;
657 }
658 
659 /*
660  *  fill_param used by version 1
661  */
662 static inline int
ip_vs_conn_fill_param_sync(struct net * net,int af,union ip_vs_sync_conn * sc,struct ip_vs_conn_param * p,__u8 * pe_data,unsigned int pe_data_len,__u8 * pe_name,unsigned int pe_name_len)663 ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
664 			   struct ip_vs_conn_param *p,
665 			   __u8 *pe_data, unsigned int pe_data_len,
666 			   __u8 *pe_name, unsigned int pe_name_len)
667 {
668 #ifdef CONFIG_IP_VS_IPV6
669 	if (af == AF_INET6)
670 		ip_vs_conn_fill_param(net, af, sc->v6.protocol,
671 				      (const union nf_inet_addr *)&sc->v6.caddr,
672 				      sc->v6.cport,
673 				      (const union nf_inet_addr *)&sc->v6.vaddr,
674 				      sc->v6.vport, p);
675 	else
676 #endif
677 		ip_vs_conn_fill_param(net, af, sc->v4.protocol,
678 				      (const union nf_inet_addr *)&sc->v4.caddr,
679 				      sc->v4.cport,
680 				      (const union nf_inet_addr *)&sc->v4.vaddr,
681 				      sc->v4.vport, p);
682 	/* Handle pe data */
683 	if (pe_data_len) {
684 		if (pe_name_len) {
685 			char buff[IP_VS_PENAME_MAXLEN+1];
686 
687 			memcpy(buff, pe_name, pe_name_len);
688 			buff[pe_name_len]=0;
689 			p->pe = __ip_vs_pe_getbyname(buff);
690 			if (!p->pe) {
691 				IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
692 					     buff);
693 				return 1;
694 			}
695 		} else {
696 			IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
697 			return 1;
698 		}
699 
700 		p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
701 		if (!p->pe_data) {
702 			if (p->pe->module)
703 				module_put(p->pe->module);
704 			return -ENOMEM;
705 		}
706 		p->pe_data_len = pe_data_len;
707 	}
708 	return 0;
709 }
710 
711 /*
712  *  Connection Add / Update.
713  *  Common for version 0 and 1 reception of backup sync_conns.
714  *  Param: ...
715  *         timeout is in sec.
716  */
ip_vs_proc_conn(struct net * net,struct ip_vs_conn_param * param,unsigned int flags,unsigned int state,unsigned int protocol,unsigned int type,const union nf_inet_addr * daddr,__be16 dport,unsigned long timeout,__u32 fwmark,struct ip_vs_sync_conn_options * opt)717 static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
718 			    unsigned int flags, unsigned int state,
719 			    unsigned int protocol, unsigned int type,
720 			    const union nf_inet_addr *daddr, __be16 dport,
721 			    unsigned long timeout, __u32 fwmark,
722 			    struct ip_vs_sync_conn_options *opt)
723 {
724 	struct ip_vs_dest *dest;
725 	struct ip_vs_conn *cp;
726 	struct netns_ipvs *ipvs = net_ipvs(net);
727 
728 	if (!(flags & IP_VS_CONN_F_TEMPLATE))
729 		cp = ip_vs_conn_in_get(param);
730 	else
731 		cp = ip_vs_ct_in_get(param);
732 
733 	if (cp && param->pe_data) 	/* Free pe_data */
734 		kfree(param->pe_data);
735 	if (!cp) {
736 		/*
737 		 * Find the appropriate destination for the connection.
738 		 * If it is not found the connection will remain unbound
739 		 * but still handled.
740 		 */
741 		dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
742 				       param->vport, protocol, fwmark);
743 
744 		/*  Set the approprite ativity flag */
745 		if (protocol == IPPROTO_TCP) {
746 			if (state != IP_VS_TCP_S_ESTABLISHED)
747 				flags |= IP_VS_CONN_F_INACTIVE;
748 			else
749 				flags &= ~IP_VS_CONN_F_INACTIVE;
750 		} else if (protocol == IPPROTO_SCTP) {
751 			if (state != IP_VS_SCTP_S_ESTABLISHED)
752 				flags |= IP_VS_CONN_F_INACTIVE;
753 			else
754 				flags &= ~IP_VS_CONN_F_INACTIVE;
755 		}
756 		cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
757 		if (dest)
758 			atomic_dec(&dest->refcnt);
759 		if (!cp) {
760 			if (param->pe_data)
761 				kfree(param->pe_data);
762 			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
763 			return;
764 		}
765 	} else if (!cp->dest) {
766 		dest = ip_vs_try_bind_dest(cp);
767 		if (dest)
768 			atomic_dec(&dest->refcnt);
769 	} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
770 		(cp->state != state)) {
771 		/* update active/inactive flag for the connection */
772 		dest = cp->dest;
773 		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
774 			(state != IP_VS_TCP_S_ESTABLISHED)) {
775 			atomic_dec(&dest->activeconns);
776 			atomic_inc(&dest->inactconns);
777 			cp->flags |= IP_VS_CONN_F_INACTIVE;
778 		} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
779 			(state == IP_VS_TCP_S_ESTABLISHED)) {
780 			atomic_inc(&dest->activeconns);
781 			atomic_dec(&dest->inactconns);
782 			cp->flags &= ~IP_VS_CONN_F_INACTIVE;
783 		}
784 	} else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
785 		(cp->state != state)) {
786 		dest = cp->dest;
787 		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
788 		(state != IP_VS_SCTP_S_ESTABLISHED)) {
789 			atomic_dec(&dest->activeconns);
790 			atomic_inc(&dest->inactconns);
791 			cp->flags &= ~IP_VS_CONN_F_INACTIVE;
792 		}
793 	}
794 
795 	if (opt)
796 		memcpy(&cp->in_seq, opt, sizeof(*opt));
797 	atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
798 	cp->state = state;
799 	cp->old_state = cp->state;
800 	/*
801 	 * For Ver 0 messages style
802 	 *  - Not possible to recover the right timeout for templates
803 	 *  - can not find the right fwmark
804 	 *    virtual service. If needed, we can do it for
805 	 *    non-fwmark persistent services.
806 	 * Ver 1 messages style.
807 	 *  - No problem.
808 	 */
809 	if (timeout) {
810 		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
811 			timeout = MAX_SCHEDULE_TIMEOUT / HZ;
812 		cp->timeout = timeout*HZ;
813 	} else {
814 		struct ip_vs_proto_data *pd;
815 
816 		pd = ip_vs_proto_data_get(net, protocol);
817 		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
818 			cp->timeout = pd->timeout_table[state];
819 		else
820 			cp->timeout = (3*60*HZ);
821 	}
822 	ip_vs_conn_put(cp);
823 }
824 
825 /*
826  *  Process received multicast message for Version 0
827  */
ip_vs_process_message_v0(struct net * net,const char * buffer,const size_t buflen)828 static void ip_vs_process_message_v0(struct net *net, const char *buffer,
829 				     const size_t buflen)
830 {
831 	struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
832 	struct ip_vs_sync_conn_v0 *s;
833 	struct ip_vs_sync_conn_options *opt;
834 	struct ip_vs_protocol *pp;
835 	struct ip_vs_conn_param param;
836 	char *p;
837 	int i;
838 
839 	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
840 	for (i=0; i<m->nr_conns; i++) {
841 		unsigned flags, state;
842 
843 		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
844 			IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
845 			return;
846 		}
847 		s = (struct ip_vs_sync_conn_v0 *) p;
848 		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
849 		flags &= ~IP_VS_CONN_F_HASHED;
850 		if (flags & IP_VS_CONN_F_SEQ_MASK) {
851 			opt = (struct ip_vs_sync_conn_options *)&s[1];
852 			p += FULL_CONN_SIZE;
853 			if (p > buffer+buflen) {
854 				IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
855 				return;
856 			}
857 		} else {
858 			opt = NULL;
859 			p += SIMPLE_CONN_SIZE;
860 		}
861 
862 		state = ntohs(s->state);
863 		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
864 			pp = ip_vs_proto_get(s->protocol);
865 			if (!pp) {
866 				IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
867 					s->protocol);
868 				continue;
869 			}
870 			if (state >= pp->num_states) {
871 				IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
872 					pp->name, state);
873 				continue;
874 			}
875 		} else {
876 			/* protocol in templates is not used for state/timeout */
877 			if (state > 0) {
878 				IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
879 					state);
880 				state = 0;
881 			}
882 		}
883 
884 		ip_vs_conn_fill_param(net, AF_INET, s->protocol,
885 				      (const union nf_inet_addr *)&s->caddr,
886 				      s->cport,
887 				      (const union nf_inet_addr *)&s->vaddr,
888 				      s->vport, &param);
889 
890 		/* Send timeout as Zero */
891 		ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
892 				(union nf_inet_addr *)&s->daddr, s->dport,
893 				0, 0, opt);
894 	}
895 }
896 
897 /*
898  * Handle options
899  */
ip_vs_proc_seqopt(__u8 * p,unsigned int plen,__u32 * opt_flags,struct ip_vs_sync_conn_options * opt)900 static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
901 				    __u32 *opt_flags,
902 				    struct ip_vs_sync_conn_options *opt)
903 {
904 	struct ip_vs_sync_conn_options *topt;
905 
906 	topt = (struct ip_vs_sync_conn_options *)p;
907 
908 	if (plen != sizeof(struct ip_vs_sync_conn_options)) {
909 		IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
910 		return -EINVAL;
911 	}
912 	if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
913 		IP_VS_DBG(2, "BACKUP, conn options found twice\n");
914 		return -EINVAL;
915 	}
916 	ntoh_seq(&topt->in_seq, &opt->in_seq);
917 	ntoh_seq(&topt->out_seq, &opt->out_seq);
918 	*opt_flags |= IPVS_OPT_F_SEQ_DATA;
919 	return 0;
920 }
921 
/*
 * Record the location and length of a variable-length option payload.
 *
 * No data is copied: on success *data points into the received message
 * at p and *data_len is set to plen.
 *
 * p:         start of the option payload
 * plen:      payload length taken from the option header
 * data_len:  out, receives plen
 * data:      out, receives p
 * maxlen:    largest payload length accepted for this option
 * opt_flags: bitmask of options already seen; flag is set in it on success
 * flag:      bit identifying this option in *opt_flags
 *
 * Returns 0 on success, or -EINVAL when plen exceeds maxlen or the option
 * was already seen.
 */
static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
			  __u8 **data, unsigned int maxlen,
			  __u32 *opt_flags, __u32 flag)
{
	int err = 0;

	if (plen > maxlen) {
		IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
		err = -EINVAL;
	} else if (*opt_flags & flag) {
		IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
		err = -EINVAL;
	} else {
		*opt_flags |= flag;
		*data = p;
		*data_len = plen;
	}

	return err;
}
/*
 *   Process a Version 1 sync. connection
 *
 *   p:       start of one sync_conn inside the received message
 *   msg_end: first byte after this sync_conn (options included)
 *
 *   Layout handled here: a fixed IPv4 or IPv6 part (selected by the type
 *   field) followed by zero or more options, each made of a one byte
 *   type, a one byte length and "length" bytes of data.
 *
 *   Returns 0 when the connection was handed to ip_vs_proc_conn(),
 *   a positive code when only this sync_conn is skipped (unsupported
 *   address family or protocol, invalid state, unknown mandatory option,
 *   parameter setup failure), or a negative code when the sync_conn is
 *   malformed; the caller drops the rest of the buffer on negative codes.
 */
static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
{
	struct ip_vs_sync_conn_options opt;
	union  ip_vs_sync_conn *s;
	struct ip_vs_protocol *pp;
	struct ip_vs_conn_param param;
	__u32 flags;
	unsigned int af, state, pe_data_len=0, pe_name_len=0;
	__u8 *pe_data=NULL, *pe_name=NULL;
	__u32 opt_flags=0;
	int retc=0;

	s = (union ip_vs_sync_conn *) p;

	/* Select the address family and step over the fixed-size part */
	if (s->v6.type & STYPE_F_INET6) {
#ifdef CONFIG_IP_VS_IPV6
		af = AF_INET6;
		p += sizeof(struct ip_vs_sync_v6);
#else
		IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
		retc = 10;
		goto out;
#endif
	} else if (!s->v4.type) {
		af = AF_INET;
		p += sizeof(struct ip_vs_sync_v4);
	} else {
		/* Any other type value is treated as a malformed sync_conn */
		return -10;
	}
	/* The fixed part must fit inside this sync_conn */
	if (p > msg_end)
		return -20;

	/* Process optional params check Type & Len. */
	while (p < msg_end) {
		int ptype;
		int plen;

		/* Need room for the two byte option header (type, length) */
		if (p+2 > msg_end)
			return -30;
		ptype = *(p++);
		plen  = *(p++);

		/* Empty options and options running past the end are rejected */
		if (!plen || ((p + plen) > msg_end))
			return -40;
		/* Handle seq option  p = param data */
		switch (ptype & ~IPVS_OPT_F_PARAM) {
		case IPVS_OPT_SEQ_DATA:
			if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
				return -50;
			break;

		case IPVS_OPT_PE_DATA:
			if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
					   IP_VS_PEDATA_MAXLEN, &opt_flags,
					   IPVS_OPT_F_PE_DATA))
				return -60;
			break;

		case IPVS_OPT_PE_NAME:
			if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
					   IP_VS_PENAME_MAXLEN, &opt_flags,
					   IPVS_OPT_F_PE_NAME))
				return -70;
			break;

		default:
			/* Param data mandatory ? */
			/*
			 * An unknown option is skipped when IPVS_OPT_F_PARAM
			 * is set in its type; otherwise this sync_conn is
			 * given up.
			 */
			if (!(ptype & IPVS_OPT_F_PARAM)) {
				IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
					  ptype & ~IPVS_OPT_F_PARAM);
				retc = 20;
				goto out;
			}
		}
		p += plen;  /* Next option */
	}

	/* Get flags and Mask off unsupported */
	/* flags, state and protocol are read through the v4 view for both families */
	flags  = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
	flags |= IP_VS_CONN_F_SYNC;
	state = ntohs(s->v4.state);

	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
		/* Regular connection: the protocol must be known and the state in range */
		pp = ip_vs_proto_get(s->v4.protocol);
		if (!pp) {
			IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
				s->v4.protocol);
			retc = 30;
			goto out;
		}
		if (state >= pp->num_states) {
			IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
				pp->name, state);
			retc = 40;
			goto out;
		}
	} else {
		/* protocol in templates is not used for state/timeout */
		if (state > 0) {
			IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
				state);
			state = 0;
		}
	}
	if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
				       pe_data_len, pe_name, pe_name_len)) {
		retc = 50;
		goto out;
	}
	/* If only IPv4, just silent skip IPv6 */
	/* Sequence data is passed on only when the SEQ_DATA option was present */
	if (af == AF_INET)
		ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
				(union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
				ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
				);
#ifdef CONFIG_IP_VS_IPV6
	else
		ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
				(union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
				ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
				);
#endif
	return 0;
	/* Error exit */
	/* Reached only with a positive retc: this sync_conn alone is skipped */
out:
	IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
	return retc;

}
1073 /*
1074  *      Process received multicast message and create the corresponding
1075  *      ip_vs_conn entries.
1076  *      Handles Version 0 & 1
1077  */
ip_vs_process_message(struct net * net,__u8 * buffer,const size_t buflen)1078 static void ip_vs_process_message(struct net *net, __u8 *buffer,
1079 				  const size_t buflen)
1080 {
1081 	struct netns_ipvs *ipvs = net_ipvs(net);
1082 	struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
1083 	__u8 *p, *msg_end;
1084 	int i, nr_conns;
1085 
1086 	if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
1087 		IP_VS_DBG(2, "BACKUP, message header too short\n");
1088 		return;
1089 	}
1090 	/* Convert size back to host byte order */
1091 	m2->size = ntohs(m2->size);
1092 
1093 	if (buflen != m2->size) {
1094 		IP_VS_DBG(2, "BACKUP, bogus message size\n");
1095 		return;
1096 	}
1097 	/* SyncID sanity check */
1098 	if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
1099 		IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
1100 		return;
1101 	}
1102 	/* Handle version 1  message */
1103 	if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
1104 	    && (m2->spare == 0)) {
1105 
1106 		msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
1107 		nr_conns = m2->nr_conns;
1108 
1109 		for (i=0; i<nr_conns; i++) {
1110 			union ip_vs_sync_conn *s;
1111 			unsigned size;
1112 			int retc;
1113 
1114 			p = msg_end;
1115 			if (p + sizeof(s->v4) > buffer+buflen) {
1116 				IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
1117 				return;
1118 			}
1119 			s = (union ip_vs_sync_conn *)p;
1120 			size = ntohs(s->v4.ver_size) & SVER_MASK;
1121 			msg_end = p + size;
1122 			/* Basic sanity checks */
1123 			if (msg_end  > buffer+buflen) {
1124 				IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
1125 				return;
1126 			}
1127 			if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
1128 				IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
1129 					      ntohs(s->v4.ver_size) >> SVER_SHIFT);
1130 				return;
1131 			}
1132 			/* Process a single sync_conn */
1133 			retc = ip_vs_proc_sync_conn(net, p, msg_end);
1134 			if (retc < 0) {
1135 				IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
1136 					     retc);
1137 				return;
1138 			}
1139 			/* Make sure we have 32 bit alignment */
1140 			msg_end = p + ((size + 3) & ~3);
1141 		}
1142 	} else {
1143 		/* Old type of message */
1144 		ip_vs_process_message_v0(net, buffer, buflen);
1145 		return;
1146 	}
1147 }
1148 
1149 
1150 /*
1151  *      Setup loopback of outgoing multicasts on a sending socket
1152  */
set_mcast_loop(struct sock * sk,u_char loop)1153 static void set_mcast_loop(struct sock *sk, u_char loop)
1154 {
1155 	struct inet_sock *inet = inet_sk(sk);
1156 
1157 	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
1158 	lock_sock(sk);
1159 	inet->mc_loop = loop ? 1 : 0;
1160 	release_sock(sk);
1161 }
1162 
1163 /*
1164  *      Specify TTL for outgoing multicasts on a sending socket
1165  */
set_mcast_ttl(struct sock * sk,u_char ttl)1166 static void set_mcast_ttl(struct sock *sk, u_char ttl)
1167 {
1168 	struct inet_sock *inet = inet_sk(sk);
1169 
1170 	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
1171 	lock_sock(sk);
1172 	inet->mc_ttl = ttl;
1173 	release_sock(sk);
1174 }
1175 
1176 /*
1177  *      Specifiy default interface for outgoing multicasts
1178  */
set_mcast_if(struct sock * sk,char * ifname)1179 static int set_mcast_if(struct sock *sk, char *ifname)
1180 {
1181 	struct net_device *dev;
1182 	struct inet_sock *inet = inet_sk(sk);
1183 	struct net *net = sock_net(sk);
1184 
1185 	dev = __dev_get_by_name(net, ifname);
1186 	if (!dev)
1187 		return -ENODEV;
1188 
1189 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1190 		return -EINVAL;
1191 
1192 	lock_sock(sk);
1193 	inet->mc_index = dev->ifindex;
1194 	/*  inet->mc_addr  = 0; */
1195 	release_sock(sk);
1196 
1197 	return 0;
1198 }
1199 
1200 
1201 /*
1202  *	Set the maximum length of sync message according to the
1203  *	specified interface's MTU.
1204  */
set_sync_mesg_maxlen(struct net * net,int sync_state)1205 static int set_sync_mesg_maxlen(struct net *net, int sync_state)
1206 {
1207 	struct netns_ipvs *ipvs = net_ipvs(net);
1208 	struct net_device *dev;
1209 	int num;
1210 
1211 	if (sync_state == IP_VS_STATE_MASTER) {
1212 		dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
1213 		if (!dev)
1214 			return -ENODEV;
1215 
1216 		num = (dev->mtu - sizeof(struct iphdr) -
1217 		       sizeof(struct udphdr) -
1218 		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
1219 		ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
1220 			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
1221 		IP_VS_DBG(7, "setting the maximum length of sync sending "
1222 			  "message %d.\n", ipvs->send_mesg_maxlen);
1223 	} else if (sync_state == IP_VS_STATE_BACKUP) {
1224 		dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
1225 		if (!dev)
1226 			return -ENODEV;
1227 
1228 		ipvs->recv_mesg_maxlen = dev->mtu -
1229 			sizeof(struct iphdr) - sizeof(struct udphdr);
1230 		IP_VS_DBG(7, "setting the maximum length of sync receiving "
1231 			  "message %d.\n", ipvs->recv_mesg_maxlen);
1232 	}
1233 
1234 	return 0;
1235 }
1236 
1237 
1238 /*
1239  *      Join a multicast group.
1240  *      the group is specified by a class D multicast address 224.0.0.0/8
1241  *      in the in_addr structure passed in as a parameter.
1242  */
1243 static int
join_mcast_group(struct sock * sk,struct in_addr * addr,char * ifname)1244 join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
1245 {
1246 	struct net *net = sock_net(sk);
1247 	struct ip_mreqn mreq;
1248 	struct net_device *dev;
1249 	int ret;
1250 
1251 	memset(&mreq, 0, sizeof(mreq));
1252 	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
1253 
1254 	dev = __dev_get_by_name(net, ifname);
1255 	if (!dev)
1256 		return -ENODEV;
1257 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1258 		return -EINVAL;
1259 
1260 	mreq.imr_ifindex = dev->ifindex;
1261 
1262 	lock_sock(sk);
1263 	ret = ip_mc_join_group(sk, &mreq);
1264 	release_sock(sk);
1265 
1266 	return ret;
1267 }
1268 
1269 
bind_mcastif_addr(struct socket * sock,char * ifname)1270 static int bind_mcastif_addr(struct socket *sock, char *ifname)
1271 {
1272 	struct net *net = sock_net(sock->sk);
1273 	struct net_device *dev;
1274 	__be32 addr;
1275 	struct sockaddr_in sin;
1276 
1277 	dev = __dev_get_by_name(net, ifname);
1278 	if (!dev)
1279 		return -ENODEV;
1280 
1281 	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1282 	if (!addr)
1283 		pr_err("You probably need to specify IP address on "
1284 		       "multicast interface.\n");
1285 
1286 	IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
1287 		  ifname, &addr);
1288 
1289 	/* Now bind the socket with the address of multicast interface */
1290 	sin.sin_family	     = AF_INET;
1291 	sin.sin_addr.s_addr  = addr;
1292 	sin.sin_port         = 0;
1293 
1294 	return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
1295 }
1296 
1297 /*
1298  *      Set up sending multicast socket over UDP
1299  */
make_send_sock(struct net * net)1300 static struct socket *make_send_sock(struct net *net)
1301 {
1302 	struct netns_ipvs *ipvs = net_ipvs(net);
1303 	struct socket *sock;
1304 	int result;
1305 
1306 	/* First create a socket move it to right name space later */
1307 	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
1308 	if (result < 0) {
1309 		pr_err("Error during creation of socket; terminating\n");
1310 		return ERR_PTR(result);
1311 	}
1312 	/*
1313 	 * Kernel sockets that are a part of a namespace, should not
1314 	 * hold a reference to a namespace in order to allow to stop it.
1315 	 * After sk_change_net should be released using sk_release_kernel.
1316 	 */
1317 	sk_change_net(sock->sk, net);
1318 	result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
1319 	if (result < 0) {
1320 		pr_err("Error setting outbound mcast interface\n");
1321 		goto error;
1322 	}
1323 
1324 	set_mcast_loop(sock->sk, 0);
1325 	set_mcast_ttl(sock->sk, 1);
1326 
1327 	result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
1328 	if (result < 0) {
1329 		pr_err("Error binding address of the mcast interface\n");
1330 		goto error;
1331 	}
1332 
1333 	result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
1334 			sizeof(struct sockaddr), 0);
1335 	if (result < 0) {
1336 		pr_err("Error connecting to the multicast addr\n");
1337 		goto error;
1338 	}
1339 
1340 	return sock;
1341 
1342 error:
1343 	sk_release_kernel(sock->sk);
1344 	return ERR_PTR(result);
1345 }
1346 
1347 
1348 /*
1349  *      Set up receiving multicast socket over UDP
1350  */
make_receive_sock(struct net * net)1351 static struct socket *make_receive_sock(struct net *net)
1352 {
1353 	struct netns_ipvs *ipvs = net_ipvs(net);
1354 	struct socket *sock;
1355 	int result;
1356 
1357 	/* First create a socket */
1358 	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
1359 	if (result < 0) {
1360 		pr_err("Error during creation of socket; terminating\n");
1361 		return ERR_PTR(result);
1362 	}
1363 	/*
1364 	 * Kernel sockets that are a part of a namespace, should not
1365 	 * hold a reference to a namespace in order to allow to stop it.
1366 	 * After sk_change_net should be released using sk_release_kernel.
1367 	 */
1368 	sk_change_net(sock->sk, net);
1369 	/* it is equivalent to the REUSEADDR option in user-space */
1370 	sock->sk->sk_reuse = 1;
1371 
1372 	result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
1373 			sizeof(struct sockaddr));
1374 	if (result < 0) {
1375 		pr_err("Error binding to the multicast addr\n");
1376 		goto error;
1377 	}
1378 
1379 	/* join the multicast group */
1380 	result = join_mcast_group(sock->sk,
1381 			(struct in_addr *) &mcast_addr.sin_addr,
1382 			ipvs->backup_mcast_ifn);
1383 	if (result < 0) {
1384 		pr_err("Error joining to the multicast group\n");
1385 		goto error;
1386 	}
1387 
1388 	return sock;
1389 
1390 error:
1391 	sk_release_kernel(sock->sk);
1392 	return ERR_PTR(result);
1393 }
1394 
1395 
1396 static int
ip_vs_send_async(struct socket * sock,const char * buffer,const size_t length)1397 ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
1398 {
1399 	struct msghdr	msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
1400 	struct kvec	iov;
1401 	int		len;
1402 
1403 	EnterFunction(7);
1404 	iov.iov_base     = (void *)buffer;
1405 	iov.iov_len      = length;
1406 
1407 	len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
1408 
1409 	LeaveFunction(7);
1410 	return len;
1411 }
1412 
1413 static void
ip_vs_send_sync_msg(struct socket * sock,struct ip_vs_sync_mesg * msg)1414 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
1415 {
1416 	int msize;
1417 
1418 	msize = msg->size;
1419 
1420 	/* Put size in network byte order */
1421 	msg->size = htons(msg->size);
1422 
1423 	if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
1424 		pr_err("ip_vs_send_async error\n");
1425 }
1426 
1427 static int
ip_vs_receive(struct socket * sock,char * buffer,const size_t buflen)1428 ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
1429 {
1430 	struct msghdr		msg = {NULL,};
1431 	struct kvec		iov;
1432 	int			len;
1433 
1434 	EnterFunction(7);
1435 
1436 	/* Receive a packet */
1437 	iov.iov_base     = buffer;
1438 	iov.iov_len      = (size_t)buflen;
1439 
1440 	len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
1441 
1442 	if (len < 0)
1443 		return -1;
1444 
1445 	LeaveFunction(7);
1446 	return len;
1447 }
1448 
1449 
sync_thread_master(void * data)1450 static int sync_thread_master(void *data)
1451 {
1452 	struct ip_vs_sync_thread_data *tinfo = data;
1453 	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
1454 	struct ip_vs_sync_buff *sb;
1455 
1456 	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
1457 		"syncid = %d\n",
1458 		ipvs->master_mcast_ifn, ipvs->master_syncid);
1459 
1460 	while (!kthread_should_stop()) {
1461 		while ((sb = sb_dequeue(ipvs))) {
1462 			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
1463 			ip_vs_sync_buff_release(sb);
1464 		}
1465 
1466 		/* check if entries stay in ipvs->sync_buff for 2 seconds */
1467 		sb = get_curr_sync_buff(ipvs, 2 * HZ);
1468 		if (sb) {
1469 			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
1470 			ip_vs_sync_buff_release(sb);
1471 		}
1472 
1473 		schedule_timeout_interruptible(HZ);
1474 	}
1475 
1476 	/* clean up the sync_buff queue */
1477 	while ((sb = sb_dequeue(ipvs)))
1478 		ip_vs_sync_buff_release(sb);
1479 
1480 	/* clean up the current sync_buff */
1481 	sb = get_curr_sync_buff(ipvs, 0);
1482 	if (sb)
1483 		ip_vs_sync_buff_release(sb);
1484 
1485 	/* release the sending multicast socket */
1486 	sk_release_kernel(tinfo->sock->sk);
1487 	kfree(tinfo);
1488 
1489 	return 0;
1490 }
1491 
1492 
sync_thread_backup(void * data)1493 static int sync_thread_backup(void *data)
1494 {
1495 	struct ip_vs_sync_thread_data *tinfo = data;
1496 	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
1497 	int len;
1498 
1499 	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
1500 		"syncid = %d\n",
1501 		ipvs->backup_mcast_ifn, ipvs->backup_syncid);
1502 
1503 	while (!kthread_should_stop()) {
1504 		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
1505 			 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
1506 			 || kthread_should_stop());
1507 
1508 		/* do we have data now? */
1509 		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
1510 			len = ip_vs_receive(tinfo->sock, tinfo->buf,
1511 					ipvs->recv_mesg_maxlen);
1512 			if (len <= 0) {
1513 				pr_err("receiving message error\n");
1514 				break;
1515 			}
1516 
1517 			/* disable bottom half, because it accesses the data
1518 			   shared by softirq while getting/creating conns */
1519 			local_bh_disable();
1520 			ip_vs_process_message(tinfo->net, tinfo->buf, len);
1521 			local_bh_enable();
1522 		}
1523 	}
1524 
1525 	/* release the sending multicast socket */
1526 	sk_release_kernel(tinfo->sock->sk);
1527 	kfree(tinfo->buf);
1528 	kfree(tinfo);
1529 
1530 	return 0;
1531 }
1532 
1533 
/*
 * Start the master or backup sync thread for a network namespace.
 *
 * net:       network namespace the thread works for
 * state:     IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
 * mcast_ifn: name of the multicast interface to use; copied into ipvs
 * syncid:    sync id to store for the selected role
 *
 * Returns 0 on success, or
 *   -EEXIST  a thread for this role is already running,
 *   -EINVAL  state is neither MASTER nor BACKUP,
 *   -ENOMEM  an allocation failed,
 *   or the error from socket setup, from deriving the message size limit
 *   (e.g. -ENODEV) or from kthread_run().
 *
 * On success the module use count is increased; stop_sync_thread()
 * decreases it again.
 */
int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
{
	struct ip_vs_sync_thread_data *tinfo;
	struct task_struct **realtask, *task;
	struct socket *sock;
	struct netns_ipvs *ipvs = net_ipvs(net);
	char *name, *buf = NULL;
	int (*threadfn)(void *data);
	int result = -ENOMEM;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
		  sizeof(struct ip_vs_sync_conn_v0));

	if (state == IP_VS_STATE_MASTER) {
		if (ipvs->master_thread)
			return -EEXIST;

		strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
			sizeof(ipvs->master_mcast_ifn));
		ipvs->master_syncid = syncid;
		realtask = &ipvs->master_thread;
		name = "ipvs_master:%d";
		threadfn = sync_thread_master;
		sock = make_send_sock(net);
	} else if (state == IP_VS_STATE_BACKUP) {
		if (ipvs->backup_thread)
			return -EEXIST;

		strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
			sizeof(ipvs->backup_mcast_ifn));
		ipvs->backup_syncid = syncid;
		realtask = &ipvs->backup_thread;
		name = "ipvs_backup:%d";
		threadfn = sync_thread_backup;
		sock = make_receive_sock(net);
	} else {
		return -EINVAL;
	}

	if (IS_ERR(sock)) {
		result = PTR_ERR(sock);
		goto out;
	}

	/*
	 * The message size limit decides the size of the receive buffer
	 * below, so a failure to derive it must not be ignored.
	 */
	result = set_sync_mesg_maxlen(net, state);
	if (result < 0)
		goto outsocket;

	/* The failures between here and kthread_run() are allocation failures */
	result = -ENOMEM;

	if (state == IP_VS_STATE_BACKUP) {
		buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
		if (!buf)
			goto outsocket;
	}

	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
	if (!tinfo)
		goto outbuf;

	/* From a successful kthread_run() on, the thread frees tinfo, buf and sock */
	tinfo->net = net;
	tinfo->sock = sock;
	tinfo->buf = buf;

	task = kthread_run(threadfn, tinfo, name, ipvs->gen);
	if (IS_ERR(task)) {
		result = PTR_ERR(task);
		goto outtinfo;
	}

	/* mark as active */
	*realtask = task;
	ipvs->sync_state |= state;

	/* increase the module use count */
	ip_vs_use_count_inc();

	return 0;

outtinfo:
	kfree(tinfo);
outbuf:
	kfree(buf);
outsocket:
	sk_release_kernel(sock->sk);
out:
	return result;
}
1618 
1619 
stop_sync_thread(struct net * net,int state)1620 int stop_sync_thread(struct net *net, int state)
1621 {
1622 	struct netns_ipvs *ipvs = net_ipvs(net);
1623 	int retc = -EINVAL;
1624 
1625 	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
1626 
1627 	if (state == IP_VS_STATE_MASTER) {
1628 		if (!ipvs->master_thread)
1629 			return -ESRCH;
1630 
1631 		pr_info("stopping master sync thread %d ...\n",
1632 			task_pid_nr(ipvs->master_thread));
1633 
1634 		/*
1635 		 * The lock synchronizes with sb_queue_tail(), so that we don't
1636 		 * add sync buffers to the queue, when we are already in
1637 		 * progress of stopping the master sync daemon.
1638 		 */
1639 
1640 		spin_lock_bh(&ipvs->sync_lock);
1641 		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
1642 		spin_unlock_bh(&ipvs->sync_lock);
1643 		retc = kthread_stop(ipvs->master_thread);
1644 		ipvs->master_thread = NULL;
1645 	} else if (state == IP_VS_STATE_BACKUP) {
1646 		if (!ipvs->backup_thread)
1647 			return -ESRCH;
1648 
1649 		pr_info("stopping backup sync thread %d ...\n",
1650 			task_pid_nr(ipvs->backup_thread));
1651 
1652 		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
1653 		retc = kthread_stop(ipvs->backup_thread);
1654 		ipvs->backup_thread = NULL;
1655 	}
1656 
1657 	/* decrease the module use count */
1658 	ip_vs_use_count_dec();
1659 
1660 	return retc;
1661 }
1662 
1663 /*
1664  * Initialize data struct for each netns
1665  */
__ip_vs_sync_init(struct net * net)1666 int __net_init __ip_vs_sync_init(struct net *net)
1667 {
1668 	struct netns_ipvs *ipvs = net_ipvs(net);
1669 
1670 	INIT_LIST_HEAD(&ipvs->sync_queue);
1671 	spin_lock_init(&ipvs->sync_lock);
1672 	spin_lock_init(&ipvs->sync_buff_lock);
1673 
1674 	ipvs->sync_mcast_addr.sin_family = AF_INET;
1675 	ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
1676 	ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
1677 	return 0;
1678 }
1679 
__ip_vs_sync_cleanup(struct net * net)1680 void __ip_vs_sync_cleanup(struct net *net)
1681 {
1682 	int retc;
1683 
1684 	retc = stop_sync_thread(net, IP_VS_STATE_MASTER);
1685 	if (retc && retc != -ESRCH)
1686 		pr_err("Failed to stop Master Daemon\n");
1687 
1688 	retc = stop_sync_thread(net, IP_VS_STATE_BACKUP);
1689 	if (retc && retc != -ESRCH)
1690 		pr_err("Failed to stop Backup Daemon\n");
1691 }
1692 
ip_vs_sync_init(void)1693 int __init ip_vs_sync_init(void)
1694 {
1695 	return 0;
1696 }
1697 
/*
 * Global cleanup hook of the sync code. It has nothing to undo.
 */
void ip_vs_sync_cleanup(void)
{
}
1701