1 // SPDX-License-Identifier: GPL-2.0-only
2 /* (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
4 * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
5 * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
6 */
7
8 #include <linux/types.h>
9 #include <linux/timer.h>
10 #include <linux/module.h>
11 #include <linux/in.h>
12 #include <linux/tcp.h>
13 #include <linux/spinlock.h>
14 #include <linux/skbuff.h>
15 #include <linux/ipv6.h>
16 #include <net/ip6_checksum.h>
17 #include <asm/unaligned.h>
18
19 #include <net/tcp.h>
20
21 #include <linux/netfilter.h>
22 #include <linux/netfilter_ipv4.h>
23 #include <linux/netfilter_ipv6.h>
24 #include <net/netfilter/nf_conntrack.h>
25 #include <net/netfilter/nf_conntrack_l4proto.h>
26 #include <net/netfilter/nf_conntrack_ecache.h>
27 #include <net/netfilter/nf_conntrack_seqadj.h>
28 #include <net/netfilter/nf_conntrack_synproxy.h>
29 #include <net/netfilter/nf_conntrack_timeout.h>
30 #include <net/netfilter/nf_log.h>
31 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
32 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
33
34 /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
35 closely. They're more complex. --RR */
36
/* Human-readable state names, indexed by the conntrack TCP state
 * (see the sNO..sS2 abbreviations below); used for /proc output and
 * invalid-packet log messages.  Order must match the state numbering.
 */
static const char *const tcp_conntrack_names[] = {
	"NONE",
	"SYN_SENT",
	"SYN_RECV",
	"ESTABLISHED",
	"FIN_WAIT",
	"CLOSE_WAIT",
	"LAST_ACK",
	"TIME_WAIT",
	"CLOSE",
	"SYN_SENT2",
};
49
/* Unit helpers so the table below reads naturally:
 * "2 MINS" expands to "2 * 60 * HZ", i.e. a value in jiffies.
 */
#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
#define DAYS * 24 HOURS

/* Default per-state timeouts (jiffies).  RETRANS and UNACK are extra
 * entries beyond the plain states, hence TCP_CONNTRACK_TIMEOUT_MAX.
 */
static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
	[TCP_CONNTRACK_SYN_SENT]	= 2 MINS,
	[TCP_CONNTRACK_SYN_RECV]	= 60 SECS,
	[TCP_CONNTRACK_ESTABLISHED]	= 5 DAYS,
	[TCP_CONNTRACK_FIN_WAIT]	= 2 MINS,
	[TCP_CONNTRACK_CLOSE_WAIT]	= 60 SECS,
	[TCP_CONNTRACK_LAST_ACK]	= 30 SECS,
	[TCP_CONNTRACK_TIME_WAIT]	= 2 MINS,
	[TCP_CONNTRACK_CLOSE]		= 10 SECS,
	[TCP_CONNTRACK_SYN_SENT2]	= 2 MINS,
/* RFC1122 says the R2 limit should be at least 100 seconds.
   Linux uses 15 packets as limit, which corresponds
   to ~13-30min depending on RTO. */
	[TCP_CONNTRACK_RETRANS]		= 5 MINS,
	[TCP_CONNTRACK_UNACK]		= 5 MINS,
};
71
/* Short aliases for the conntrack TCP states, used to keep the rows of
 * the transition table below readable.  sIV marks an invalid packet,
 * sIG a packet that is ignored (may be valid but we cannot tell).
 */
#define sNO TCP_CONNTRACK_NONE
#define sSS TCP_CONNTRACK_SYN_SENT
#define sSR TCP_CONNTRACK_SYN_RECV
#define sES TCP_CONNTRACK_ESTABLISHED
#define sFW TCP_CONNTRACK_FIN_WAIT
#define sCW TCP_CONNTRACK_CLOSE_WAIT
#define sLA TCP_CONNTRACK_LAST_ACK
#define sTW TCP_CONNTRACK_TIME_WAIT
#define sCL TCP_CONNTRACK_CLOSE
#define sS2 TCP_CONNTRACK_SYN_SENT2
#define sIV TCP_CONNTRACK_MAX
#define sIG TCP_CONNTRACK_IGNORE

/* What TCP flags are set from RST/SYN/FIN/ACK.
 * Values index the middle dimension of tcp_conntracks[][][] and are
 * produced by get_conntrack_index().
 */
enum tcp_bit_set {
	TCP_SYN_SET,
	TCP_SYNACK_SET,
	TCP_FIN_SET,
	TCP_ACK_SET,
	TCP_RST_SET,
	TCP_NONE_SET,
};
94
95 /*
96 * The TCP state transition table needs a few words...
97 *
98 * We are the man in the middle. All the packets go through us
99 * but might get lost in transit to the destination.
100 * It is assumed that the destinations can't receive segments
101 * we haven't seen.
102 *
103 * The checked segment is in window, but our windows are *not*
104 * equivalent with the ones of the sender/receiver. We always
105 * try to guess the state of the current sender.
106 *
107 * The meaning of the states are:
108 *
109 * NONE: initial state
110 * SYN_SENT: SYN-only packet seen
111 * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open
112 * SYN_RECV: SYN-ACK packet seen
113 * ESTABLISHED: ACK packet seen
114 * FIN_WAIT: FIN packet seen
115 * CLOSE_WAIT: ACK seen (after FIN)
116 * LAST_ACK: FIN seen (after FIN)
117 * TIME_WAIT: last ACK seen
118 * CLOSE: closed connection (RST)
119 *
120 * Packets marked as IGNORED (sIG):
121 * if they may be either invalid or valid
122 * and the receiver may send back a connection
123 * closing RST or a SYN/ACK.
124 *
125 * Packets marked as INVALID (sIV):
126 * if we regard them as truly invalid packets
127 */
/* State transition table: tcp_conntracks[dir][bit_set][current_state]
 * yields the next state (or sIG/sIV), where dir is 0 for ORIGINAL and
 * 1 for REPLY, and bit_set is an enum tcp_bit_set index.
 */
static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
	{
/* ORIGINAL */
/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
/*syn*/	   { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
/*
 *	sNO -> sSS	Initialize a new connection
 *	sSS -> sSS	Retransmitted SYN
 *	sS2 -> sS2	Late retransmitted SYN
 *	sSR -> sIG
 *	sES -> sIG	Error: SYNs in window outside the SYN_SENT state
 *			are errors. Receiver will reply with RST
 *			and close the connection.
 *			Or we are not in sync and hold a dead connection.
 *	sFW -> sIG
 *	sCW -> sIG
 *	sLA -> sIG
 *	sTW -> sSS	Reopened connection (RFC 1122).
 *	sCL -> sSS
 */
/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
/*
 *	sNO -> sIV	Too late and no reason to do anything
 *	sSS -> sIV	Client can't send SYN and then SYN/ACK
 *	sS2 -> sSR	SYN/ACK sent to SYN2 in simultaneous open
 *	sSR -> sSR	Late retransmitted SYN/ACK in simultaneous open
 *	sES -> sIV	Invalid SYN/ACK packets sent by the client
 *	sFW -> sIV
 *	sCW -> sIV
 *	sLA -> sIV
 *	sTW -> sIV
 *	sCL -> sIV
 */
/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
 *	sNO -> sIV	Too late and no reason to do anything...
 *	sSS -> sIV	Client might not send FIN in this state:
 *			we enforce waiting for a SYN/ACK reply first.
 *	sS2 -> sIV
 *	sSR -> sFW	Close started.
 *	sES -> sFW
 *	sFW -> sLA	FIN seen in both directions, waiting for
 *			the last ACK.
 *			Might be a retransmitted FIN as well...
 *	sCW -> sLA
 *	sLA -> sLA	Retransmitted FIN. Remain in the same state.
 *	sTW -> sTW
 *	sCL -> sCL
 */
/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
/*ack*/	   { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
/*
 *	sNO -> sES	Assumed.
 *	sSS -> sIV	ACK is invalid: we haven't seen a SYN/ACK yet.
 *	sS2 -> sIV
 *	sSR -> sES	Established state is reached.
 *	sES -> sES	:-)
 *	sFW -> sCW	Normal close request answered by ACK.
 *	sCW -> sCW
 *	sLA -> sTW	Last ACK detected (RFC5961 challenged)
 *	sTW -> sTW	Retransmitted last ACK. Remain in the same state.
 *	sCL -> sCL
 */
/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
	},
	{
/* REPLY */
/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
/*syn*/	   { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 },
/*
 *	sNO -> sIV	Never reached.
 *	sSS -> sS2	Simultaneous open
 *	sS2 -> sS2	Retransmitted simultaneous SYN
 *	sSR -> sIV	Invalid SYN packets sent by the server
 *	sES -> sIV
 *	sFW -> sIV
 *	sCW -> sIV
 *	sLA -> sIV
 *	sTW -> sSS	Reopened connection, but server may have switched role
 *	sCL -> sIV
 */
/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
/*
 *	sSS -> sSR	Standard open.
 *	sS2 -> sSR	Simultaneous open
 *	sSR -> sIG	Retransmitted SYN/ACK, ignore it.
 *	sES -> sIG	Late retransmitted SYN/ACK?
 *	sFW -> sIG	Might be SYN/ACK answering ignored SYN
 *	sCW -> sIG
 *	sLA -> sIG
 *	sTW -> sIG
 *	sCL -> sIG
 */
/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
 *	sSS -> sIV	Server might not send FIN in this state.
 *	sS2 -> sIV
 *	sSR -> sFW	Close started.
 *	sES -> sFW
 *	sFW -> sLA	FIN seen in both directions.
 *	sCW -> sLA
 *	sLA -> sLA	Retransmitted FIN.
 *	sTW -> sTW
 *	sCL -> sCL
 */
/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
/*ack*/	   { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
/*
 *	sSS -> sIG	Might be a half-open connection.
 *	sS2 -> sIG
 *	sSR -> sSR	Might answer late resent SYN.
 *	sES -> sES	:-)
 *	sFW -> sCW	Normal close request answered by ACK.
 *	sCW -> sCW
 *	sLA -> sTW	Last ACK detected (RFC5961 challenged)
 *	sTW -> sTW	Retransmitted last ACK.
 *	sCL -> sCL
 */
/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
	}
};
257
#ifdef CONFIG_NF_CONNTRACK_PROCFS
/* Print out the private part of the conntrack: the symbolic name of
 * its current TCP state.  Flows with IPS_OFFLOAD set are skipped.
 */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
	if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
		seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
}
#endif
268
get_conntrack_index(const struct tcphdr * tcph)269 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
270 {
271 if (tcph->rst) return TCP_RST_SET;
272 else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
273 else if (tcph->fin) return TCP_FIN_SET;
274 else if (tcph->ack) return TCP_ACK_SET;
275 else return TCP_NONE_SET;
276 }
277
278 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
279 in IP Filter' by Guido van Rooij.
280
281 http://www.sane.nl/events/sane2000/papers.html
282 http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
283
284 The boundaries and the conditions are changed according to RFC793:
285 the packet must intersect the window (i.e. segments may be
286 after the right or before the left edge) and thus receivers may ACK
287 segments after the right edge of the window.
288
289 td_maxend = max(sack + max(win,1)) seen in reply packets
290 td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
291 td_maxwin += seq + len - sender.td_maxend
292 if seq + len > sender.td_maxend
293 td_end = max(seq + len) seen in sent packets
294
295 I. Upper bound for valid data: seq <= sender.td_maxend
296 II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
297 III. Upper bound for valid (s)ack: sack <= receiver.td_end
298 IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW
299
300 where sack is the highest right edge of sack block found in the packet
301 or ack in the case of packet without SACK option.
302
   The upper bound limit for a valid (s)ack is not ignored -
   we don't have to deal with fragments.
305 */
306
/* Right edge of the segment in sequence space: seq plus the TCP payload
 * length, with SYN and FIN each counting as one octet of sequence space.
 * XXX Should I use payload length field in IP/IPv6 header ?
 * - YK
 */
static inline __u32 segment_seq_plus_len(__u32 seq,
					 size_t len,
					 unsigned int dataoff,
					 const struct tcphdr *tcph)
{
	__u32 seglen = len - dataoff - tcph->doff * 4;

	if (tcph->syn)
		seglen++;
	if (tcph->fin)
		seglen++;

	return seq + seglen;
}
317
/* Fixme: what about big packets? */
#define MAXACKWINCONST			66000
/* Allowed distance below the receiver's td_end for a valid (s)ack:
 * the sender's max window, but never less than MAXACKWINCONST.
 */
#define MAXACKWINDOW(sender)						\
	((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin	\
					      : MAXACKWINCONST)
323
324 /*
325 * Simplified tcp_parse_options routine from tcp_input.c
326 */
tcp_options(const struct sk_buff * skb,unsigned int dataoff,const struct tcphdr * tcph,struct ip_ct_tcp_state * state)327 static void tcp_options(const struct sk_buff *skb,
328 unsigned int dataoff,
329 const struct tcphdr *tcph,
330 struct ip_ct_tcp_state *state)
331 {
332 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
333 const unsigned char *ptr;
334 int length = (tcph->doff*4) - sizeof(struct tcphdr);
335
336 if (!length)
337 return;
338
339 ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
340 length, buff);
341 if (!ptr)
342 return;
343
344 state->td_scale = 0;
345 state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
346
347 while (length > 0) {
348 int opcode=*ptr++;
349 int opsize;
350
351 switch (opcode) {
352 case TCPOPT_EOL:
353 return;
354 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
355 length--;
356 continue;
357 default:
358 if (length < 2)
359 return;
360 opsize=*ptr++;
361 if (opsize < 2) /* "silly options" */
362 return;
363 if (opsize > length)
364 return; /* don't parse partial options */
365
366 if (opcode == TCPOPT_SACK_PERM
367 && opsize == TCPOLEN_SACK_PERM)
368 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
369 else if (opcode == TCPOPT_WINDOW
370 && opsize == TCPOLEN_WINDOW) {
371 state->td_scale = *(u_int8_t *)ptr;
372
373 if (state->td_scale > TCP_MAX_WSCALE)
374 state->td_scale = TCP_MAX_WSCALE;
375
376 state->flags |=
377 IP_CT_TCP_FLAG_WINDOW_SCALE;
378 }
379 ptr += opsize - 2;
380 length -= opsize;
381 }
382 }
383 }
384
/* Scan the TCP options for a SACK block and raise *sack to the highest
 * right edge found (only if it is after the current *sack value).
 * *sack must be pre-initialized by the caller (tcp_in_window sets it to
 * the plain ack_seq).
 */
static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
		     const struct tcphdr *tcph, __u32 *sack)
{
	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
	const unsigned char *ptr;
	int length = (tcph->doff*4) - sizeof(struct tcphdr);
	__u32 tmp;

	if (!length)
		return;

	ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
				 length, buff);
	if (!ptr)
		return;

	/* Fast path for timestamp-only option */
	if (length == TCPOLEN_TSTAMP_ALIGNED
	    && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
				       | (TCPOPT_NOP << 16)
				       | (TCPOPT_TIMESTAMP << 8)
				       | TCPOLEN_TIMESTAMP))
		return;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize, i;

		switch (opcode) {
		case TCPOPT_EOL:
			return;
		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
			length--;
			continue;
		default:
			/* Need at least the length octet. */
			if (length < 2)
				return;
			opsize = *ptr++;
			if (opsize < 2) /* "silly options" */
				return;
			if (opsize > length)
				return;	/* don't parse partial options */

			/* Accept only a well-formed SACK option: base
			 * header plus a whole number of 8-byte blocks. */
			if (opcode == TCPOPT_SACK
			    && opsize >= (TCPOLEN_SACK_BASE
					  + TCPOLEN_SACK_PERBLOCK)
			    && !((opsize - TCPOLEN_SACK_BASE)
				 % TCPOLEN_SACK_PERBLOCK)) {
				for (i = 0;
				     i < (opsize - TCPOLEN_SACK_BASE);
				     i += TCPOLEN_SACK_PERBLOCK) {
					/* Right edge of this SACK block. */
					tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);

					if (after(tmp, *sack))
						*sack = tmp;
				}
				return;
			}
			ptr += opsize - 2;
			length -= opsize;
		}
	}
}
448
/* (Re)initialize one direction's window tracking from a SYN-ACK in
 * reply to a SYN, or from a SYN seen in the reply direction during a
 * simultaneous open.  @end is the segment's right edge, @win its raw
 * window value.
 */
static void tcp_init_sender(struct ip_ct_tcp_state *sender,
			    struct ip_ct_tcp_state *receiver,
			    const struct sk_buff *skb,
			    unsigned int dataoff,
			    const struct tcphdr *tcph,
			    u32 end, u32 win)
{
	sender->td_end = end;
	sender->td_maxend = end;
	sender->td_maxwin = win ? win : 1;

	tcp_options(skb, dataoff, tcph, sender);

	/* RFC 1323:
	 * Both sides must send the Window Scale option
	 * to enable window scaling in either direction.
	 */
	if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE) ||
	    !(receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) {
		sender->td_scale = 0;
		receiver->td_scale = 0;
	}
}
474
/* Core TCP window tracking check (Rooij's "Real Stateful TCP Packet
 * Filtering in IP Filter", see the comment above for the invariants
 * I-IV).  Validates that the segment fits the tracked windows of both
 * directions and, if so, updates the per-direction td_* state.
 * Returns true if the packet is acceptable; false marks it invalid
 * (unless BE_LIBERAL / tcp_be_liberal overrides the verdict).
 */
static bool tcp_in_window(struct nf_conn *ct,
			  enum ip_conntrack_dir dir,
			  unsigned int index,
			  const struct sk_buff *skb,
			  unsigned int dataoff,
			  const struct tcphdr *tcph,
			  const struct nf_hook_state *hook_state)
{
	struct ip_ct_tcp *state = &ct->proto.tcp;
	struct net *net = nf_ct_net(ct);
	struct nf_tcp_net *tn = nf_tcp_pernet(net);
	struct ip_ct_tcp_state *sender = &state->seen[dir];
	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
	__u32 seq, ack, sack, end, win, swin;
	u16 win_raw;
	s32 receiver_offset;
	bool res, in_recv_win;

	/*
	 * Get the required data from the packet.
	 */
	seq = ntohl(tcph->seq);
	ack = sack = ntohl(tcph->ack_seq);
	win_raw = ntohs(tcph->window);
	win = win_raw;
	end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);

	if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
		tcp_sack(skb, dataoff, tcph, &sack);

	/* Take into account NAT sequence number mangling */
	receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
	ack -= receiver_offset;
	sack -= receiver_offset;

	if (sender->td_maxwin == 0) {
		/*
		 * Initialize sender data.
		 */
		if (tcph->syn) {
			tcp_init_sender(sender, receiver,
					skb, dataoff, tcph,
					end, win);
			if (!tcph->ack)
				/* Simultaneous open */
				return true;
		} else {
			/*
			 * We are in the middle of a connection,
			 * its history is lost for us.
			 * Let's try to use the data from the packet.
			 */
			sender->td_end = end;
			swin = win << sender->td_scale;
			sender->td_maxwin = (swin == 0 ? 1 : swin);
			sender->td_maxend = end + sender->td_maxwin;
			if (receiver->td_maxwin == 0) {
				/* We haven't seen traffic in the other
				 * direction yet but we have to tweak window
				 * tracking to pass III and IV until that
				 * happens.
				 */
				receiver->td_end = receiver->td_maxend = sack;
			} else if (sack == receiver->td_end + 1) {
				/* Likely a reply to a keepalive.
				 * Needed for III.
				 */
				receiver->td_end++;
			}

		}
	} else if (tcph->syn &&
		   after(end, sender->td_end) &&
		   (state->state == TCP_CONNTRACK_SYN_SENT ||
		    state->state == TCP_CONNTRACK_SYN_RECV)) {
		/*
		 * RFC 793: "if a TCP is reinitialized ... then it need
		 * not wait at all; it must only be sure to use sequence
		 * numbers larger than those recently used."
		 *
		 * Re-init state for this direction, just like for the first
		 * syn(-ack) reply, it might differ in seq, ack or tcp options.
		 */
		tcp_init_sender(sender, receiver,
				skb, dataoff, tcph,
				end, win);

		if (dir == IP_CT_DIR_REPLY && !tcph->ack)
			return true;
	}

	if (!(tcph->ack)) {
		/*
		 * If there is no ACK, just pretend it was set and OK.
		 */
		ack = sack = receiver->td_end;
	} else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
		    (TCP_FLAG_ACK|TCP_FLAG_RST))
		   && (ack == 0)) {
		/*
		 * Broken TCP stacks, that set ACK in RST packets as well
		 * with zero ack value.
		 */
		ack = sack = receiver->td_end;
	}

	if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
		/*
		 * RST sent answering SYN.
		 */
		seq = end = sender->td_end;

	/* Is the ending sequence in the receive window (if available)? */
	in_recv_win = !receiver->td_maxwin ||
		      after(end, sender->td_end - receiver->td_maxwin - 1);

	/* Invariants I-IV from the comment above tcp_options(). */
	if (before(seq, sender->td_maxend + 1) &&
	    in_recv_win &&
	    before(sack, receiver->td_end + 1) &&
	    after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
		/*
		 * Take into account window scaling (RFC 1323).
		 */
		if (!tcph->syn)
			win <<= sender->td_scale;

		/*
		 * Update sender data.
		 */
		swin = win + (sack - ack);
		if (sender->td_maxwin < swin)
			sender->td_maxwin = swin;
		if (after(end, sender->td_end)) {
			sender->td_end = end;
			sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
		}
		if (tcph->ack) {
			if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
				sender->td_maxack = ack;
				sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
			} else if (after(ack, sender->td_maxack))
				sender->td_maxack = ack;
		}

		/*
		 * Update receiver data.
		 */
		if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
			receiver->td_maxwin += end - sender->td_maxend;
		if (after(sack + win, receiver->td_maxend - 1)) {
			receiver->td_maxend = sack + win;
			if (win == 0)
				receiver->td_maxend++;
		}
		if (ack == receiver->td_end)
			receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;

		/*
		 * Check retransmissions.
		 */
		if (index == TCP_ACK_SET) {
			if (state->last_dir == dir
			    && state->last_seq == seq
			    && state->last_ack == ack
			    && state->last_end == end
			    && state->last_win == win_raw)
				state->retrans++;
			else {
				state->last_dir = dir;
				state->last_seq = seq;
				state->last_ack = ack;
				state->last_end = end;
				state->last_win = win_raw;
				state->retrans = 0;
			}
		}
		res = true;
	} else {
		res = false;
		if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
		    tn->tcp_be_liberal)
			res = true;
		if (!res) {
			bool seq_ok = before(seq, sender->td_maxend + 1);

			if (!seq_ok) {
				u32 overshot = end - sender->td_maxend + 1;
				bool ack_ok;

				ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1);

				if (in_recv_win &&
				    ack_ok &&
				    overshot <= receiver->td_maxwin &&
				    before(sack, receiver->td_end + 1)) {
					/* Work around TCPs that send more bytes than allowed by
					 * the receive window.
					 *
					 * If the (marked as invalid) packet is allowed to pass by
					 * the ruleset and the peer acks this data, then its possible
					 * all future packets will trigger 'ACK is over upper bound' check.
					 *
					 * Thus if only the sequence check fails then do update td_end so
					 * possible ACK for this data can update internal state.
					 */
					sender->td_end = end;
					sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;

					nf_ct_l4proto_log_invalid(skb, ct, hook_state,
								  "%u bytes more than expected", overshot);
					return res;
				}
			}

			nf_ct_l4proto_log_invalid(skb, ct, hook_state,
			"%s",
			before(seq, sender->td_maxend + 1) ?
			in_recv_win ?
			before(sack, receiver->td_end + 1) ?
			after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
			: "ACK is under the lower bound (possible overly delayed ACK)"
			: "ACK is over the upper bound (ACKed data not seen yet)"
			: "SEQ is under the lower bound (already ACKed data retransmitted)"
			: "SEQ is over the upper bound (over the window of the receiver)");
		}
	}

	return res;
}
704
/* table of valid flag combinations - PUSH, ECE and CWR are always valid.
 * Indexed by the FIN/SYN/RST/ACK/URG flag bits; entries left zero
 * (e.g. SYN|FIN, bare URG) are rejected by tcp_error().
 */
static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
				 TCPHDR_URG) + 1] =
{
	[TCPHDR_SYN]				= 1,
	[TCPHDR_SYN|TCPHDR_URG]			= 1,
	[TCPHDR_SYN|TCPHDR_ACK]			= 1,
	[TCPHDR_RST]				= 1,
	[TCPHDR_RST|TCPHDR_ACK]			= 1,
	[TCPHDR_FIN|TCPHDR_ACK]			= 1,
	[TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]	= 1,
	[TCPHDR_ACK]				= 1,
	[TCPHDR_ACK|TCPHDR_URG]			= 1,
};
719
/* Convenience wrapper: log an invalid-packet message tagged IPPROTO_TCP. */
static void tcp_error_log(const struct sk_buff *skb,
			  const struct nf_hook_state *state,
			  const char *msg)
{
	nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg);
}
726
727 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
tcp_error(const struct tcphdr * th,struct sk_buff * skb,unsigned int dataoff,const struct nf_hook_state * state)728 static bool tcp_error(const struct tcphdr *th,
729 struct sk_buff *skb,
730 unsigned int dataoff,
731 const struct nf_hook_state *state)
732 {
733 unsigned int tcplen = skb->len - dataoff;
734 u8 tcpflags;
735
736 /* Not whole TCP header or malformed packet */
737 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
738 tcp_error_log(skb, state, "truncated packet");
739 return true;
740 }
741
742 /* Checksum invalid? Ignore.
743 * We skip checking packets on the outgoing path
744 * because the checksum is assumed to be correct.
745 */
746 /* FIXME: Source route IP option packets --RR */
747 if (state->net->ct.sysctl_checksum &&
748 state->hook == NF_INET_PRE_ROUTING &&
749 nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
750 tcp_error_log(skb, state, "bad checksum");
751 return true;
752 }
753
754 /* Check TCP flags. */
755 tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
756 if (!tcp_valid_flags[tcpflags]) {
757 tcp_error_log(skb, state, "invalid tcp flag combination");
758 return true;
759 }
760
761 return false;
762 }
763
tcp_new(struct nf_conn * ct,const struct sk_buff * skb,unsigned int dataoff,const struct tcphdr * th)764 static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
765 unsigned int dataoff,
766 const struct tcphdr *th)
767 {
768 enum tcp_conntrack new_state;
769 struct net *net = nf_ct_net(ct);
770 const struct nf_tcp_net *tn = nf_tcp_pernet(net);
771
772 /* Don't need lock here: this conntrack not in circulation yet */
773 new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
774
775 /* Invalid: delete conntrack */
776 if (new_state >= TCP_CONNTRACK_MAX) {
777 pr_debug("nf_ct_tcp: invalid new deleting.\n");
778 return false;
779 }
780
781 if (new_state == TCP_CONNTRACK_SYN_SENT) {
782 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
783 /* SYN packet */
784 ct->proto.tcp.seen[0].td_end =
785 segment_seq_plus_len(ntohl(th->seq), skb->len,
786 dataoff, th);
787 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
788 if (ct->proto.tcp.seen[0].td_maxwin == 0)
789 ct->proto.tcp.seen[0].td_maxwin = 1;
790 ct->proto.tcp.seen[0].td_maxend =
791 ct->proto.tcp.seen[0].td_end;
792
793 tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
794 } else if (tn->tcp_loose == 0) {
795 /* Don't try to pick up connections. */
796 return false;
797 } else {
798 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
799 /*
800 * We are in the middle of a connection,
801 * its history is lost for us.
802 * Let's try to use the data from the packet.
803 */
804 ct->proto.tcp.seen[0].td_end =
805 segment_seq_plus_len(ntohl(th->seq), skb->len,
806 dataoff, th);
807 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
808 if (ct->proto.tcp.seen[0].td_maxwin == 0)
809 ct->proto.tcp.seen[0].td_maxwin = 1;
810 ct->proto.tcp.seen[0].td_maxend =
811 ct->proto.tcp.seen[0].td_end +
812 ct->proto.tcp.seen[0].td_maxwin;
813
814 /* We assume SACK and liberal window checking to handle
815 * window scaling */
816 ct->proto.tcp.seen[0].flags =
817 ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
818 IP_CT_TCP_FLAG_BE_LIBERAL;
819 }
820
821 /* tcp_packet will set them */
822 ct->proto.tcp.last_index = TCP_NONE_SET;
823 return true;
824 }
825
tcp_can_early_drop(const struct nf_conn * ct)826 static bool tcp_can_early_drop(const struct nf_conn *ct)
827 {
828 switch (ct->proto.tcp.state) {
829 case TCP_CONNTRACK_FIN_WAIT:
830 case TCP_CONNTRACK_LAST_ACK:
831 case TCP_CONNTRACK_TIME_WAIT:
832 case TCP_CONNTRACK_CLOSE:
833 case TCP_CONNTRACK_CLOSE_WAIT:
834 return true;
835 default:
836 break;
837 }
838
839 return false;
840 }
841
nf_ct_tcp_state_reset(struct ip_ct_tcp_state * state)842 static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state)
843 {
844 state->td_end = 0;
845 state->td_maxend = 0;
846 state->td_maxwin = 0;
847 state->td_maxack = 0;
848 state->td_scale = 0;
849 state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
850 }
851
852 /* Returns verdict for packet, or -1 for invalid. */
nf_conntrack_tcp_packet(struct nf_conn * ct,struct sk_buff * skb,unsigned int dataoff,enum ip_conntrack_info ctinfo,const struct nf_hook_state * state)853 int nf_conntrack_tcp_packet(struct nf_conn *ct,
854 struct sk_buff *skb,
855 unsigned int dataoff,
856 enum ip_conntrack_info ctinfo,
857 const struct nf_hook_state *state)
858 {
859 struct net *net = nf_ct_net(ct);
860 struct nf_tcp_net *tn = nf_tcp_pernet(net);
861 struct nf_conntrack_tuple *tuple;
862 enum tcp_conntrack new_state, old_state;
863 unsigned int index, *timeouts;
864 enum ip_conntrack_dir dir;
865 const struct tcphdr *th;
866 struct tcphdr _tcph;
867 unsigned long timeout;
868
869 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
870 if (th == NULL)
871 return -NF_ACCEPT;
872
873 if (tcp_error(th, skb, dataoff, state))
874 return -NF_ACCEPT;
875
876 if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
877 return -NF_ACCEPT;
878
879 spin_lock_bh(&ct->lock);
880 old_state = ct->proto.tcp.state;
881 dir = CTINFO2DIR(ctinfo);
882 index = get_conntrack_index(th);
883 new_state = tcp_conntracks[dir][index][old_state];
884 tuple = &ct->tuplehash[dir].tuple;
885
886 switch (new_state) {
887 case TCP_CONNTRACK_SYN_SENT:
888 if (old_state < TCP_CONNTRACK_TIME_WAIT)
889 break;
890 /* RFC 1122: "When a connection is closed actively,
891 * it MUST linger in TIME-WAIT state for a time 2xMSL
892 * (Maximum Segment Lifetime). However, it MAY accept
893 * a new SYN from the remote TCP to reopen the connection
894 * directly from TIME-WAIT state, if..."
895 * We ignore the conditions because we are in the
896 * TIME-WAIT state anyway.
897 *
898 * Handle aborted connections: we and the server
899 * think there is an existing connection but the client
900 * aborts it and starts a new one.
901 */
902 if (((ct->proto.tcp.seen[dir].flags
903 | ct->proto.tcp.seen[!dir].flags)
904 & IP_CT_TCP_FLAG_CLOSE_INIT)
905 || (ct->proto.tcp.last_dir == dir
906 && ct->proto.tcp.last_index == TCP_RST_SET)) {
907 /* Attempt to reopen a closed/aborted connection.
908 * Delete this connection and look up again. */
909 spin_unlock_bh(&ct->lock);
910
911 /* Only repeat if we can actually remove the timer.
912 * Destruction may already be in progress in process
913 * context and we must give it a chance to terminate.
914 */
915 if (nf_ct_kill(ct))
916 return -NF_REPEAT;
917 return NF_DROP;
918 }
919 fallthrough;
920 case TCP_CONNTRACK_IGNORE:
921 /* Ignored packets:
922 *
923 * Our connection entry may be out of sync, so ignore
924 * packets which may signal the real connection between
925 * the client and the server.
926 *
927 * a) SYN in ORIGINAL
928 * b) SYN/ACK in REPLY
929 * c) ACK in reply direction after initial SYN in original.
930 *
931 * If the ignored packet is invalid, the receiver will send
932 * a RST we'll catch below.
933 */
934 if (index == TCP_SYNACK_SET
935 && ct->proto.tcp.last_index == TCP_SYN_SET
936 && ct->proto.tcp.last_dir != dir
937 && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
938 /* b) This SYN/ACK acknowledges a SYN that we earlier
939 * ignored as invalid. This means that the client and
940 * the server are both in sync, while the firewall is
941 * not. We get in sync from the previously annotated
942 * values.
943 */
944 old_state = TCP_CONNTRACK_SYN_SENT;
945 new_state = TCP_CONNTRACK_SYN_RECV;
946 ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
947 ct->proto.tcp.last_end;
948 ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
949 ct->proto.tcp.last_end;
950 ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
951 ct->proto.tcp.last_win == 0 ?
952 1 : ct->proto.tcp.last_win;
953 ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
954 ct->proto.tcp.last_wscale;
955 ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
956 ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
957 ct->proto.tcp.last_flags;
958 nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]);
959 break;
960 }
961 ct->proto.tcp.last_index = index;
962 ct->proto.tcp.last_dir = dir;
963 ct->proto.tcp.last_seq = ntohl(th->seq);
964 ct->proto.tcp.last_end =
965 segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
966 ct->proto.tcp.last_win = ntohs(th->window);
967
968 /* a) This is a SYN in ORIGINAL. The client and the server
969 * may be in sync but we are not. In that case, we annotate
970 * the TCP options and let the packet go through. If it is a
971 * valid SYN packet, the server will reply with a SYN/ACK, and
972 * then we'll get in sync. Otherwise, the server potentially
973 * responds with a challenge ACK if implementing RFC5961.
974 */
975 if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
976 struct ip_ct_tcp_state seen = {};
977
978 ct->proto.tcp.last_flags =
979 ct->proto.tcp.last_wscale = 0;
980 tcp_options(skb, dataoff, th, &seen);
981 if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
982 ct->proto.tcp.last_flags |=
983 IP_CT_TCP_FLAG_WINDOW_SCALE;
984 ct->proto.tcp.last_wscale = seen.td_scale;
985 }
986 if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
987 ct->proto.tcp.last_flags |=
988 IP_CT_TCP_FLAG_SACK_PERM;
989 }
990 /* Mark the potential for RFC5961 challenge ACK,
991 * this pose a special problem for LAST_ACK state
992 * as ACK is intrepretated as ACKing last FIN.
993 */
994 if (old_state == TCP_CONNTRACK_LAST_ACK)
995 ct->proto.tcp.last_flags |=
996 IP_CT_EXP_CHALLENGE_ACK;
997 }
998 spin_unlock_bh(&ct->lock);
999 nf_ct_l4proto_log_invalid(skb, ct, state,
1000 "packet (index %d) in dir %d ignored, state %s",
1001 index, dir,
1002 tcp_conntrack_names[old_state]);
1003 return NF_ACCEPT;
1004 case TCP_CONNTRACK_MAX:
1005 /* Special case for SYN proxy: when the SYN to the server or
1006 * the SYN/ACK from the server is lost, the client may transmit
1007 * a keep-alive packet while in SYN_SENT state. This needs to
1008 * be associated with the original conntrack entry in order to
1009 * generate a new SYN with the correct sequence number.
1010 */
1011 if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
1012 index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
1013 ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
1014 ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
1015 pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
1016 spin_unlock_bh(&ct->lock);
1017 return NF_ACCEPT;
1018 }
1019
1020 /* Invalid packet */
1021 spin_unlock_bh(&ct->lock);
1022 nf_ct_l4proto_log_invalid(skb, ct, state,
1023 "packet (index %d) in dir %d invalid, state %s",
1024 index, dir,
1025 tcp_conntrack_names[old_state]);
1026 return -NF_ACCEPT;
1027 case TCP_CONNTRACK_TIME_WAIT:
1028 /* RFC5961 compliance cause stack to send "challenge-ACK"
1029 * e.g. in response to spurious SYNs. Conntrack MUST
1030 * not believe this ACK is acking last FIN.
1031 */
1032 if (old_state == TCP_CONNTRACK_LAST_ACK &&
1033 index == TCP_ACK_SET &&
1034 ct->proto.tcp.last_dir != dir &&
1035 ct->proto.tcp.last_index == TCP_SYN_SET &&
1036 (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
1037 /* Detected RFC5961 challenge ACK */
1038 ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
1039 spin_unlock_bh(&ct->lock);
1040 nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored");
1041 return NF_ACCEPT; /* Don't change state */
1042 }
1043 break;
1044 case TCP_CONNTRACK_SYN_SENT2:
1045 /* tcp_conntracks table is not smart enough to handle
1046 * simultaneous open.
1047 */
1048 ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
1049 break;
1050 case TCP_CONNTRACK_SYN_RECV:
1051 if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
1052 ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
1053 new_state = TCP_CONNTRACK_ESTABLISHED;
1054 break;
1055 case TCP_CONNTRACK_CLOSE:
1056 if (index != TCP_RST_SET)
1057 break;
1058
1059 /* If we are closing, tuple might have been re-used already.
1060 * last_index, last_ack, and all other ct fields used for
1061 * sequence/window validation are outdated in that case.
1062 *
1063 * As the conntrack can already be expired by GC under pressure,
1064 * just skip validation checks.
1065 */
1066 if (tcp_can_early_drop(ct))
1067 goto in_window;
1068
1069 /* td_maxack might be outdated if we let a SYN through earlier */
1070 if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
1071 ct->proto.tcp.last_index != TCP_SYN_SET) {
1072 u32 seq = ntohl(th->seq);
1073
1074 /* If we are not in established state and SEQ=0 this is most
1075 * likely an answer to a SYN we let go through above (last_index
1076 * can be updated due to out-of-order ACKs).
1077 */
1078 if (seq == 0 && !nf_conntrack_tcp_established(ct))
1079 break;
1080
1081 if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
1082 !tn->tcp_ignore_invalid_rst) {
1083 /* Invalid RST */
1084 spin_unlock_bh(&ct->lock);
1085 nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
1086 return -NF_ACCEPT;
1087 }
1088
1089 if (!nf_conntrack_tcp_established(ct) ||
1090 seq == ct->proto.tcp.seen[!dir].td_maxack)
1091 break;
1092
1093 /* Check if rst is part of train, such as
1094 * foo:80 > bar:4379: P, 235946583:235946602(19) ack 42
1095 * foo:80 > bar:4379: R, 235946602:235946602(0) ack 42
1096 */
1097 if (ct->proto.tcp.last_index == TCP_ACK_SET &&
1098 ct->proto.tcp.last_dir == dir &&
1099 seq == ct->proto.tcp.last_end)
1100 break;
1101
1102 /* ... RST sequence number doesn't match exactly, keep
1103 * established state to allow a possible challenge ACK.
1104 */
1105 new_state = old_state;
1106 }
1107 if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
1108 && ct->proto.tcp.last_index == TCP_SYN_SET)
1109 || (!test_bit(IPS_ASSURED_BIT, &ct->status)
1110 && ct->proto.tcp.last_index == TCP_ACK_SET))
1111 && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
1112 /* RST sent to invalid SYN or ACK we had let through
1113 * at a) and c) above:
1114 *
1115 * a) SYN was in window then
1116 * c) we hold a half-open connection.
1117 *
1118 * Delete our connection entry.
1119 * We skip window checking, because packet might ACK
1120 * segments we ignored. */
1121 goto in_window;
1122 }
1123 break;
1124 default:
1125 /* Keep compilers happy. */
1126 break;
1127 }
1128
1129 if (!tcp_in_window(ct, dir, index,
1130 skb, dataoff, th, state)) {
1131 spin_unlock_bh(&ct->lock);
1132 return -NF_ACCEPT;
1133 }
1134 in_window:
1135 /* From now on we have got in-window packets */
1136 ct->proto.tcp.last_index = index;
1137 ct->proto.tcp.last_dir = dir;
1138
1139 pr_debug("tcp_conntracks: ");
1140 nf_ct_dump_tuple(tuple);
1141 pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
1142 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
1143 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
1144 old_state, new_state);
1145
1146 ct->proto.tcp.state = new_state;
1147 if (old_state != new_state
1148 && new_state == TCP_CONNTRACK_FIN_WAIT)
1149 ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1150
1151 timeouts = nf_ct_timeout_lookup(ct);
1152 if (!timeouts)
1153 timeouts = tn->timeouts;
1154
1155 if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
1156 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1157 timeout = timeouts[TCP_CONNTRACK_RETRANS];
1158 else if (unlikely(index == TCP_RST_SET))
1159 timeout = timeouts[TCP_CONNTRACK_CLOSE];
1160 else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
1161 IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
1162 timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
1163 timeout = timeouts[TCP_CONNTRACK_UNACK];
1164 else if (ct->proto.tcp.last_win == 0 &&
1165 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1166 timeout = timeouts[TCP_CONNTRACK_RETRANS];
1167 else
1168 timeout = timeouts[new_state];
1169 spin_unlock_bh(&ct->lock);
1170
1171 if (new_state != old_state)
1172 nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
1173
1174 if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1175 /* If only reply is a RST, we can consider ourselves not to
1176 have an established connection: this is a fairly common
1177 problem case, so we can delete the conntrack
1178 immediately. --RR */
1179 if (th->rst) {
1180 nf_ct_kill_acct(ct, ctinfo, skb);
1181 return NF_ACCEPT;
1182 }
1183
1184 if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
1185 /* do not renew timeout on SYN retransmit.
1186 *
1187 * Else port reuse by client or NAT middlebox can keep
1188 * entry alive indefinitely (including nat info).
1189 */
1190 return NF_ACCEPT;
1191 }
1192
1193 /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
1194 * pickup with loose=1. Avoid large ESTABLISHED timeout.
1195 */
1196 if (new_state == TCP_CONNTRACK_ESTABLISHED &&
1197 timeout > timeouts[TCP_CONNTRACK_UNACK])
1198 timeout = timeouts[TCP_CONNTRACK_UNACK];
1199 } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
1200 && (old_state == TCP_CONNTRACK_SYN_RECV
1201 || old_state == TCP_CONNTRACK_ESTABLISHED)
1202 && new_state == TCP_CONNTRACK_ESTABLISHED) {
1203 /* Set ASSURED if we see valid ack in ESTABLISHED
1204 after SYN_RECV or a valid answer for a picked up
1205 connection. */
1206 set_bit(IPS_ASSURED_BIT, &ct->status);
1207 nf_conntrack_event_cache(IPCT_ASSURED, ct);
1208 }
1209 nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
1210
1211 return NF_ACCEPT;
1212 }
1213
1214 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1215
1216 #include <linux/netfilter/nfnetlink.h>
1217 #include <linux/netfilter/nfnetlink_conntrack.h>
1218
/* Dump the TCP-private conntrack state into a CTA_PROTOINFO_TCP nest.
 * On destroy events only the state byte is emitted; otherwise the
 * per-direction window scales and flag structs are included as well.
 * Returns 0 on success, -1 if the skb ran out of room.
 */
static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
			 struct nf_conn *ct, bool destroy)
{
	struct nf_ct_tcp_flags flags = {};
	struct nlattr *nest;

	/* ct->proto.tcp is only stable under ct->lock. */
	spin_lock_bh(&ct->lock);
	nest = nla_nest_start(skb, CTA_PROTOINFO_TCP);
	if (!nest)
		goto nla_put_failure;

	if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
		goto nla_put_failure;

	if (!destroy) {
		if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
			       ct->proto.tcp.seen[0].td_scale) ||
		    nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
			       ct->proto.tcp.seen[1].td_scale))
			goto nla_put_failure;

		flags.flags = ct->proto.tcp.seen[0].flags;
		if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
			    sizeof(flags), &flags))
			goto nla_put_failure;

		flags.flags = ct->proto.tcp.seen[1].flags;
		if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
			    sizeof(flags), &flags))
			goto nla_put_failure;
	}
	spin_unlock_bh(&ct->lock);
	nla_nest_end(skb, nest);

	return 0;

nla_put_failure:
	spin_unlock_bh(&ct->lock);
	return -1;
}
1261
/* Validation policy for attributes nested inside CTA_PROTOINFO_TCP,
 * used by nlattr_to_tcp() when userspace updates a conntrack entry.
 */
static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
	[CTA_PROTOINFO_TCP_STATE]	    = { .type = NLA_U8 },
	[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
	[CTA_PROTOINFO_TCP_WSCALE_REPLY]    = { .type = NLA_U8 },
	[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]  = { .len = sizeof(struct nf_ct_tcp_flags) },
	[CTA_PROTOINFO_TCP_FLAGS_REPLY]	    = { .len = sizeof(struct nf_ct_tcp_flags) },
};
1269
/* Netlink payload budget for the attributes emitted by tcp_to_nlattr():
 * two single-byte attributes plus the two per-direction flag structs.
 * NOTE(review): tcp_to_nlattr() emits three u8 attributes (state plus
 * two window scales); verify the nest/base accounting elsewhere covers
 * the difference.
 */
#define TCP_NLATTR_SIZE ( \
	NLA_ALIGN(NLA_HDRLEN + 1) + \
	NLA_ALIGN(NLA_HDRLEN + 1) + \
	NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \
	NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)))
1275
nlattr_to_tcp(struct nlattr * cda[],struct nf_conn * ct)1276 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
1277 {
1278 struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
1279 struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
1280 int err;
1281
1282 /* updates could not contain anything about the private
1283 * protocol info, in that case skip the parsing */
1284 if (!pattr)
1285 return 0;
1286
1287 err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr,
1288 tcp_nla_policy, NULL);
1289 if (err < 0)
1290 return err;
1291
1292 if (tb[CTA_PROTOINFO_TCP_STATE] &&
1293 nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
1294 return -EINVAL;
1295
1296 spin_lock_bh(&ct->lock);
1297 if (tb[CTA_PROTOINFO_TCP_STATE])
1298 ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
1299
1300 if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
1301 struct nf_ct_tcp_flags *attr =
1302 nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
1303 ct->proto.tcp.seen[0].flags &= ~attr->mask;
1304 ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
1305 }
1306
1307 if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
1308 struct nf_ct_tcp_flags *attr =
1309 nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
1310 ct->proto.tcp.seen[1].flags &= ~attr->mask;
1311 ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
1312 }
1313
1314 if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
1315 tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
1316 ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
1317 ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
1318 ct->proto.tcp.seen[0].td_scale =
1319 nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
1320 ct->proto.tcp.seen[1].td_scale =
1321 nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
1322 }
1323 spin_unlock_bh(&ct->lock);
1324
1325 return 0;
1326 }
1327
/* Size of the netlink tuple attributes for TCP (port-based policy),
 * computed once via nla_policy_len() and cached.  The unsynchronized
 * lazy init is benign: concurrent callers compute the same value.
 */
static unsigned int tcp_nlattr_tuple_size(void)
{
	static unsigned int size __read_mostly;

	if (!size)
		size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);

	return size;
}
1337 #endif
1338
1339 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1340
1341 #include <linux/netfilter/nfnetlink.h>
1342 #include <linux/netfilter/nfnetlink_cttimeout.h>
1343
tcp_timeout_nlattr_to_obj(struct nlattr * tb[],struct net * net,void * data)1344 static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
1345 struct net *net, void *data)
1346 {
1347 struct nf_tcp_net *tn = nf_tcp_pernet(net);
1348 unsigned int *timeouts = data;
1349 int i;
1350
1351 if (!timeouts)
1352 timeouts = tn->timeouts;
1353 /* set default TCP timeouts. */
1354 for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
1355 timeouts[i] = tn->timeouts[i];
1356
1357 if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
1358 timeouts[TCP_CONNTRACK_SYN_SENT] =
1359 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
1360 }
1361
1362 if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
1363 timeouts[TCP_CONNTRACK_SYN_RECV] =
1364 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
1365 }
1366 if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
1367 timeouts[TCP_CONNTRACK_ESTABLISHED] =
1368 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
1369 }
1370 if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
1371 timeouts[TCP_CONNTRACK_FIN_WAIT] =
1372 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
1373 }
1374 if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
1375 timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
1376 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
1377 }
1378 if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
1379 timeouts[TCP_CONNTRACK_LAST_ACK] =
1380 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
1381 }
1382 if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
1383 timeouts[TCP_CONNTRACK_TIME_WAIT] =
1384 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
1385 }
1386 if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
1387 timeouts[TCP_CONNTRACK_CLOSE] =
1388 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
1389 }
1390 if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
1391 timeouts[TCP_CONNTRACK_SYN_SENT2] =
1392 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
1393 }
1394 if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
1395 timeouts[TCP_CONNTRACK_RETRANS] =
1396 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
1397 }
1398 if (tb[CTA_TIMEOUT_TCP_UNACK]) {
1399 timeouts[TCP_CONNTRACK_UNACK] =
1400 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
1401 }
1402
1403 timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT];
1404 return 0;
1405 }
1406
1407 static int
tcp_timeout_obj_to_nlattr(struct sk_buff * skb,const void * data)1408 tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
1409 {
1410 const unsigned int *timeouts = data;
1411
1412 if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
1413 htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
1414 nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
1415 htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
1416 nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
1417 htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
1418 nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
1419 htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
1420 nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
1421 htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
1422 nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
1423 htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
1424 nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
1425 htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
1426 nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
1427 htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
1428 nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
1429 htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
1430 nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
1431 htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
1432 nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
1433 htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
1434 goto nla_put_failure;
1435 return 0;
1436
1437 nla_put_failure:
1438 return -ENOSPC;
1439 }
1440
/* Validation policy for CTA_TIMEOUT_TCP_* attributes (all seconds,
 * NLA_U32), consumed by tcp_timeout_nlattr_to_obj().
 */
static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
	[CTA_TIMEOUT_TCP_SYN_SENT]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_SYN_RECV]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_ESTABLISHED]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_FIN_WAIT]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_CLOSE_WAIT]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_LAST_ACK]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_TIME_WAIT]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_CLOSE]		= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_SYN_SENT2]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_RETRANS]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_UNACK]		= { .type = NLA_U32 },
};
1454 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1455
nf_conntrack_tcp_init_net(struct net * net)1456 void nf_conntrack_tcp_init_net(struct net *net)
1457 {
1458 struct nf_tcp_net *tn = nf_tcp_pernet(net);
1459 int i;
1460
1461 for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
1462 tn->timeouts[i] = tcp_timeouts[i];
1463
1464 /* timeouts[0] is unused, make it same as SYN_SENT so
1465 * ->timeouts[0] contains 'new' timeout, like udp or icmp.
1466 */
1467 tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
1468
1469 /* If it is set to zero, we disable picking up already established
1470 * connections.
1471 */
1472 tn->tcp_loose = 1;
1473
1474 /* "Be conservative in what you do,
1475 * be liberal in what you accept from others."
1476 * If it's non-zero, we mark only out of window RST segments as INVALID.
1477 */
1478 tn->tcp_be_liberal = 0;
1479
1480 /* If it's non-zero, we turn off RST sequence number check */
1481 tn->tcp_ignore_invalid_rst = 0;
1482
1483 /* Max number of the retransmitted packets without receiving an (acceptable)
1484 * ACK from the destination. If this number is reached, a shorter timer
1485 * will be started.
1486 */
1487 tn->tcp_max_retrans = 3;
1488
1489 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
1490 tn->offload_timeout = 30 * HZ;
1491 #endif
1492 }
1493
/* L4 protocol tracker registration for TCP: wires the netlink
 * (de)serializers, the cttimeout handlers and the early-drop hook
 * defined in this file into the generic conntrack core.
 */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
{
	.l4proto 		= IPPROTO_TCP,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
	.print_conntrack 	= tcp_print_conntrack,
#endif
	.can_early_drop		= tcp_can_early_drop,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
	.to_nlattr		= tcp_to_nlattr,
	.from_nlattr		= nlattr_to_tcp,
	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
	.nlattr_tuple_size	= tcp_nlattr_tuple_size,
	.nlattr_size		= TCP_NLATTR_SIZE,
	.nla_policy		= nf_ct_port_nla_policy,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
	.ctnl_timeout		= {
		.nlattr_to_obj	= tcp_timeout_nlattr_to_obj,
		.obj_to_nlattr	= tcp_timeout_obj_to_nlattr,
		.nlattr_max	= CTA_TIMEOUT_TCP_MAX,
		.obj_size	= sizeof(unsigned int) *
					TCP_CONNTRACK_TIMEOUT_MAX,
		.nla_policy	= tcp_timeout_nla_policy,
	},
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
};
1521