1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_output.c,v 1.144 2001/11/06 22:21:08 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23 /*
24 * Changes: Pedro Roque : Retransmit queue handled by TCP.
25 * : Fragmentation on mtu decrease
26 * : Segment collapse on retransmit
27 * : AF independence
28 *
29 * Linus Torvalds : send_delayed_ack
30 * David S. Miller : Charge memory using the right skb
31 * during syn/ack processing.
32 * David S. Miller : Output engine completely rewritten.
33 * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
34 * Cacophonix Gaul : draft-minshall-nagle-01
35 * J Hadi Salim : ECN support
36 *
37 */
38
39 #include <net/tcp.h>
40
41 #include <linux/compiler.h>
42 #include <linux/smp_lock.h>
43
44 /* People can turn this off for buggy TCP's found in printers etc. */
45 int sysctl_tcp_retrans_collapse = 1;
46
47 static __inline__
48 void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
49 {
50 tp->send_head = skb->next;
51 if (tp->send_head == (struct sk_buff *) &sk->write_queue)
52 tp->send_head = NULL;
53 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
54 if (tp->packets_out++ == 0)
55 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
56 }
57
58 /* SND.NXT, if window was not shrunk.
59 * If the window has been shrunk, what should we do? It is not clear at all.
60 * Using SND.UNA we will fail to open the window, SND.NXT is out of window. :-(
61 * Anything in between SND.UNA...SND.UNA+SND.WND can also already be
62 * invalid. OK, let's settle on this for now:
63 */
64 static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
65 {
66 if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
67 return tp->snd_nxt;
68 else
69 return tp->snd_una+tp->snd_wnd;
70 }
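/* Illustrative example (numbers assumed, not from the original source):
 * with snd_una = 1000, snd_wnd = 500 and snd_nxt = 1700, the window has
 * been shrunk past data we already sent, so the function returns
 * snd_una + snd_wnd = 1500. If instead snd_nxt were 1400 (still inside
 * the window), it would simply return snd_nxt = 1400.
 */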
71
72 /* Calculate mss to advertise in SYN segment.
73 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
74 *
75 * 1. It is independent of path mtu.
76 * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
77 * 3. For IPv4 it is reasonable to calculate it from the maximal MTU of
78 * attached devices, because some buggy hosts are confused by
79 * large MSS.
80 * 4. We do not do 3; we advertise an MSS calculated from the first
81 * hop device mtu, but allow it to be raised to ip_rt_min_advmss.
82 * This may be overridden via information stored in the routing table.
83 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
84 * probably even Jumbo".
85 */
86 static __u16 tcp_advertise_mss(struct sock *sk)
87 {
88 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
89 struct dst_entry *dst = __sk_dst_get(sk);
90 int mss = tp->advmss;
91
92 if (dst && dst->advmss < mss) {
93 mss = dst->advmss;
94 tp->advmss = mss;
95 }
96
97 return (__u16)mss;
98 }
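/* Illustrative example (numbers assumed): if the route's dst->advmss is
 * 1460 (a 1500-byte first-hop MTU minus 40 bytes of IPv4+TCP headers) and
 * tp->advmss was initialized to something larger, the advertised MSS is
 * lowered to 1460 and cached back into tp->advmss for later use.
 */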
99
100 /* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
101 * This is the first part of the cwnd validation mechanism. */
102 static void tcp_cwnd_restart(struct tcp_opt *tp)
103 {
104 s32 delta = tcp_time_stamp - tp->lsndtime;
105 u32 restart_cwnd = tcp_init_cwnd(tp);
106 u32 cwnd = tp->snd_cwnd;
107
108 if (tcp_is_vegas(tp))
109 tcp_vegas_enable(tp);
110
111 tp->snd_ssthresh = tcp_current_ssthresh(tp);
112 restart_cwnd = min(restart_cwnd, cwnd);
113
114 while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
115 cwnd >>= 1;
116 tp->snd_cwnd = max(cwnd, restart_cwnd);
117 tp->snd_cwnd_stamp = tcp_time_stamp;
118 tp->snd_cwnd_used = 0;
119 }
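/* Worked example (values assumed, for illustration only): suppose tp->rto
 * is 200ms, the connection has been idle for 700ms, snd_cwnd is 8 and
 * tcp_init_cwnd() yields 2. The loop consumes two full RTOs
 * (700 -> 500 -> 300), halving cwnd 8 -> 4 -> 2; at that point cwnd is no
 * longer above restart_cwnd, so snd_cwnd restarts at max(2, 2) = 2.
 */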
120
121 static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
122 {
123 u32 now = tcp_time_stamp;
124
125 if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
126 tcp_cwnd_restart(tp);
127
128 tp->lsndtime = now;
129
130 /* If it is a reply sent within ATO of the last received
131 * packet, enter pingpong mode.
132 */
133 if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
134 tp->ack.pingpong = 1;
135 }
136
137 static __inline__ void tcp_event_ack_sent(struct sock *sk)
138 {
139 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
140
141 tcp_dec_quickack_mode(tp);
142 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
143 }
144
145 /* from 2.6's ALIGN, used in tcp_select_window() */
146 #define ALIGN_WIN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1)
147 #define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
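/* Illustrative example (assumed values): with a receive window scale of 2,
 * ALIGN_WIN(1001, 1 << 2) rounds 1001 up to 1004, the next multiple of 4,
 * so that the value survives the ">> rcv_wscale" shift in
 * tcp_select_window() without effectively shrinking the offered window.
 */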
148
149 /* Choose a new window to advertise, update state in tcp_opt for the
150 * socket, and return result with RFC1323 scaling applied. The return
151 * value can be stuffed directly into th->window for an outgoing
152 * frame.
153 */
154 static __inline__ u16 tcp_select_window(struct sock *sk)
155 {
156 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
157 u32 cur_win = tcp_receive_window(tp);
158 u32 new_win = __tcp_select_window(sk);
159
160 /* Never shrink the offered window */
161 if(new_win < cur_win) {
162 /* Danger Will Robinson!
163 * Don't update rcv_wup/rcv_wnd here or else
164 * we will not be able to advertise a zero
165 * window in time. --DaveM
166 *
167 * Relax Will Robinson.
168 */
169 new_win = ALIGN_WIN(cur_win, 1 << tp->rcv_wscale);
170 }
171 tp->rcv_wnd = new_win;
172 tp->rcv_wup = tp->rcv_nxt;
173
174 /* RFC1323 scaling applied */
175 new_win >>= tp->rcv_wscale;
176
177 /* If we advertise zero window, disable fast path. */
178 if (new_win == 0)
179 tp->pred_flags = 0;
180
181 return new_win;
182 }
183
184
185 /* This routine actually transmits TCP packets queued in by
186 * tcp_do_sendmsg(). This is used by both the initial
187 * transmission and possible later retransmissions.
188 * All SKB's seen here are completely headerless. It is our
189 * job to build the TCP header, and pass the packet down to
190 * IP so it can do the same plus pass the packet off to the
191 * device.
192 *
193 * We are working here with either a clone of the original
194 * SKB, or a fresh unique copy made by the retransmit engine.
195 */
196 int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
197 {
198 if(skb != NULL) {
199 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
200 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
201 int tcp_header_size = tp->tcp_header_len;
202 struct tcphdr *th;
203 int sysctl_flags;
204 int err;
205
206 #define SYSCTL_FLAG_TSTAMPS 0x1
207 #define SYSCTL_FLAG_WSCALE 0x2
208 #define SYSCTL_FLAG_SACK 0x4
209
210 sysctl_flags = 0;
211 if (tcb->flags & TCPCB_FLAG_SYN) {
212 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
213 if(sysctl_tcp_timestamps) {
214 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
215 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
216 }
217 if(sysctl_tcp_window_scaling) {
218 tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
219 sysctl_flags |= SYSCTL_FLAG_WSCALE;
220 }
221 if(sysctl_tcp_sack) {
222 sysctl_flags |= SYSCTL_FLAG_SACK;
223 if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
224 tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
225 }
226 } else if (tp->eff_sacks) {
227 /* A SACK is 2 pad bytes, a 2 byte header, plus
228 * 2 32-bit sequence numbers for each SACK block.
229 */
230 tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
231 (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
232 }
233
234 /*
235 * If the connection is idle and we are restarting,
236 * then we don't want to do any Vegas calculations
237 * until we get fresh RTT samples. So when we
238 * restart, we reset our Vegas state to a clean
239 * slate. After we get acks for this flight of
240 * packets, _then_ we can make Vegas calculations
241 * again.
242 */
243 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
244 tcp_vegas_enable(tp);
245
246 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
247 skb->h.th = th;
248 skb_set_owner_w(skb, sk);
249
250 /* Build TCP header and checksum it. */
251 th->source = sk->sport;
252 th->dest = sk->dport;
253 th->seq = htonl(tcb->seq);
254 th->ack_seq = htonl(tp->rcv_nxt);
255 *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
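/* The store above writes th->doff (header length in 32-bit words,
 * upper 4 bits) and the flag bits (lower byte) into the 16-bit word
 * at offset 12 of the TCP header in a single assignment.
 */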
256 if (tcb->flags & TCPCB_FLAG_SYN) {
257 /* RFC1323: The window in SYN & SYN/ACK segments
258 * is never scaled.
259 */
260 th->window = htons(tp->rcv_wnd);
261 } else {
262 th->window = htons(tcp_select_window(sk));
263 }
264 th->check = 0;
265 th->urg_ptr = 0;
266
267 if (tp->urg_mode &&
268 between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
269 th->urg_ptr = htons(tp->snd_up-tcb->seq);
270 th->urg = 1;
271 }
272
273 if (tcb->flags & TCPCB_FLAG_SYN) {
274 tcp_syn_build_options((__u32 *)(th + 1),
275 tcp_advertise_mss(sk),
276 (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
277 (sysctl_flags & SYSCTL_FLAG_SACK),
278 (sysctl_flags & SYSCTL_FLAG_WSCALE),
279 tp->rcv_wscale,
280 tcb->when,
281 tp->ts_recent);
282 } else {
283 tcp_build_and_update_options((__u32 *)(th + 1),
284 tp, tcb->when);
285
286 TCP_ECN_send(sk, tp, skb, tcp_header_size);
287 }
288 tp->af_specific->send_check(sk, th, skb->len, skb);
289
290 if (tcb->flags & TCPCB_FLAG_ACK)
291 tcp_event_ack_sent(sk);
292
293 if (skb->len != tcp_header_size)
294 tcp_event_data_sent(tp, skb);
295
296 TCP_INC_STATS(TcpOutSegs);
297
298 err = tp->af_specific->queue_xmit(skb, 0);
299 if (err <= 0)
300 return err;
301
302 tcp_enter_cwr(tp);
303
304 /* NET_XMIT_CN is special. It does not guarantee
305 * that this packet is lost. It tells us that the device
306 * is about to start dropping packets, or is already dropping
307 * some packets of the same priority, and asks us
308 * to send less aggressively.
309 */
310 return err == NET_XMIT_CN ? 0 : err;
311 }
312 return -ENOBUFS;
313 #undef SYSCTL_FLAG_TSTAMPS
314 #undef SYSCTL_FLAG_WSCALE
315 #undef SYSCTL_FLAG_SACK
316 }
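/* Header-size example (illustrative, the values follow from the option
 * lengths used above): a SYN with timestamps, window scaling and SACK
 * permitted enabled uses 20 (base header) + 4 (MSS) + 12 (aligned
 * timestamp) + 4 (aligned wscale) = 40 bytes; SACKPERM adds no extra
 * space because it rides in the place of the timestamp option's NOPs.
 */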
317
318
319 /* This is the main buffer sending routine. We queue the buffer
320 * and decide whether to queue or transmit now.
321 *
322 * NOTE: the probe0 timer is not checked; do not forget tcp_push_pending_frames(),
323 * otherwise the socket can stall.
324 */
325 void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
326 {
327 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
328
329 /* Advance write_seq and place onto the write_queue. */
330 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
331 __skb_queue_tail(&sk->write_queue, skb);
332 tcp_charge_skb(sk, skb);
333
334 if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
335 /* Send it out now. */
336 TCP_SKB_CB(skb)->when = tcp_time_stamp;
337 if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
338 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
339 tcp_minshall_update(tp, cur_mss, skb);
340 if (tp->packets_out++ == 0)
341 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
342 return;
343 }
344 }
345 /* Queue it, remembering where we must start sending. */
346 if (tp->send_head == NULL)
347 tp->send_head = skb;
348 }
349
350 /* Send the _single_ skb sitting at the send head. This function requires
351 * a true push of pending frames to set up the probe timer etc.
352 */
353 void tcp_push_one(struct sock *sk, unsigned cur_mss)
354 {
355 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
356 struct sk_buff *skb = tp->send_head;
357
358 if (tcp_snd_test(tp, skb, cur_mss, 1)) {
359 /* Send it out now. */
360 TCP_SKB_CB(skb)->when = tcp_time_stamp;
361 if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
362 tp->send_head = NULL;
363 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
364 if (tp->packets_out++ == 0)
365 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
366 return;
367 }
368 }
369 }
370
371 /* Split a fragmented skb into two parts at offset len. */
372
373 static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
374 {
375 int i;
376 int pos = skb->len - skb->data_len;
377
378 if (len < pos) {
379 /* Split line is inside header. */
380 memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len);
381
382 /* And move data appendix as is. */
383 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
384 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
385
386 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
387 skb_shinfo(skb)->nr_frags = 0;
388
389 skb1->data_len = skb->data_len;
390 skb1->len += skb1->data_len;
391 skb->data_len = 0;
392 skb->len = len;
393 skb->tail = skb->data+len;
394 } else {
395 int k = 0;
396 int nfrags = skb_shinfo(skb)->nr_frags;
397
398 /* Second chunk has no header, nothing to copy. */
399
400 skb_shinfo(skb)->nr_frags = 0;
401 skb1->len = skb1->data_len = skb->len - len;
402 skb->len = len;
403 skb->data_len = len - pos;
404
405 for (i=0; i<nfrags; i++) {
406 int size = skb_shinfo(skb)->frags[i].size;
407 if (pos + size > len) {
408 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
409
410 if (pos < len) {
411 /* Split the frag.
412 * We have two variants in this case:
413 * 1. Move the whole frag to the second
414 * part, if possible. F.e.
415 * this approach is mandatory for TUX,
416 * where splitting is expensive.
417 * 2. Split it accurately. This is what we do here.
418 */
419 get_page(skb_shinfo(skb)->frags[i].page);
420 skb_shinfo(skb1)->frags[0].page_offset += (len-pos);
421 skb_shinfo(skb1)->frags[0].size -= (len-pos);
422 skb_shinfo(skb)->frags[i].size = len-pos;
423 skb_shinfo(skb)->nr_frags++;
424 }
425 k++;
426 } else {
427 skb_shinfo(skb)->nr_frags++;
428 }
429 pos += size;
430 }
431 skb_shinfo(skb1)->nr_frags = k;
432 }
433 }
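/* Illustrative example (layout assumed): take an skb with 100 linear
 * bytes and one 1000-byte page frag (skb->len = 1100, pos = 100).
 * Splitting at len = 40 copies the remaining 60 header bytes plus the
 * frag reference into skb1 (header-split branch). Splitting at
 * len = 600 instead leaves the linear part alone and divides the frag:
 * the first 500 frag bytes stay in skb, the page is referenced a second
 * time, and skb1's copy starts 500 bytes into the same page.
 */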
434
435 /* Function to create two new TCP segments. Shrinks the given segment
436 * to the specified size and appends a new segment with the rest of the
437 * packet to the list. This won't be called frequently, I hope.
438 * Remember, these are still headerless SKBs at this point.
439 */
440 static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
441 {
442 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
443 struct sk_buff *buff;
444 int nsize = skb->len - len;
445 u16 flags;
446
447 if (skb_cloned(skb) &&
448 skb_is_nonlinear(skb) &&
449 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
450 return -ENOMEM;
451
452 /* Get a new skb... force flag on. */
453 buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
454 if (buff == NULL)
455 return -ENOMEM; /* We'll just try again later. */
456 tcp_charge_skb(sk, buff);
457
458 /* Correct the sequence numbers. */
459 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
460 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
461 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
462
463 /* PSH and FIN should only be set in the second packet. */
464 flags = TCP_SKB_CB(skb)->flags;
465 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
466 TCP_SKB_CB(buff)->flags = flags;
467 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
468 if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
469 tp->lost_out++;
470 tp->left_out++;
471 }
472 TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
473
474 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
475 /* Copy and checksum data tail into the new buffer. */
476 buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
477 nsize, 0);
478
479 skb_trim(skb, len);
480
481 skb->csum = csum_block_sub(skb->csum, buff->csum, len);
482 } else {
483 skb->ip_summed = CHECKSUM_HW;
484 skb_split(skb, buff, len);
485 }
486
487 buff->ip_summed = skb->ip_summed;
488
489 /* Looks stupid, but our code really uses the 'when' field of
490 * skbs which it has never sent before. --ANK
491 */
492 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
493
494 /* Link BUFF into the send queue. */
495 __skb_append(skb, buff);
496
497 return 0;
498 }
499
500 /* This function synchronizes snd mss to the current pmtu/exthdr set.
501
502 tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT account
503 for TCP options, but includes only the bare TCP header.
504
505 tp->mss_clamp is mss negotiated at connection setup.
506 It is the minimum of user_mss and the mss received with the SYN.
507 It also does not include TCP options.
508
509 tp->pmtu_cookie is last pmtu, seen by this function.
510
511 tp->mss_cache is current effective sending mss, including
512 all tcp options except for SACKs. It is evaluated,
513 taking into account current pmtu, but never exceeds
514 tp->mss_clamp.
515
516 NOTE1. rfc1122 clearly states that advertised MSS
517 DOES NOT include either tcp or ip options.
518
519 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
520 this function. --ANK (980731)
521 */
522
523 int tcp_sync_mss(struct sock *sk, u32 pmtu)
524 {
525 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
526 int mss_now;
527
528 /* Calculate base mss without TCP options:
529 It is MMS_S - sizeof(tcphdr) of rfc1122
530 */
531
532 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
533
534 /* Clamp it (mss_clamp does not include tcp options) */
535 if (mss_now > tp->mss_clamp)
536 mss_now = tp->mss_clamp;
537
538 /* Now subtract optional transport overhead */
539 mss_now -= tp->ext_header_len;
540
541 /* Then reserve room for full set of TCP options and 8 bytes of data */
542 if (mss_now < 48)
543 mss_now = 48;
544
545 /* Now subtract TCP options size, not including SACKs */
546 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
547
548 /* Bound mss with half of window */
549 if (tp->max_window && mss_now > (tp->max_window>>1))
550 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
551
552 /* And store cached results */
553 tp->pmtu_cookie = pmtu;
554 tp->mss_cache = mss_now;
555 return mss_now;
556 }
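/* Worked example (assumed values): for an IPv4 path with pmtu = 1500 and
 * no IP options, mss_now starts at 1500 - 20 - 20 = 1460. With no
 * extension headers and timestamps enabled (tcp_header_len = 20 + 12),
 * the final mss_cache becomes 1460 - 12 = 1448, provided mss_clamp and
 * the max_window/2 bound do not clip it further.
 */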
557
558
559 /* This routine writes packets to the network. It advances the
560 * send_head. This happens as incoming acks open up the remote
561 * window for us.
562 *
563 * Returns 1, if no segments are in flight and we have queued segments, but
564 * cannot send anything now because of SWS or another problem.
565 */
566 int tcp_write_xmit(struct sock *sk, int nonagle)
567 {
568 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
569 unsigned int mss_now;
570
571 /* If we are closed, the bytes will have to remain here.
572 * In time closedown will finish, we empty the write queue and all
573 * will be happy.
574 */
575 if(sk->state != TCP_CLOSE) {
576 struct sk_buff *skb;
577 int sent_pkts = 0;
578
579 /* Account for SACKS, we may need to fragment due to this.
580 * It is just like the real MSS changing on us midstream.
581 * We also handle things correctly when the user adds some
582 * IP options mid-stream. Silly to do, but cover it.
583 */
584 mss_now = tcp_current_mss(sk);
585
586 while((skb = tp->send_head) &&
587 tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : 1)) {
588 if (skb->len > mss_now) {
589 if (tcp_fragment(sk, skb, mss_now))
590 break;
591 }
592
593 TCP_SKB_CB(skb)->when = tcp_time_stamp;
594 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
595 break;
596 /* Advance the send_head. This one is sent out. */
597 update_send_head(sk, tp, skb);
598 tcp_minshall_update(tp, mss_now, skb);
599 sent_pkts = 1;
600 }
601
602 if (sent_pkts) {
603 tcp_cwnd_validate(sk, tp);
604 return 0;
605 }
606
607 return !tp->packets_out && tp->send_head;
608 }
609 return 0;
610 }
611
612 /* This function returns the amount that we can raise the
613 * usable window based on the following constraints
614 *
615 * 1. The window can never be shrunk once it is offered (RFC 793)
616 * 2. We limit memory per socket
617 *
618 * RFC 1122:
619 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
620 * RECV.NEXT + RCV.WIN fixed until:
621 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
622 *
623 * i.e. don't raise the right edge of the window until you can raise
624 * it at least MSS bytes.
625 *
626 * Unfortunately, the recommended algorithm breaks header prediction,
627 * since header prediction assumes th->window stays fixed.
628 *
629 * Strictly speaking, keeping th->window fixed violates the receiver
630 * side SWS prevention criteria. The problem is that under this rule
631 * a stream of single byte packets will cause the right side of the
632 * window to always advance by a single byte.
633 *
634 * Of course, if the sender implements sender side SWS prevention
635 * then this will not be a problem.
636 *
637 * BSD seems to make the following compromise:
638 *
639 * If the free space is less than the 1/4 of the maximum
640 * space available and the free space is less than 1/2 mss,
641 * then set the window to 0.
642 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
643 * Otherwise, just prevent the window from shrinking
644 * and from being larger than the largest representable value.
645 *
646 * This prevents incremental opening of the window in the regime
647 * where TCP is limited by the speed of the reader side taking
648 * data out of the TCP receive queue. It does nothing about
649 * those cases where the window is constrained on the sender side
650 * because the pipeline is full.
651 *
652 * BSD also seems to "accidentally" limit itself to windows that are a
653 * multiple of MSS, at least until the free space gets quite small.
654 * This would appear to be a side effect of the mbuf implementation.
655 * Combining these two algorithms results in the observed behavior
656 * of having a fixed window size at almost all times.
657 *
658 * Below we obtain similar behavior by forcing the offered window to
659 * a multiple of the mss when it is feasible to do so.
660 *
661 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
662 * Regular options like TIMESTAMP are taken into account.
663 */
664 u32 __tcp_select_window(struct sock *sk)
665 {
666 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
667 /* MSS for the peer's data. Previous versions used mss_clamp
668 * here. I don't know if a value based on our guess
669 * of the peer's MSS is better for performance. It's more correct,
670 * but may be worse for performance because of rcv_mss
671 * fluctuations. --SAW 1998/11/1
672 */
673 int mss = tp->ack.rcv_mss;
674 int free_space = tcp_space(sk);
675 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
676 int window;
677
678 if (mss > full_space)
679 mss = full_space;
680
681 if (free_space < full_space/2) {
682 tp->ack.quick = 0;
683
684 if (tcp_memory_pressure)
685 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
686
687 if (free_space < mss)
688 return 0;
689 }
690
691 if (free_space > tp->rcv_ssthresh)
692 free_space = tp->rcv_ssthresh;
693
694 /* Get the largest window that is a nice multiple of mss.
695 * Window clamp already applied above.
696 * If our current window offering is within 1 mss of the
697 * free space we just keep it. This prevents the divide
698 * and multiply from happening most of the time.
699 * We also don't do any window rounding when the free space
700 * is too small.
701 */
702 window = tp->rcv_wnd;
703 if (window <= free_space - mss || window > free_space)
704 window = (free_space/mss)*mss;
705 else if (mss == full_space &&
706 free_space > window + full_space/2)
707 window = free_space;
708
709 return window;
710 }
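/* Rounding example (assumed values): with mss = 1460, rcv_wnd = 8192 and
 * free_space = 10000, the current offer is more than one mss below the
 * free space (8192 <= 10000 - 1460), so the window is recomputed as
 * (10000 / 1460) * 1460 = 8760, a whole multiple of the mss. With
 * free_space = 9000 the old offer is within one mss of it and is kept
 * as-is, avoiding the divide and multiply.
 */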
711
712 /* Attempt to collapse two adjacent SKB's during retransmission. */
713 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
714 {
715 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
716 struct sk_buff *next_skb = skb->next;
717
718 /* The first test we must make is that neither of these two
719 * SKB's are still referenced by someone else.
720 */
721 if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
722 int skb_size = skb->len, next_skb_size = next_skb->len;
723 u16 flags = TCP_SKB_CB(skb)->flags;
724
725 /* Also punt if next skb has been SACK'd. */
726 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
727 return;
728
729 /* Next skb is out of window. */
730 if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
731 return;
732
733 /* Punt if not enough space exists in the first SKB for
734 * the data in the second, or the total combined payload
735 * would exceed the MSS.
736 */
737 if ((next_skb_size > skb_tailroom(skb)) ||
738 ((skb_size + next_skb_size) > mss_now))
739 return;
740
741 /* Ok. We will be able to collapse the packet. */
742 __skb_unlink(next_skb, next_skb->list);
743
744 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
745
746 if (next_skb->ip_summed == CHECKSUM_HW)
747 skb->ip_summed = CHECKSUM_HW;
748
749 if (skb->ip_summed != CHECKSUM_HW)
750 skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
751
752 /* Update sequence range on original skb. */
753 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
754
755 /* Merge over control information. */
756 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
757 TCP_SKB_CB(skb)->flags = flags;
758
759 /* All done, get rid of second SKB and account for it so
760 * packet counting does not break.
761 */
762 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
763 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
764 tp->retrans_out--;
765 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
766 tp->lost_out--;
767 tp->left_out--;
768 }
769 /* Reno case is special. Sigh... */
770 if (!tp->sack_ok && tp->sacked_out) {
771 tp->sacked_out--;
772 tp->left_out--;
773 }
774
775 /* Not quite right: it can be > snd.fack, but
776 * it is better to underestimate fackets.
777 */
778 if (tp->fackets_out)
779 tp->fackets_out--;
780 tcp_free_skb(sk, next_skb);
781 tp->packets_out--;
782 }
783 }
784
785 /* Do a simple retransmit without using the backoff mechanisms in
786 * tcp_timer. This is used for path mtu discovery.
787 * The socket is already locked here.
788 */
789 void tcp_simple_retransmit(struct sock *sk)
790 {
791 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
792 struct sk_buff *skb;
793 unsigned int mss = tcp_current_mss(sk);
794 int lost = 0;
795
796 for_retrans_queue(skb, sk, tp) {
797 if (skb->len > mss &&
798 !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
799 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
800 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
801 tp->retrans_out--;
802 }
803 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
804 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
805 tp->lost_out++;
806 lost = 1;
807 }
808 }
809 }
810
811 if (!lost)
812 return;
813
814 tcp_sync_left_out(tp);
815
816 /* Don't muck with the congestion window here.
817 * The reason is that we do not increase the amount of _data_
818 * in the network, but the units have changed and the effective
819 * cwnd/ssthresh are really reduced now.
820 */
821 if (tp->ca_state != TCP_CA_Loss) {
822 tp->high_seq = tp->snd_nxt;
823 tp->snd_ssthresh = tcp_current_ssthresh(tp);
824 tp->prior_ssthresh = 0;
825 tp->undo_marker = 0;
826 tcp_set_ca_state(tp, TCP_CA_Loss);
827 }
828 tcp_xmit_retransmit_queue(sk);
829 }
830
831 /* This retransmits one SKB. Policy decisions and retransmit queue
832 * state updates are done by the caller. Returns non-zero if an
833 * error occurred which prevented the send.
834 */
835 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
836 {
837 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
838 unsigned int cur_mss = tcp_current_mss(sk);
839 int err;
840
841 /* Do not send more than we queued. 1/4 is reserved for possible
842 * copying overhead: fragmentation, tunneling, mangling etc.
843 */
844 if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
845 return -EAGAIN;
846
847 /* If the receiver has shrunk his window, and skb is out of
848 * the new window, do not retransmit it. The exception is
849 * when the window is shrunk to zero, in which case
850 * our retransmit serves as a zero window probe.
851 */
852 if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
853 && TCP_SKB_CB(skb)->seq != tp->snd_una)
854 return -EAGAIN;
855
856 if(skb->len > cur_mss) {
857 if(tcp_fragment(sk, skb, cur_mss))
858 return -ENOMEM; /* We'll try again later. */
859
860 /* New SKB created, account for it. */
861 tp->packets_out++;
862 }
863
864 /* Collapse two adjacent packets if worthwhile and we can. */
865 if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
866 (skb->len < (cur_mss >> 1)) &&
867 (skb->next != tp->send_head) &&
868 (skb->next != (struct sk_buff *)&sk->write_queue) &&
869 (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
870 (sysctl_tcp_retrans_collapse != 0))
871 tcp_retrans_try_collapse(sk, skb, cur_mss);
872
873 if(tp->af_specific->rebuild_header(sk))
874 return -EHOSTUNREACH; /* Routing failure or similar. */
875
876 /* Some Solaris stacks overoptimize and ignore the FIN on a
877 * retransmit when old data is attached. So strip it off
878 * since it is cheap to do so and saves bytes on the network.
879 */
880 if(skb->len > 0 &&
881 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
882 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
883 if (!pskb_trim(skb, 0)) {
884 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
885 skb->ip_summed = CHECKSUM_NONE;
886 skb->csum = 0;
887 }
888 }
889
890 /* Make a copy, if the first transmission SKB clone we made
891 * is still in somebody's hands, else make a clone.
892 */
893 TCP_SKB_CB(skb)->when = tcp_time_stamp;
894
895 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
896 pskb_copy(skb, GFP_ATOMIC):
897 skb_clone(skb, GFP_ATOMIC)));
898
899 if (err == 0) {
900 /* Update global TCP statistics. */
901 TCP_INC_STATS(TcpRetransSegs);
902
903 #if FASTRETRANS_DEBUG > 0
904 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
905 if (net_ratelimit())
906 printk(KERN_DEBUG "retrans_out leaked.\n");
907 }
908 #endif
909 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
910 tp->retrans_out++;
911
912 /* Save stamp of the first retransmit. */
913 if (!tp->retrans_stamp)
914 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
915
916 tp->undo_retrans++;
917
918 /* snd_nxt is stored to detect loss of retransmitted segment,
919 * see tcp_input.c tcp_sacktag_write_queue().
920 */
921 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
922 }
923 return err;
924 }
925
926 /* This gets called after a retransmit timeout, and the initially
927 * retransmitted data is acknowledged. It tries to continue
928 * resending the rest of the retransmit queue, until either
929 * we've sent it all or the congestion window limit is reached.
930 * If doing SACK, the first ACK which comes back for a timeout
931 * based retransmit packet might feed us FACK information again.
932 * If so, we use it to avoid unnecessary retransmissions.
933 */
934 void tcp_xmit_retransmit_queue(struct sock *sk)
935 {
936 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
937 struct sk_buff *skb;
938 int packet_cnt = tp->lost_out;
939
940 /* First pass: retransmit lost packets. */
941 if (packet_cnt) {
942 for_retrans_queue(skb, sk, tp) {
943 __u8 sacked = TCP_SKB_CB(skb)->sacked;
944
945 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
946 return;
947
948 if (sacked&TCPCB_LOST) {
949 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
950 if (tcp_retransmit_skb(sk, skb))
951 return;
952 if (tp->ca_state != TCP_CA_Loss)
953 NET_INC_STATS_BH(TCPFastRetrans);
954 else
955 NET_INC_STATS_BH(TCPSlowStartRetrans);
956
957 if (skb == skb_peek(&sk->write_queue))
958 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
959 }
960
961 if (--packet_cnt <= 0)
962 break;
963 }
964 }
965 }
966
967 /* OK, demanded retransmission is finished. */
968
969 /* Forward retransmissions are possible only during Recovery. */
970 if (tp->ca_state != TCP_CA_Recovery)
971 return;
972
973 /* No forward retransmissions in Reno are possible. */
974 if (!tp->sack_ok)
975 return;
976
977 /* Yeah, we have to make a difficult choice between forward transmission
978 * and retransmission... Both ways have their merits...
979 *
980 * For now we do not retransmit anything while we have some new
981 * segments to send.
982 */
983
984 if (tcp_may_send_now(sk, tp))
985 return;
986
987 packet_cnt = 0;
988
989 for_retrans_queue(skb, sk, tp) {
990 if(++packet_cnt > tp->fackets_out)
991 break;
992
993 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
994 break;
995
996 if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
997 continue;
998
999 /* Ok, retransmit it. */
1000 if(tcp_retransmit_skb(sk, skb))
1001 break;
1002
1003 if (skb == skb_peek(&sk->write_queue))
1004 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1005
1006 NET_INC_STATS_BH(TCPForwardRetrans);
1007 }
1008 }
1009
1010
1011 /* Send a FIN. The caller locks the socket for us. This must not be
1012 * allowed to fail to queue a FIN frame under any circumstances.
1013 */
1014 void tcp_send_fin(struct sock *sk)
1015 {
1016 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1017 struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
1018 unsigned int mss_now;
1019
1020 /* Optimization, tack on the FIN if we have a queue of
1021 * unsent frames. But be careful about outgoing SACKS
1022 * and IP options.
1023 */
1024 mss_now = tcp_current_mss(sk);
1025
1026 if(tp->send_head != NULL) {
1027 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
1028 TCP_SKB_CB(skb)->end_seq++;
1029 tp->write_seq++;
1030 } else {
1031 /* Socket is locked, keep trying until memory is available. */
1032 for (;;) {
1033 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
1034 if (skb)
1035 break;
1036 yield();
1037 }
1038
1039 /* Reserve space for headers and prepare control bits. */
1040 skb_reserve(skb, MAX_TCP_HEADER);
1041 skb->csum = 0;
1042 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
1043 TCP_SKB_CB(skb)->sacked = 0;
1044
1045 /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
1046 TCP_SKB_CB(skb)->seq = tp->write_seq;
1047 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1048 tcp_send_skb(sk, skb, 1, mss_now);
1049 }
1050 __tcp_push_pending_frames(sk, tp, mss_now, 1);
1051 }
1052
1053 /* We get here when a process closes a file descriptor (either due to
1054 * an explicit close() or as a byproduct of exit()'ing) and there
1055 * was unread data in the receive queue. This behavior is recommended
1056 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1057 */
1058 void tcp_send_active_reset(struct sock *sk, int priority)
1059 {
1060 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1061 struct sk_buff *skb;
1062
1063 /* NOTE: No TCP options attached and we never retransmit this. */
1064 skb = alloc_skb(MAX_TCP_HEADER, priority);
1065 if (!skb) {
1066 NET_INC_STATS(TCPAbortFailed);
1067 return;
1068 }
1069
1070 /* Reserve space for headers and prepare control bits. */
1071 skb_reserve(skb, MAX_TCP_HEADER);
1072 skb->csum = 0;
1073 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
1074 TCP_SKB_CB(skb)->sacked = 0;
1075
1076 /* Send it off. */
1077 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
1078 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1079 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1080 if (tcp_transmit_skb(sk, skb))
1081 NET_INC_STATS(TCPAbortFailed);
1082 }
1083
1084 /* WARNING: This routine must only be called when we have already sent
1085 * a SYN packet that crossed the incoming SYN that caused this routine
1086 * to get called. If this assumption fails then the initial rcv_wnd
1087 * and rcv_wscale values will not be correct.
1088 */
1089 int tcp_send_synack(struct sock *sk)
1090 {
1091 struct sk_buff* skb;
1092
1093 skb = skb_peek(&sk->write_queue);
1094 if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
1095 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
1096 return -EFAULT;
1097 }
1098 if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
1099 if (skb_cloned(skb)) {
1100 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
1101 if (nskb == NULL)
1102 return -ENOMEM;
1103 __skb_unlink(skb, &sk->write_queue);
1104 __skb_queue_head(&sk->write_queue, nskb);
1105 tcp_free_skb(sk, skb);
1106 tcp_charge_skb(sk, nskb);
1107 skb = nskb;
1108 }
1109
1110 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
1111 TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
1112 }
1113 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1114 return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1115 }
1116
1117 /*
1118 * Prepare a SYN-ACK.
1119 */
1120 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1121 struct open_request *req)
1122 {
1123 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1124 struct tcphdr *th;
1125 int tcp_header_size;
1126 struct sk_buff *skb;
1127
1128 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
1129 if (skb == NULL)
1130 return NULL;
1131
1132 /* Reserve space for headers. */
1133 skb_reserve(skb, MAX_TCP_HEADER);
1134
1135 skb->dst = dst_clone(dst);
1136
1137 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1138 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1139 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1140 /* SACK_PERM is in the place of NOP NOP of TS */
1141 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1142 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1143
1144 memset(th, 0, sizeof(struct tcphdr));
1145 th->syn = 1;
1146 th->ack = 1;
1147 TCP_ECN_make_synack(req, th);
1148 th->source = sk->sport;
1149 th->dest = req->rmt_port;
1150 TCP_SKB_CB(skb)->seq = req->snt_isn;
1151 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1152 th->seq = htonl(TCP_SKB_CB(skb)->seq);
1153 th->ack_seq = htonl(req->rcv_isn + 1);
1154 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1155 __u8 rcv_wscale;
1156 /* Set this up on the first call only */
1157 req->window_clamp = tp->window_clamp ? : dst->window;
1158 /* tcp_full_space because it is guaranteed to be the first packet */
1159 tcp_select_initial_window(tcp_full_space(sk),
1160 dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1161 &req->rcv_wnd,
1162 &req->window_clamp,
1163 req->wscale_ok,
1164 &rcv_wscale);
1165 req->rcv_wscale = rcv_wscale;
1166 }
1167
1168 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1169 th->window = htons(req->rcv_wnd);
1170
1171 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1172 tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
1173 req->sack_ok, req->wscale_ok, req->rcv_wscale,
1174 TCP_SKB_CB(skb)->when,
1175 req->ts_recent);
1176
1177 skb->csum = 0;
1178 th->doff = (tcp_header_size >> 2);
1179 TCP_INC_STATS(TcpOutSegs);
1180 return skb;
1181 }
1182
1183 /*
1184 * Do all connect socket setups that can be done AF independent.
1185 */
1186 static inline void tcp_connect_init(struct sock *sk)
1187 {
1188 struct dst_entry *dst = __sk_dst_get(sk);
1189 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1190
1191 /* We'll fix this up when we get a response from the other end.
1192 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
1193 */
1194 tp->tcp_header_len = sizeof(struct tcphdr) +
1195 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
1196
1197 /* If the user gave us a TCP_MAXSEG, record it as the clamp */
1198 if (tp->user_mss)
1199 tp->mss_clamp = tp->user_mss;
1200 tp->max_window = 0;
1201 tcp_sync_mss(sk, dst->pmtu);
1202
1203 if (!tp->window_clamp)
1204 tp->window_clamp = dst->window;
1205 tp->advmss = dst->advmss;
1206 tcp_initialize_rcv_mss(sk);
1207 tcp_ca_init(tp);
1208
1209 tcp_select_initial_window(tcp_full_space(sk),
1210 tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
1211 &tp->rcv_wnd,
1212 &tp->window_clamp,
1213 sysctl_tcp_window_scaling,
1214 &tp->rcv_wscale);
1215
1216 tp->rcv_ssthresh = tp->rcv_wnd;
1217
1218 sk->err = 0;
1219 sk->done = 0;
1220 tp->snd_wnd = 0;
1221 tcp_init_wl(tp, tp->write_seq, 0);
1222 tp->snd_una = tp->write_seq;
1223 tp->snd_sml = tp->write_seq;
1224 tp->rcv_nxt = 0;
1225 tp->rcv_wup = 0;
1226 tp->copied_seq = 0;
1227
1228 tp->rto = TCP_TIMEOUT_INIT;
1229 tp->retransmits = 0;
1230 tcp_clear_retrans(tp);
1231 }
1232
1233 /*
1234 * Build a SYN and send it off.
1235 */
1236 int tcp_connect(struct sock *sk)
1237 {
1238 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1239 struct sk_buff *buff;
1240
1241 tcp_connect_init(sk);
1242
1243 buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
1244 if (unlikely(buff == NULL))
1245 return -ENOBUFS;
1246
1247 /* Reserve space for headers. */
1248 skb_reserve(buff, MAX_TCP_HEADER);
1249
1250 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
1251 TCP_ECN_send_syn(tp, buff);
1252 TCP_SKB_CB(buff)->sacked = 0;
1253 buff->csum = 0;
1254 TCP_SKB_CB(buff)->seq = tp->write_seq++;
1255 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1256 tp->snd_nxt = tp->write_seq;
1257 tp->pushed_seq = tp->write_seq;
1258 tcp_ca_init(tp);
1259
1260 /* Send it off. */
1261 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1262 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
1263 __skb_queue_tail(&sk->write_queue, buff);
1264 tcp_charge_skb(sk, buff);
1265 tp->packets_out++;
1266 tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
1267 TCP_INC_STATS(TcpActiveOpens);
1268
1269 /* Timer for repeating the SYN until an answer. */
1270 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1271 return 0;
1272 }
1273
1274 /* Send out a delayed ack, the caller does the policy checking
1275 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
1276 * for details.
1277 */
1278 void tcp_send_delayed_ack(struct sock *sk)
1279 {
1280 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1281 int ato = tp->ack.ato;
1282 unsigned long timeout;
1283
1284 if (ato > TCP_DELACK_MIN) {
1285 int max_ato = HZ/2;
1286
1287 if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
1288 max_ato = TCP_DELACK_MAX;
1289
1290 /* Slow path, intersegment interval is "high". */
1291
1292 /* If some rtt estimate is known, use it to bound delayed ack.
1293 * Do not use tp->rto here, use results of rtt measurements
1294 * directly.
1295 */
1296 if (tp->srtt) {
1297 int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
1298
1299 if (rtt < max_ato)
1300 max_ato = rtt;
1301 }
1302
1303 ato = min(ato, max_ato);
1304 }
1305
1306 /* Stay within the limit we were given */
1307 timeout = jiffies + ato;
1308
1309 /* Use the new timeout only if there wasn't an older one already. */
1310 if (tp->ack.pending&TCP_ACK_TIMER) {
1311 /* If delack timer was blocked or is about to expire,
1312 * send ACK now.
1313 */
1314 if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
1315 tcp_send_ack(sk);
1316 return;
1317 }
1318
1319 if (!time_before(timeout, tp->ack.timeout))
1320 timeout = tp->ack.timeout;
1321 }
1322 tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
1323 tp->ack.timeout = timeout;
1324 if (!mod_timer(&tp->delack_timer, timeout))
1325 sock_hold(sk);
1326 }
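/* Timing example (assumed values, HZ = 1000): if ack.ato is 200ms and the
 * smoothed RTT (srtt >> 3) is 50ms, max_ato drops from HZ/2 to 50ms and
 * the delayed ACK is scheduled 50ms out. If a timer is already pending
 * and due within ato/4 of now, the ACK is sent immediately instead of
 * being rescheduled.
 */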
1327
1328 /* This routine sends an ack and also updates the window. */
1329 void tcp_send_ack(struct sock *sk)
1330 {
1331 /* If we have been reset, we may not send again. */
1332 if(sk->state != TCP_CLOSE) {
1333 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1334 struct sk_buff *buff;
1335
1336 /* We are not putting this on the write queue, so
1337 * tcp_transmit_skb() will set the ownership to this
1338 * sock.
1339 */
1340 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1341 if (buff == NULL) {
1342 tcp_schedule_ack(tp);
1343 tp->ack.ato = TCP_ATO_MIN;
1344 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
1345 return;
1346 }
1347
1348 /* Reserve space for headers and prepare control bits. */
1349 skb_reserve(buff, MAX_TCP_HEADER);
1350 buff->csum = 0;
1351 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1352 TCP_SKB_CB(buff)->sacked = 0;
1353
1354 /* Send it off, this clears delayed acks for us. */
1355 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
1356 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1357 tcp_transmit_skb(sk, buff);
1358 }
1359 }
1360
1361 /* This routine sends a packet with an out of date sequence
1362 * number. It assumes the other end will try to ack it.
1363 *
1364 * Question: what should we do in urgent mode?
1365 * 4.4BSD forces sending a single byte of data. We cannot send
1366 * out-of-window data, because we have SND.NXT==SND.MAX...
1367 *
1368 * Current solution: send TWO zero-length segments in urgent mode:
1369 * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, another
1370 * out-of-date with SND.UNA-1 to probe the window.
1371 */
1372 static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
1373 {
1374 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1375 struct sk_buff *skb;
1376
1377 /* We don't queue it, tcp_transmit_skb() sets ownership. */
1378 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1379 if (skb == NULL)
1380 return -1;
1381
1382 /* Reserve space for headers and set control bits. */
1383 skb_reserve(skb, MAX_TCP_HEADER);
1384 skb->csum = 0;
1385 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1386 TCP_SKB_CB(skb)->sacked = urgent;
1387
1388 /* Use a previous sequence. This should cause the other
1389 * end to send an ack. Don't queue or clone SKB, just
1390 * send it.
1391 */
1392 TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
1393 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1394 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1395 return tcp_transmit_skb(sk, skb);
1396 }
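/* Note (illustrative): the non-urgent probe uses snd_una - 1, a sequence
 * number the peer has already acknowledged, so the segment carries no
 * acceptable data but still forces the peer to reply with an ACK that
 * reports its current window. The urgent-mode variant uses snd_una itself
 * so the urgent pointer can be delivered.
 */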
1397
1398 int tcp_write_wakeup(struct sock *sk)
1399 {
1400 if (sk->state != TCP_CLOSE) {
1401 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1402 struct sk_buff *skb;
1403
1404 if ((skb = tp->send_head) != NULL &&
1405 before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
1406 int err;
1407 int mss = tcp_current_mss(sk);
1408 int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
1409
1410 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1411 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1412
1413 /* We are probing the opening of a window
1414 * but the window size is != 0;
1415 * this must have been the result of (sender-side) SWS avoidance.
1416 */
1417 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
1418 skb->len > mss) {
1419 seg_size = min(seg_size, mss);
1420 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1421 if (tcp_fragment(sk, skb, seg_size))
1422 return -1;
1423 }
1424 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1425 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1426 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1427 if (!err) {
1428 update_send_head(sk, tp, skb);
1429 }
1430 return err;
1431 } else {
1432 if (tp->urg_mode &&
1433 between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
1434 tcp_xmit_probe_skb(sk, TCPCB_URG);
1435 return tcp_xmit_probe_skb(sk, 0);
1436 }
1437 }
1438 return -1;
1439 }
1440
1441 /* A window probe timeout has occurred. If the window is not closed, send
1442 * a partial packet, else a zero window probe.
1443 */
1444 void tcp_send_probe0(struct sock *sk)
1445 {
1446 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1447 int err;
1448
1449 err = tcp_write_wakeup(sk);
1450
1451 if (tp->packets_out || !tp->send_head) {
1452 /* Cancel probe timer, if it is not required. */
1453 tp->probes_out = 0;
1454 tp->backoff = 0;
1455 return;
1456 }
1457
1458 if (err <= 0) {
1459 if (tp->backoff < sysctl_tcp_retries2)
1460 tp->backoff++;
1461 tp->probes_out++;
1462 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1463 min(tp->rto << tp->backoff, TCP_RTO_MAX));
1464 } else {
1465 /* If the packet was not sent due to local congestion,
1466 * do not back off and do not remember probes_out.
1467 * Let local senders fight for local resources.
1468 *
1469 * Still use the accumulated backoff, though.
1470 */
1471 if (!tp->probes_out)
1472 tp->probes_out=1;
1473 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1474 min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
1475 }
1476 }
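/* Backoff example (assumed values): each failed zero-window probe doubles
 * the interval to the next one (rto << backoff), e.g. 200ms, 400ms, 800ms
 * with rto = 200ms, capped at TCP_RTO_MAX; tp->backoff stops growing once
 * it reaches sysctl_tcp_retries2.
 */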
1477