tcp_output.c source code [src/src/sys/netinet/tcp_output.c]

1	/*	$NetBSD: tcp_output.c,v 1.186 2016/06/10 13:27:16 ozaki-r Exp $	*/
2
3	/*
4	* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5	* All rights reserved.
6	*
7	* Redistribution and use in source and binary forms, with or without
8	* modification, are permitted provided that the following conditions
9	* are met:
10	* 1. Redistributions of source code must retain the above copyright
11	* notice, this list of conditions and the following disclaimer.
12	* 2. Redistributions in binary form must reproduce the above copyright
13	* notice, this list of conditions and the following disclaimer in the
14	* documentation and/or other materials provided with the distribution.
15	* 3. Neither the name of the project nor the names of its contributors
16	* may be used to endorse or promote products derived from this software
17	* without specific prior written permission.
18	*
19	* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22	* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29	* SUCH DAMAGE.
30	*/
31
32	/*
33	* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
34	*
35	* NRL grants permission for redistribution and use in source and binary
36	* forms, with or without modification, of the software and documentation
37	* created at NRL provided that the following conditions are met:
38	*
39	* 1. Redistributions of source code must retain the above copyright
40	* notice, this list of conditions and the following disclaimer.
41	* 2. Redistributions in binary form must reproduce the above copyright
42	* notice, this list of conditions and the following disclaimer in the
43	* documentation and/or other materials provided with the distribution.
44	* 3. All advertising materials mentioning features or use of this software
45	* must display the following acknowledgements:
46	* This product includes software developed by the University of
47	* California, Berkeley and its contributors.
48	* This product includes software developed at the Information
49	* Technology Division, US Naval Research Laboratory.
50	* 4. Neither the name of the NRL nor the names of its contributors
51	* may be used to endorse or promote products derived from this software
52	* without specific prior written permission.
53	*
54	* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55	* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57	* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
58	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62	* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65	*
66	* The views and conclusions contained in the software and documentation
67	* are those of the authors and should not be interpreted as representing
68	* official policies, either expressed or implied, of the US Naval
69	* Research Laboratory (NRL).
70	*/
71
72	/*-
73	* Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc.
74	* All rights reserved.
75	*
76	* This code is derived from software contributed to The NetBSD Foundation
77	* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
78	* Facility, NASA Ames Research Center.
79	* This code is derived from software contributed to The NetBSD Foundation
80	* by Charles M. Hannum.
81	* This code is derived from software contributed to The NetBSD Foundation
82	* by Rui Paulo.
83	*
84	* Redistribution and use in source and binary forms, with or without
85	* modification, are permitted provided that the following conditions
86	* are met:
87	* 1. Redistributions of source code must retain the above copyright
88	* notice, this list of conditions and the following disclaimer.
89	* 2. Redistributions in binary form must reproduce the above copyright
90	* notice, this list of conditions and the following disclaimer in the
91	* documentation and/or other materials provided with the distribution.
92	*
93	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
94	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
95	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
96	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
97	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
98	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
99	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
100	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
101	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
103	* POSSIBILITY OF SUCH DAMAGE.
104	*/
105
106	/*
107	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
108	* The Regents of the University of California. All rights reserved.
109	*
110	* Redistribution and use in source and binary forms, with or without
111	* modification, are permitted provided that the following conditions
112	* are met:
113	* 1. Redistributions of source code must retain the above copyright
114	* notice, this list of conditions and the following disclaimer.
115	* 2. Redistributions in binary form must reproduce the above copyright
116	* notice, this list of conditions and the following disclaimer in the
117	* documentation and/or other materials provided with the distribution.
118	* 3. Neither the name of the University nor the names of its contributors
119	* may be used to endorse or promote products derived from this software
120	* without specific prior written permission.
121	*
122	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
123	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
124	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
125	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
126	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
127	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
128	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
129	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
130	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
131	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
132	* SUCH DAMAGE.
133	*
134	* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
135	*/
136
137	#include <sys/cdefs.h>
138	__KERNEL_RCSID(`0`, "$NetBSD: tcp_output.c,v 1.186 2016/06/10 13:27:16 ozaki-r Exp $");
139
140	#ifdef _KERNEL_OPT
141	#include "opt_inet.h"
142	#include "opt_ipsec.h"
143	#include "opt_tcp_debug.h"
144	#endif
145
146	#include <sys/param.h>
147	#include <sys/systm.h>
148	#include <sys/mbuf.h>
149	#include <sys/protosw.h>
150	#include <sys/socket.h>
151	#include <sys/socketvar.h>
152	#include <sys/errno.h>
153	#include <sys/domain.h>
154	#include <sys/kernel.h>
155	#ifdef TCP_SIGNATURE
156	#include <sys/md5.h>
157	#endif
158
159	#include <net/if.h>
160	#include <net/route.h>
161
162	#include <netinet/in.h>
163	#include <netinet/in_systm.h>
164	#include <netinet/ip.h>
165	#include <netinet/in_pcb.h>
166	#include <netinet/ip_var.h>
167
168	#ifdef INET6
169	#ifndef INET
170	#include <netinet/in.h>
171	#endif
172	#include <netinet/ip6.h>
173	#include <netinet6/in6_var.h>
174	#include <netinet6/ip6_var.h>
175	#include <netinet6/in6_pcb.h>
176	#include <netinet6/nd6.h>
177	#endif
178
179	#ifdef IPSEC
180	#include <netipsec/ipsec.h>
181	#include <netipsec/key.h>
182	#ifdef INET6
183	#include <netipsec/ipsec6.h>
184	#endif
185	#endif /* IPSEC*/
186
187	#include <netinet/tcp.h>
188	#define TCPOUTFLAGS
189	#include <netinet/tcp_fsm.h>
190	#include <netinet/tcp_seq.h>
191	#include <netinet/tcp_timer.h>
192	#include <netinet/tcp_var.h>
193	#include <netinet/tcp_private.h>
194	#include <netinet/tcp_congctl.h>
195	#include <netinet/tcpip.h>
196	#include <netinet/tcp_debug.h>
197	#include <netinet/in_offload.h>
198	#include <netinet6/in6_offload.h>
199
200	#ifdef notyet
201	extern struct mbuf *m_copypack();
202	#endif
203
/*
 * Knob to enable Congestion Window Monitoring, and control
 * the burst size it allows.  Default burst is 4 packets, per
 * the Internet draft.
 */
int	tcp_cwm = 0;
int	tcp_cwm_burstsize = 4;

/*
 * Knobs for automatic send buffer sizing, consulted by tcp_output():
 *   tcp_do_autosndbuf	non-zero enables growing sockets that have
 *			SB_AUTOSIZE set on their send buffer;
 *   tcp_autosndbuf_inc	number of bytes added to the send buffer
 *			high-water mark per growth step;
 *   tcp_autosndbuf_max	upper bound for the automatically grown size.
 */
int	tcp_do_autosndbuf = 1;
int	tcp_autosndbuf_inc = 8 * 1024;
int	tcp_autosndbuf_max = 256 * 1024;
215
/*
 * Optional event counters for the output path.
 *
 * When TCP_OUTPUT_COUNTERS is defined, TCP_OUTPUT_COUNTER_INCR(ev) bumps
 * the ev_count member of the event counter that ev points to.  Otherwise
 * it expands to a no-op that never evaluates its argument, so the counter
 * objects need not exist.  Both variants are complete, parenthesized
 * expressions and are therefore safe in any statement or expression
 * context (e.g. as the unbraced body of an if/else).
 */
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>

extern struct evcnt tcp_output_bigheader;
extern struct evcnt tcp_output_predict_hit;
extern struct evcnt tcp_output_predict_miss;
extern struct evcnt tcp_output_copysmall;
extern struct evcnt tcp_output_copybig;
extern struct evcnt tcp_output_refbig;

#define	TCP_OUTPUT_COUNTER_INCR(ev)	((ev)->ev_count++)
#else

#define	TCP_OUTPUT_COUNTER_INCR(ev)	((void)0)	/* nothing */

#endif /* TCP_OUTPUT_COUNTERS */
232
233	static
234	#ifndef GPROF
235	inline
236	#endif
237	int
238	tcp_segsize(struct tcpcb tp, int* txsegsizep, int* *rxsegsizep,
239	bool *alwaysfragp)
240	{
241	#ifdef INET
242	struct inpcb *inp = tp->t_inpcb;
243	#endif
244	#ifdef INET6
245	struct in6pcb *in6p = tp->t_in6pcb;
246	#endif
247	struct socket *so = NULL;
248	struct rtentry *rt;
249	struct ifnet *ifp;
250	int size;
251	int hdrlen;
252	int optlen;
253
254	*alwaysfragp = false;
255
256	#ifdef DIAGNOSTIC
257	if (tp->t_inpcb && tp->t_in6pcb)
258	panic("tcp_segsize: both t_inpcb and t_in6pcb are set");
259	#endif
260	switch (tp->t_family) {
261	#ifdef INET
262	case AF_INET:
263	hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
264	break;
265	#endif
266	#ifdef INET6
267	case AF_INET6:
268	hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
269	break;
270	#endif
271	default:
272	size = tcp_mssdflt;
273	goto out;
274	}
275
276	rt = NULL;
277	#ifdef INET
278	if (inp) {
279	rt = in_pcbrtentry(inp);
280	so = inp->inp_socket;
281	}
282	#endif
283	#ifdef INET6
284	if (in6p) {
285	rt = in6_pcbrtentry(in6p);
286	so = in6p->in6p_socket;
287	}
288	#endif
289	if (rt == NULL) {
290	size = tcp_mssdflt;
291	goto out;
292	}
293
294	ifp = rt->rt_ifp;
295
296	size = tcp_mssdflt;
297	if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != `0`) {
298	#ifdef INET6
299	if (in6p && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
300	/*
301	* RFC2460 section 5, last paragraph: if path MTU is
302	* smaller than 1280, use 1280 as packet size and
303	* attach fragment header.
304	*/
305	size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag);
306	*alwaysfragp = true;
307	} else
308	size = rt->rt_rmx.rmx_mtu - hdrlen;
309	#else
310	size = rt->rt_rmx.rmx_mtu - hdrlen;
311	#endif
312	} else if (ifp->if_flags & IFF_LOOPBACK)
313	size = ifp->if_mtu - hdrlen;
314	#ifdef INET
315	else if (inp && tp->t_mtudisc)
316	size = ifp->if_mtu - hdrlen;
317	else if (inp && in_localaddr(inp->inp_faddr))
318	size = ifp->if_mtu - hdrlen;
319	#endif
320	#ifdef INET6
321	else if (in6p) {
322	#ifdef INET
323	if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) {
324	/ mapped addr case /
325	struct in_addr d;
326	bcopy(&in6p->in6p_faddr.s6_addr32[`3`], &d, sizeof(d));
327	if (tp->t_mtudisc \|\| in_localaddr(d))
328	size = ifp->if_mtu - hdrlen;
329	} else
330	#endif
331	{
332	/*
333	* for IPv6, path MTU discovery is always turned on,
334	* or the node must use packet size <= 1280.
335	*/
336	size = tp->t_mtudisc ? IN6_LINKMTU(ifp) : IPV6_MMTU;
337	size -= hdrlen;
338	}
339	}
340	#endif
341	out:
342	/*
343	* Now we must make room for whatever extra TCP/IP options are in
344	* the packet.
345	*/
346	optlen = tcp_optlen(tp);
347
348	/*
349	* XXX tp->t_ourmss should have the right size, but without this code
350	* fragmentation will occur... need more investigation
351	*/
352	#ifdef INET
353	if (inp) {
354	#if defined(IPSEC)
355	if (ipsec_used &&
356	!IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND))
357	optlen += ipsec4_hdrsiz_tcp(tp);
358	#endif
359	optlen += ip_optlen(inp);
360	}
361	#endif
362	#ifdef INET6
363	#ifdef INET
364	if (in6p && tp->t_family == AF_INET) {
365	#if defined(IPSEC)
366	if (ipsec_used &&
367	!IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
368	optlen += ipsec4_hdrsiz_tcp(tp);
369	#endif
370	/ XXX size -= ip_optlen(in6p); /
371	} else
372	#endif
373	if (in6p && tp->t_family == AF_INET6) {
374	#if defined(IPSEC)
375	if (ipsec_used &&
376	!IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
377	optlen += ipsec6_hdrsiz_tcp(tp);
378	#endif
379	optlen += ip6_optlen(in6p);
380	}
381	#endif
382	size -= optlen;
383
384	/ there may not be any room for data if mtu is too small /
385	if (size < `0`)
386	return (EMSGSIZE);
387
388	/*
389	* rxsegsizep holds estimated* inbound segment size (estimation
390	* assumes that path MTU is the same for both ways). this is only
391	* for silly window avoidance, do not use the value for other purposes.
392	*
393	* ipseclen is subtracted from both sides, this may not be right.
394	* I'm not quite sure about this (could someone comment).
395	*/
396	*txsegsizep = min(tp->t_peermss - optlen, size);
397	/*
398	* Never send more than half a buffer full. This insures that we can
399	* always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
400	* therefore acks will never be delayed unless we run out of data to
401	* transmit.
402	*/
403	if (so)
404	txsegsizep = min(so->so_snd.sb_hiwat >> `1`, txsegsizep);
405	*rxsegsizep = min(tp->t_ourmss - optlen, size);
406
407	if (*txsegsizep != tp->t_segsz) {
408	/*
409	* If the new segment size is larger, we don't want to
410	* mess up the congestion window, but if it is smaller
411	* we'll have to reduce the congestion window to ensure
412	* that we don't get into trouble with initial windows
413	* and the rest. In any case, if the segment size
414	* has changed, chances are the path has, too, and
415	* our congestion window will be different.
416	*/
417	if (*txsegsizep < tp->t_segsz) {
418	tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz)
419	* txsegsizep, txsegsizep);
420	tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz)
421	* txsegsizep, txsegsizep);
422	}
423	tp->t_segsz = *txsegsizep;
424	}
425
426	return (`0`);
427	}
428
429	static
430	#ifndef GPROF
431	inline
432	#endif
433	int
434	tcp_build_datapkt(struct tcpcb tp, struct* socket so, int* off,
435	long len, int hdrlen, struct mbuf **mp)
436	{
437	struct mbuf m, m0;
438	uint64_t *tcps;
439
440	tcps = TCP_STAT_GETREF();
441	if (tp->t_force && len == `1`)
442	tcps[TCP_STAT_SNDPROBE]++;
443	else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
444	tp->t_sndrexmitpack++;
445	tcps[TCP_STAT_SNDREXMITPACK]++;
446	tcps[TCP_STAT_SNDREXMITBYTE] += len;
447	} else {
448	tcps[TCP_STAT_SNDPACK]++;
449	tcps[TCP_STAT_SNDBYTE] += len;
450	}
451	TCP_STAT_PUTREF();
452	#ifdef notyet
453	if ((m = m_copypack(so->so_snd.sb_mb, off,
454	(int)len, max_linkhdr + hdrlen)) == `0`)
455	return (ENOBUFS);
456	/*
457	* m_copypack left space for our hdr; use it.
458	*/
459	m->m_len += hdrlen;
460	m->m_data -= hdrlen;
461	#else
462	MGETHDR(m, M_DONTWAIT, MT_HEADER);
463	if (__predict_false(m == NULL))
464	return (ENOBUFS);
465	MCLAIM(m, &tcp_tx_mowner);
466
467	/*
468	* XXX Because other code assumes headers will fit in
469	* XXX one header mbuf.
470	*
471	* (This code should almost never be run.)
472	*/
473	if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) {
474	TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader);
475	MCLGET(m, M_DONTWAIT);
476	if ((m->m_flags & M_EXT) == `0`) {
477	m_freem(m);
478	return (ENOBUFS);
479	}
480	}
481
482	m->m_data += max_linkhdr;
483	m->m_len = hdrlen;
484
485	/*
486	* To avoid traversing the whole sb_mb chain for correct
487	* data to send, remember last sent mbuf, its offset and
488	* the sent size. When called the next time, see if the
489	* data to send is directly following the previous transfer.
490	* This is important for large TCP windows.
491	*/
492	if (off == `0` \|\| tp->t_lastm == NULL \|\|
493	(tp->t_lastoff + tp->t_lastlen) != off) {
494	TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss);
495	/*
496	* Either a new packet or a retransmit.
497	* Start from the beginning.
498	*/
499	tp->t_lastm = so->so_snd.sb_mb;
500	tp->t_inoff = off;
501	} else {
502	TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit);
503	tp->t_inoff += tp->t_lastlen;
504	}
505
506	/ Traverse forward to next packet /
507	while (tp->t_inoff > `0`) {
508	if (tp->t_lastm == NULL)
509	panic("tp->t_lastm == NULL");
510	if (tp->t_inoff < tp->t_lastm->m_len)
511	break;
512	tp->t_inoff -= tp->t_lastm->m_len;
513	tp->t_lastm = tp->t_lastm->m_next;
514	}
515
516	tp->t_lastoff = off;
517	tp->t_lastlen = len;
518	m0 = tp->t_lastm;
519	off = tp->t_inoff;
520
521	if (len <= M_TRAILINGSPACE(m)) {
522	m_copydata(m0, off, (int) len, mtod(m, char *) + hdrlen);
523	m->m_len += len;
524	TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
525	} else {
526	m->m_next = m_copym(m0, off, (int) len, M_DONTWAIT);
527	if (m->m_next == NULL) {
528	m_freem(m);
529	return (ENOBUFS);
530	}
531	#ifdef TCP_OUTPUT_COUNTERS
532	if (m->m_next->m_flags & M_EXT)
533	TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
534	else
535	TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
536	#endif /* TCP_OUTPUT_COUNTERS */
537	}
538	#endif
539
540	*mp = m;
541	return (`0`);
542	}
543
544	/*
545	* Tcp output routine: figure out what should be sent and send it.
546	*/
547	int
548	tcp_output(struct tcpcb *tp)
549	{
550	struct rtentry *rt;
551	struct socket *so;
552	struct route *ro;
553	long len, win;
554	int off, flags, error;
555	struct mbuf *m;
556	struct ip *ip;
557	#ifdef INET6
558	struct ip6_hdr *ip6;
559	#endif
560	struct tcphdr *th;
561	u_char opt[MAX_TCPOPTLEN];
562	#define OPT_FITS(more) ((optlen + (more)) < sizeof(opt))
563	unsigned optlen, hdrlen, packetlen;
564	unsigned int sack_numblks;
565	int idle, sendalot, txsegsize, rxsegsize;
566	int txsegsize_nosack;
567	int maxburst = TCP_MAXBURST;
568	int af; / address family on the wire /
569	int iphdrlen;
570	int has_tso4, has_tso6;
571	int has_tso, use_tso;
572	bool alwaysfrag;
573	int sack_rxmit;
574	int sack_bytes_rxmt;
575	int ecn_tos;
576	struct sackhole *p;
577	#ifdef TCP_SIGNATURE
578	int sigoff = `0`;
579	#endif
580	uint64_t *tcps;
581
582	#ifdef DIAGNOSTIC
583	if (tp->t_inpcb && tp->t_in6pcb)
584	panic("tcp_output: both t_inpcb and t_in6pcb are set");
585	#endif
586	so = NULL;
587	ro = NULL;
588	if (tp->t_inpcb) {
589	so = tp->t_inpcb->inp_socket;
590	ro = &tp->t_inpcb->inp_route;
591	}
592	#ifdef INET6
593	else if (tp->t_in6pcb) {
594	so = tp->t_in6pcb->in6p_socket;
595	ro = &tp->t_in6pcb->in6p_route;
596	}
597	#endif
598
599	switch (af = tp->t_family) {
600	#ifdef INET
601	case AF_INET:
602	if (tp->t_inpcb)
603	break;
604	#ifdef INET6
605	/ mapped addr case /
606	if (tp->t_in6pcb)
607	break;
608	#endif
609	return (EINVAL);
610	#endif
611	#ifdef INET6
612	case AF_INET6:
613	if (tp->t_in6pcb)
614	break;
615	return (EINVAL);
616	#endif
617	default:
618	return (EAFNOSUPPORT);
619	}
620
621	if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag))
622	return (EMSGSIZE);
623
624	idle = (tp->snd_max == tp->snd_una);
625
626	/*
627	* Determine if we can use TCP segmentation offload:
628	* - If we're using IPv4
629	* - If there is not an IPsec policy that prevents it
630	* - If the interface can do it
631	*/
632	has_tso4 = has_tso6 = false;
633	#if defined(INET)
634	has_tso4 = tp->t_inpcb != NULL &&
635	#if defined(IPSEC)
636	(!ipsec_used \|\| IPSEC_PCB_SKIP_IPSEC(tp->t_inpcb->inp_sp,
637	IPSEC_DIR_OUTBOUND)) &&
638	#endif
639	(rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
640	(rt->rt_ifp->if_capenable & IFCAP_TSOv4) != `0`;
641	#endif /* defined(INET) */
642	#if defined(INET6)
643	has_tso6 = tp->t_in6pcb != NULL &&
644	#if defined(IPSEC)
645	(!ipsec_used \|\| IPSEC_PCB_SKIP_IPSEC(tp->t_in6pcb->in6p_sp,
646	IPSEC_DIR_OUTBOUND)) &&
647	#endif
648	(rt = rtcache_validate(&tp->t_in6pcb->in6p_route)) != NULL &&
649	(rt->rt_ifp->if_capenable & IFCAP_TSOv6) != `0`;
650	#endif /* defined(INET6) */
651	has_tso = (has_tso4 \|\| has_tso6) && !alwaysfrag;
652
653	/*
654	* Restart Window computation. From draft-floyd-incr-init-win-03:
655	*
656	* Optionally, a TCP MAY set the restart window to the
657	* minimum of the value used for the initial window and
658	* the current value of cwnd (in other words, using a
659	* larger value for the restart window should never increase
660	* the size of cwnd).
661	*/
662	if (tcp_cwm) {
663	/*
664	* Hughes/Touch/Heidemann Congestion Window Monitoring.
665	* Count the number of packets currently pending
666	* acknowledgement, and limit our congestion window
667	* to a pre-determined allowed burst size plus that count.
668	* This prevents bursting once all pending packets have
669	* been acknowledged (i.e. transmission is idle).
670	*
671	* XXX Link this to Initial Window?
672	*/
673	tp->snd_cwnd = min(tp->snd_cwnd,
674	(tcp_cwm_burstsize * txsegsize) +
675	(tp->snd_nxt - tp->snd_una));
676	} else {
677	if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) {
678	/*
679	* We have been idle for "a while" and no acks are
680	* expected to clock out any data we send --
681	* slow start to get ack "clock" running again.
682	*/
683	int ss = tcp_init_win;
684	#ifdef INET
685	if (tp->t_inpcb &&
686	in_localaddr(tp->t_inpcb->inp_faddr))
687	ss = tcp_init_win_local;
688	#endif
689	#ifdef INET6
690	if (tp->t_in6pcb &&
691	in6_localaddr(&tp->t_in6pcb->in6p_faddr))
692	ss = tcp_init_win_local;
693	#endif
694	tp->snd_cwnd = min(tp->snd_cwnd,
695	TCP_INITIAL_WINDOW(ss, txsegsize));
696	}
697	}
698
699	txsegsize_nosack = txsegsize;
700	again:
701	ecn_tos = `0`;
702	use_tso = has_tso;
703	if ((tp->t_flags & (TF_ECN_SND_CWR\|TF_ECN_SND_ECE)) != `0`) {
704	/ don't duplicate CWR/ECE. /
705	use_tso = `0`;
706	}
707	TCP_REASS_LOCK(tp);
708	sack_numblks = tcp_sack_numblks(tp);
709	if (sack_numblks) {
710	int sackoptlen;
711
712	sackoptlen = TCP_SACK_OPTLEN(sack_numblks);
713	if (sackoptlen > txsegsize_nosack) {
714	sack_numblks = `0`; / give up SACK /
715	txsegsize = txsegsize_nosack;
716	} else {
717	if ((tp->rcv_sack_flags & TCPSACK_HAVED) != `0`) {
718	/ don't duplicate D-SACK. /
719	use_tso = `0`;
720	}
721	txsegsize = txsegsize_nosack - sackoptlen;
722	}
723	} else {
724	txsegsize = txsegsize_nosack;
725	}
726
727	/*
728	* Determine length of data that should be transmitted, and
729	* flags that should be used. If there is some data or critical
730	* controls (SYN, RST) to send, then transmit; otherwise,
731	* investigate further.
732	*
733	* Readjust SACK information to avoid resending duplicate data.
734	*/
735	if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
736	tcp_sack_adjust(tp);
737	sendalot = `0`;
738	off = tp->snd_nxt - tp->snd_una;
739	win = min(tp->snd_wnd, tp->snd_cwnd);
740
741	flags = tcp_outflags[tp->t_state];
742
743	/*
744	* Send any SACK-generated retransmissions. If we're explicitly trying
745	* to send out new data (when sendalot is 1), bypass this function.
746	* If we retransmit in fast recovery mode, decrement snd_cwnd, since
747	* we're replacing a (future) new transmission with a retransmission
748	* now, and we previously incremented snd_cwnd in tcp_input().
749	*/
750	/*
751	* Still in sack recovery , reset rxmit flag to zero.
752	*/
753	sack_rxmit = `0`;
754	sack_bytes_rxmt = `0`;
755	len = `0`;
756	p = NULL;
757	do {
758	long cwin;
759	if (!TCP_SACK_ENABLED(tp))
760	break;
761	if (tp->t_partialacks < `0`)
762	break;
763	p = tcp_sack_output(tp, &sack_bytes_rxmt);
764	if (p == NULL)
765	break;
766
767	cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
768	if (cwin < `0`)
769	cwin = `0`;
770	/ Do not retransmit SACK segments beyond snd_recover /
771	if (SEQ_GT(p->end, tp->snd_recover)) {
772	/*
773	* (At least) part of sack hole extends beyond
774	* snd_recover. Check to see if we can rexmit data
775	* for this hole.
776	*/
777	if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
778	/*
779	* Can't rexmit any more data for this hole.
780	* That data will be rexmitted in the next
781	* sack recovery episode, when snd_recover
782	* moves past p->rxmit.
783	*/
784	p = NULL;
785	break;
786	}
787	/ Can rexmit part of the current hole /
788	len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit));
789	} else
790	len = ((long)ulmin(cwin, p->end - p->rxmit));
791	off = p->rxmit - tp->snd_una;
792	if (off + len > so->so_snd.sb_cc) {
793	/ 1 for TH_FIN /
794	KASSERT(off + len == so->so_snd.sb_cc + `1`);
795	KASSERT(p->rxmit + len == tp->snd_max);
796	len = so->so_snd.sb_cc - off;
797	}
798	if (len > `0`) {
799	sack_rxmit = `1`;
800	sendalot = `1`;
801	}
802	} while (/CONSTCOND/`0`);
803
804	/*
805	* If in persist timeout with window of 0, send 1 byte.
806	* Otherwise, if window is small but nonzero
807	* and timer expired, we will send what we can
808	* and go to transmit state.
809	*/
810	if (tp->t_force) {
811	if (win == `0`) {
812	/*
813	* If we still have some data to send, then
814	* clear the FIN bit. Usually this would
815	* happen below when it realizes that we
816	* aren't sending all the data. However,
817	* if we have exactly 1 byte of unsent data,
818	* then it won't clear the FIN bit below,
819	* and if we are in persist state, we wind
820	* up sending the packet without recording
821	* that we sent the FIN bit.
822	*
823	* We can't just blindly clear the FIN bit,
824	* because if we don't have any more data
825	* to send then the probe will be the FIN
826	* itself.
827	*/
828	if (off < so->so_snd.sb_cc)
829	flags &= ~TH_FIN;
830	win = `1`;
831	} else {
832	TCP_TIMER_DISARM(tp, TCPT_PERSIST);
833	tp->t_rxtshift = `0`;
834	}
835	}
836
837	if (sack_rxmit == `0`) {
838	if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= `0`) {
839	long cwin;
840
841	/*
842	* We are inside of a SACK recovery episode and are
843	* sending new data, having retransmitted all the
844	* data possible in the scoreboard.
845	*/
846	if (tp->snd_wnd < so->so_snd.sb_cc) {
847	len = tp->snd_wnd - off;
848	flags &= ~TH_FIN;
849	} else {
850	len = so->so_snd.sb_cc - off;
851	}
852
853	/*
854	* From FreeBSD:
855	* Don't remove this (len > 0) check !
856	* We explicitly check for len > 0 here (although it
857	* isn't really necessary), to work around a gcc
858	* optimization issue - to force gcc to compute
859	* len above. Without this check, the computation
860	* of len is bungled by the optimizer.
861	*/
862	if (len > `0`) {
863	cwin = tp->snd_cwnd -
864	(tp->snd_nxt - tp->sack_newdata) -
865	sack_bytes_rxmt;
866	if (cwin < `0`)
867	cwin = `0`;
868	if (cwin < len) {
869	len = cwin;
870	flags &= ~TH_FIN;
871	}
872	}
873	} else if (win < so->so_snd.sb_cc) {
874	len = win - off;
875	flags &= ~TH_FIN;
876	} else {
877	len = so->so_snd.sb_cc - off;
878	}
879	}
880
881	if (len < `0`) {
882	/*
883	* If FIN has been sent but not acked,
884	* but we haven't been called to retransmit,
885	* len will be -1. Otherwise, window shrank
886	* after we sent into it. If window shrank to 0,
887	* cancel pending retransmit, pull snd_nxt back
888	* to (closed) window, and set the persist timer
889	* if it isn't already going. If the window didn't
890	* close completely, just wait for an ACK.
891	*
892	* If we have a pending FIN, either it has already been
893	* transmitted or it is outside the window, so drop it.
894	* If the FIN has been transmitted, but this is not a
895	* retransmission, then len must be -1. Therefore we also
896	* prevent here the sending of `gratuitous FINs'. This
897	* eliminates the need to check for that case below (e.g.
898	* to back up snd_nxt before the FIN so that the sequence
899	* number is correct).
900	*/
901	len = `0`;
902	flags &= ~TH_FIN;
903	if (win == `0`) {
904	TCP_TIMER_DISARM(tp, TCPT_REXMT);
905	tp->t_rxtshift = `0`;
906	tp->snd_nxt = tp->snd_una;
907	if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == `0`)
908	tcp_setpersist(tp);
909	}
910	}
911
912	/*
913	* Automatic sizing enables the performance of large buffers
914	* and most of the efficiency of small ones by only allocating
915	* space when it is needed.
916	*
917	* The criteria to step up the send buffer one notch are:
918	* 1. receive window of remote host is larger than send buffer
919	* (with a fudge factor of 5/4th);
920	* 2. send buffer is filled to 7/8th with data (so we actually
921	* have data to make use of it);
922	* 3. send buffer fill has not hit maximal automatic size;
923	* 4. our send window (slow start and congestion controlled) is
924	* larger than sent but unacknowledged data in send buffer.
925	*
926	* The remote host receive window scaling factor may limit the
927	* growing of the send buffer before it reaches its allowed
928	* maximum.
929	*
930	* It scales directly with slow start or congestion window
931	* and does at most one step per received ACK. This fast
932	* scaling has the drawback of growing the send buffer beyond
933	* what is strictly necessary to make full use of a given
934	* delay*bandwidth product. However testing has shown this not
935	* to be much of a problem. At worst we are trading wasting
936	* of available bandwidth (the non-use of it) for wasting some
937	* socket buffer memory.
938	*
939	* TODO: Shrink send buffer during idle periods together
940	* with congestion window. Requires another timer.
941	*/
942	if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
943	if ((tp->snd_wnd / `4` * `5`) >= so->so_snd.sb_hiwat &&
944	so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / `8` * `7`) &&
945	so->so_snd.sb_cc < tcp_autosndbuf_max &&
946	win >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
947	if (!sbreserve(&so->so_snd,
948	min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
949	tcp_autosndbuf_max), so))
950	so->so_snd.sb_flags &= ~SB_AUTOSIZE;
951	}
952	}
953
954	if (len > txsegsize) {
955	if (use_tso) {
956	/*
957	* Truncate TSO transfers to IP_MAXPACKET, and make
958	* sure that we send equal size transfers down the
959	* stack (rather than big-small-big-small-...).
960	*/
961	#ifdef INET6
962	CTASSERT(IPV6_MAXPACKET == IP_MAXPACKET);
963	#endif
964	len = (min(len, IP_MAXPACKET) / txsegsize) * txsegsize;
965	if (len <= txsegsize) {
966	use_tso = `0`;
967	}
968	} else
969	len = txsegsize;
970	flags &= ~TH_FIN;
971	sendalot = `1`;
972	} else
973	use_tso = `0`;
974	if (sack_rxmit) {
975	if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
976	flags &= ~TH_FIN;
977	}
978
979	win = sbspace(&so->so_rcv);
980
981	/*
982	* Sender silly window avoidance. If connection is idle
983	* and can send all data, a maximum segment,
984	* at least a maximum default-size segment do it,
985	* or are forced, do it; otherwise don't bother.
986	* If peer's buffer is tiny, then send
987	* when window is at least half open.
988	* If retransmitting (possibly after persist timer forced us
989	* to send into a small window), then must resend.
990	*/
991	if (len) {
992	if (len >= txsegsize)
993	goto send;
994	if ((so->so_state & SS_MORETOCOME) == `0` &&
995	((idle \|\| tp->t_flags & TF_NODELAY) &&
996	len + off >= so->so_snd.sb_cc))
997	goto send;
998	if (tp->t_force)
999	goto send;
1000	if (len >= tp->max_sndwnd / `2`)
1001	goto send;
1002	if (SEQ_LT(tp->snd_nxt, tp->snd_max))
1003	goto send;
1004	if (sack_rxmit)
1005	goto send;
1006	}
1007
1008	/*
1009	* Compare available window to amount of window known to peer
1010	* (as advertised window less next expected input). If the
1011	* difference is at least twice the size of the largest segment
1012	* we expect to receive (i.e. two segments) or at least 50% of
1013	* the maximum possible window, then want to send a window update
1014	* to peer.
1015	*/
1016	if (win > `0`) {
1017	/*
1018	* "adv" is the amount we can increase the window,
1019	* taking into account that we are limited by
1020	* TCP_MAXWIN << tp->rcv_scale.
1021	*/
1022	long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
1023	(tp->rcv_adv - tp->rcv_nxt);
1024
1025	/*
1026	* If the new window size ends up being the same as the old
1027	* size when it is scaled, then don't force a window update.
1028	*/
1029	if ((tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale ==
1030	(adv + tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale)
1031	goto dontupdate;
1032	if (adv >= (long) (`2` * rxsegsize))
1033	goto send;
1034	if (`2` * adv >= (long) so->so_rcv.sb_hiwat)
1035	goto send;
1036	}
1037	dontupdate:
1038
1039	/*
1040	* Send if we owe peer an ACK.
1041	*/
1042	if (tp->t_flags & TF_ACKNOW)
1043	goto send;
1044	if (flags & (TH_SYN\|TH_FIN\|TH_RST))
1045	goto send;
1046	if (SEQ_GT(tp->snd_up, tp->snd_una))
1047	goto send;
1048	/*
1049	* In SACK, it is possible for tcp_output to fail to send a segment
1050	* after the retransmission timer has been turned off. Make sure
1051	* that the retransmission timer is set.
1052	*/
1053	if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) &&
1054	!TCP_TIMER_ISARMED(tp, TCPT_REXMT) &&
1055	!TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
1056	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1057	goto just_return;
1058	}
1059
1060	/*
1061	* TCP window updates are not reliable, rather a polling protocol
1062	* using ``persist'' packets is used to ensure receipt of window
1063	* updates. The three ``states'' for the output side are:
1064	* idle not doing retransmits or persists
1065	* persisting to move a small or zero window
1066	* (re)transmitting and thereby not persisting
1067	*
1068	* tp->t_timer[TCPT_PERSIST]
1069	* is set when we are in persist state.
1070	* tp->t_force
1071	* is set when we are called to send a persist packet.
1072	* tp->t_timer[TCPT_REXMT]
1073	* is set when we are retransmitting
1074	* The output side is idle when both timers are zero.
1075	*
1076	* If send window is too small, there is data to transmit, and no
1077	* retransmit or persist is pending, then go to persist state.
1078	* If nothing happens soon, send when timer expires:
1079	* if window is nonzero, transmit what we can,
1080	* otherwise force out a byte.
1081	*/
1082	if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == `0` &&
1083	TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == `0`) {
1084	tp->t_rxtshift = `0`;
1085	tcp_setpersist(tp);
1086	}
1087
1088	/*
1089	* No reason to send a segment, just return.
1090	*/
1091	just_return:
1092	TCP_REASS_UNLOCK(tp);
1093	return (`0`);
1094
1095	send:
1096	/*
1097	* Before ESTABLISHED, force sending of initial options
1098	* unless TCP set not to do any options.
1099	* NOTE: we assume that the IP/TCP header plus TCP options
1100	* always fit in a single mbuf, leaving room for a maximum
1101	* link header, i.e.
1102	* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
1103	*/
1104	optlen = `0`;
1105	switch (af) {
1106	#ifdef INET
1107	case AF_INET:
1108	iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
1109	break;
1110	#endif
1111	#ifdef INET6
1112	case AF_INET6:
1113	iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1114	break;
1115	#endif
1116	default: /pacify gcc/
1117	iphdrlen = `0`;
1118	break;
1119	}
1120	hdrlen = iphdrlen;
1121	if (flags & TH_SYN) {
1122	struct rtentry *synrt;
1123
1124	synrt = NULL;
1125	#ifdef INET
1126	if (tp->t_inpcb)
1127	synrt = in_pcbrtentry(tp->t_inpcb);
1128	#endif
1129	#ifdef INET6
1130	if (tp->t_in6pcb)
1131	synrt = in6_pcbrtentry(tp->t_in6pcb);
1132	#endif
1133
1134	tp->snd_nxt = tp->iss;
1135	tp->t_ourmss = tcp_mss_to_advertise(synrt != NULL ?
1136	synrt->rt_ifp : NULL, af);
1137	if ((tp->t_flags & TF_NOOPT) == `0` && OPT_FITS(`4`)) {
1138	opt[`0`] = TCPOPT_MAXSEG;
1139	opt[`1`] = `4`;
1140	opt[`2`] = (tp->t_ourmss >> `8`) & `0xff`;
1141	opt[`3`] = tp->t_ourmss & `0xff`;
1142	optlen = `4`;
1143
1144	if ((tp->t_flags & TF_REQ_SCALE) &&
1145	((flags & TH_ACK) == `0` \|\|
1146	(tp->t_flags & TF_RCVD_SCALE)) &&
1147	OPT_FITS(`4`)) {
1148	((u_int32_t ) (opt + optlen)) = htonl(
1149	TCPOPT_NOP << `24` \|
1150	TCPOPT_WINDOW << `16` \|
1151	TCPOLEN_WINDOW << `8` \|
1152	tp->request_r_scale);
1153	optlen += `4`;
1154	}
1155	if (tcp_do_sack && OPT_FITS(`4`)) {
1156	u_int8_t cp = (u_int8_t )(opt + optlen);
1157
1158	cp[`0`] = TCPOPT_SACK_PERMITTED;
1159	cp[`1`] = `2`;
1160	cp[`2`] = TCPOPT_NOP;
1161	cp[`3`] = TCPOPT_NOP;
1162	optlen += `4`;
1163	}
1164	}
1165	}
1166
1167	/*
1168	* Send a timestamp and echo-reply if this is a SYN and our side
1169	* wants to use timestamps (TF_REQ_TSTMP is set) or both our side
1170	* and our peer have sent timestamps in our SYN's.
1171	*/
1172	if ((tp->t_flags & (TF_REQ_TSTMP\|TF_NOOPT)) == TF_REQ_TSTMP &&
1173	(flags & TH_RST) == `0` &&
1174	((flags & (TH_SYN\|TH_ACK)) == TH_SYN \|\|
1175	(tp->t_flags & TF_RCVD_TSTMP)) && OPT_FITS(TCPOLEN_TSTAMP_APPA)) {
1176	u_int32_t lp = (u_int32_t )(opt + optlen);
1177
1178	/ Form timestamp option as shown in appendix A of RFC 1323. /
1179	*lp++ = htonl(TCPOPT_TSTAMP_HDR);
1180	*lp++ = htonl(TCP_TIMESTAMP(tp));
1181	*lp = htonl(tp->ts_recent);
1182	optlen += TCPOLEN_TSTAMP_APPA;
1183
1184	/ Set receive buffer autosizing timestamp. /
1185	if (tp->rfbuf_ts == `0` && (so->so_rcv.sb_flags & SB_AUTOSIZE))
1186	tp->rfbuf_ts = TCP_TIMESTAMP(tp);
1187	}
1188
1189	/*
1190	* Tack on the SACK block if it is necessary.
1191	*/
1192	if (sack_numblks) {
1193	int sack_len;
1194	u_char bp = (u_char )(opt + optlen);
1195	u_int32_t lp = (u_int32_t )(bp + `4`);
1196	struct ipqent *tiqe;
1197
1198	sack_len = sack_numblks * `8` + `2`;
1199	if (OPT_FITS(sack_len + `2`)) {
1200	bp[`0`] = TCPOPT_NOP;
1201	bp[`1`] = TCPOPT_NOP;
1202	bp[`2`] = TCPOPT_SACK;
1203	bp[`3`] = sack_len;
1204	if ((tp->rcv_sack_flags & TCPSACK_HAVED) != `0`) {
1205	sack_numblks--;
1206	*lp++ = htonl(tp->rcv_dsack_block.left);
1207	*lp++ = htonl(tp->rcv_dsack_block.right);
1208	tp->rcv_sack_flags &= ~TCPSACK_HAVED;
1209	}
1210	for (tiqe = TAILQ_FIRST(&tp->timeq);
1211	sack_numblks > `0`;
1212	tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) {
1213	KASSERT(tiqe != NULL);
1214	sack_numblks--;
1215	*lp++ = htonl(tiqe->ipqe_seq);
1216	*lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len +
1217	((tiqe->ipqe_flags & TH_FIN) != `0` ? `1` : `0`));
1218	}
1219	optlen += sack_len + `2`;
1220	}
1221	}
1222	TCP_REASS_UNLOCK(tp);
1223
1224	#ifdef TCP_SIGNATURE
1225	if ((tp->t_flags & TF_SIGNATURE) && OPT_FITS(TCPOLEN_SIGNATURE + `2`)) {
1226	u_char *bp;
1227	/*
1228	* Initialize TCP-MD5 option (RFC2385)
1229	*/
1230	bp = (u_char *)opt + optlen;
1231	*bp++ = TCPOPT_SIGNATURE;
1232	*bp++ = TCPOLEN_SIGNATURE;
1233	sigoff = optlen + `2`;
1234	memset(bp, `0`, TCP_SIGLEN);
1235	bp += TCP_SIGLEN;
1236	optlen += TCPOLEN_SIGNATURE;
1237	/*
1238	* Terminate options list and maintain 32-bit alignment.
1239	*/
1240	*bp++ = TCPOPT_NOP;
1241	*bp++ = TCPOPT_EOL;
1242	optlen += `2`;
1243	} else if ((tp->t_flags & TF_SIGNATURE) != `0`) {
1244	error = ECONNABORTED;
1245	goto out;
1246	}
1247	#endif /* TCP_SIGNATURE */
1248
1249	hdrlen += optlen;
1250
1251	#ifdef DIAGNOSTIC
1252	if (!use_tso && len > txsegsize)
1253	panic("tcp data to be sent is larger than segment");
1254	else if (use_tso && len > IP_MAXPACKET)
1255	panic("tcp data to be sent is larger than max TSO size");
1256	if (max_linkhdr + hdrlen > MCLBYTES)
1257	panic("tcphdr too big");
1258	#endif
1259
1260	/*
1261	* Grab a header mbuf, attaching a copy of data to
1262	* be transmitted, and initialize the header from
1263	* the template for sends on this connection.
1264	*/
1265	if (len) {
1266	error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m);
1267	if (error)
1268	goto out;
1269	/*
1270	* If we're sending everything we've got, set PUSH.
1271	* (This will keep happy those implementations which only
1272	* give data to the user when a buffer fills or
1273	* a PUSH comes in.)
1274	*/
1275	if (off + len == so->so_snd.sb_cc)
1276	flags \|= TH_PUSH;
1277	} else {
1278	tcps = TCP_STAT_GETREF();
1279	if (tp->t_flags & TF_ACKNOW)
1280	tcps[TCP_STAT_SNDACKS]++;
1281	else if (flags & (TH_SYN\|TH_FIN\|TH_RST))
1282	tcps[TCP_STAT_SNDCTRL]++;
1283	else if (SEQ_GT(tp->snd_up, tp->snd_una))
1284	tcps[TCP_STAT_SNDURG]++;
1285	else
1286	tcps[TCP_STAT_SNDWINUP]++;
1287	TCP_STAT_PUTREF();
1288
1289	MGETHDR(m, M_DONTWAIT, MT_HEADER);
1290	if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
1291	MCLGET(m, M_DONTWAIT);
1292	if ((m->m_flags & M_EXT) == `0`) {
1293	m_freem(m);
1294	m = NULL;
1295	}
1296	}
1297	if (m == NULL) {
1298	error = ENOBUFS;
1299	goto out;
1300	}
1301	MCLAIM(m, &tcp_tx_mowner);
1302	m->m_data += max_linkhdr;
1303	m->m_len = hdrlen;
1304	}
1305	m_reset_rcvif(m);
1306	switch (af) {
1307	#ifdef INET
1308	case AF_INET:
1309	ip = mtod(m, struct ip *);
1310	#ifdef INET6
1311	ip6 = NULL;
1312	#endif
1313	th = (struct tcphdr *)(ip + `1`);
1314	break;
1315	#endif
1316	#ifdef INET6
1317	case AF_INET6:
1318	ip = NULL;
1319	ip6 = mtod(m, struct ip6_hdr *);
1320	th = (struct tcphdr *)(ip6 + `1`);
1321	break;
1322	#endif
1323	default: /pacify gcc/
1324	ip = NULL;
1325	#ifdef INET6
1326	ip6 = NULL;
1327	#endif
1328	th = NULL;
1329	break;
1330	}
1331	if (tp->t_template == `0`)
1332	panic("tcp_output");
1333	if (tp->t_template->m_len < iphdrlen)
1334	panic("tcp_output");
1335	bcopy(mtod(tp->t_template, void ), mtod(m, void* *), iphdrlen);
1336
1337	/*
1338	* If we are starting a connection, send ECN setup
1339	* SYN packet. If we are on a retransmit, we may
1340	* resend those bits a number of times as per
1341	* RFC 3168.
1342	*/
1343	if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
1344	if (tp->t_flags & TF_SYN_REXMT) {
1345	if (tp->t_ecn_retries--)
1346	flags \|= TH_ECE\|TH_CWR;
1347	} else {
1348	flags \|= TH_ECE\|TH_CWR;
1349	tp->t_ecn_retries = tcp_ecn_maxretries;
1350	}
1351	}
1352
1353	if (TCP_ECN_ALLOWED(tp)) {
1354	/*
1355	* If the peer has ECN, mark data packets
1356	* ECN capable. Ignore pure ack packets, retransmissions
1357	* and window probes.
1358	*/
1359	if (len > `0` && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
1360	!(tp->t_force && len == `1`)) {
1361	ecn_tos = IPTOS_ECN_ECT0;
1362	TCP_STATINC(TCP_STAT_ECN_ECT);
1363	}
1364
1365	/*
1366	* Reply with proper ECN notifications.
1367	*/
1368	if (tp->t_flags & TF_ECN_SND_CWR) {
1369	flags \|= TH_CWR;
1370	tp->t_flags &= ~TF_ECN_SND_CWR;
1371	}
1372	if (tp->t_flags & TF_ECN_SND_ECE) {
1373	flags \|= TH_ECE;
1374	}
1375	}
1376
1377
1378	/*
1379	* If we are doing retransmissions, then snd_nxt will
1380	* not reflect the first unsent octet. For ACK only
1381	* packets, we do not want the sequence number of the
1382	* retransmitted packet, we want the sequence number
1383	* of the next unsent octet. So, if there is no data
1384	* (and no SYN or FIN), use snd_max instead of snd_nxt
1385	* when filling in ti_seq. But if we are in persist
1386	* state, snd_max might reflect one byte beyond the
1387	* right edge of the window, so use snd_nxt in that
1388	* case, since we know we aren't doing a retransmission.
1389	* (retransmit and persist are mutually exclusive...)
1390	*/
1391	if (TCP_SACK_ENABLED(tp) && sack_rxmit) {
1392	th->th_seq = htonl(p->rxmit);
1393	p->rxmit += len;
1394	} else {
1395	if (len \|\| (flags & (TH_SYN\|TH_FIN)) \|\|
1396	TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
1397	th->th_seq = htonl(tp->snd_nxt);
1398	else
1399	th->th_seq = htonl(tp->snd_max);
1400	}
1401	th->th_ack = htonl(tp->rcv_nxt);
1402	if (optlen) {
1403	bcopy((void )opt, (void* *)(th + `1`), optlen);
1404	th->th_off = (sizeof (struct tcphdr) + optlen) >> `2`;
1405	}
1406	th->th_flags = flags;
1407	/*
1408	* Calculate receive window. Don't shrink window,
1409	* but avoid silly window syndrome.
1410	*/
1411	if (win < (long)(so->so_rcv.sb_hiwat / `4`) && win < (long)rxsegsize)
1412	win = `0`;
1413	if (win > (long)TCP_MAXWIN << tp->rcv_scale)
1414	win = (long)TCP_MAXWIN << tp->rcv_scale;
1415	if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
1416	win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
1417	th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
1418	if (th->th_win == `0`) {
1419	tp->t_sndzerowin++;
1420	}
1421	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1422	u_int32_t urp = tp->snd_up - tp->snd_nxt;
1423	if (urp > IP_MAXPACKET)
1424	urp = IP_MAXPACKET;
1425	th->th_urp = htons((u_int16_t)urp);
1426	th->th_flags \|= TH_URG;
1427	} else
1428	/*
1429	* If no urgent pointer to send, then we pull
1430	* the urgent pointer to the left edge of the send window
1431	* so that it doesn't drift into the send window on sequence
1432	* number wraparound.
1433	*/
1434	tp->snd_up = tp->snd_una; / drag it along /
1435
1436	#ifdef TCP_SIGNATURE
1437	if (sigoff && (tp->t_flags & TF_SIGNATURE)) {
1438	struct secasvar *sav;
1439	u_int8_t *sigp;
1440
1441	sav = tcp_signature_getsav(m, th);
1442
1443	if (sav == NULL) {
1444	if (m)
1445	m_freem(m);
1446	return (EPERM);
1447	}
1448
1449	m->m_pkthdr.len = hdrlen + len;
1450	sigp = (char )th + sizeof(th) + sigoff;
1451	tcp_signature(m, th, (char )th - mtod(m, char* *), sav, sigp);
1452
1453	key_sa_recordxfer(sav, m);
1454	KEY_FREESAV(&sav);
1455	}
1456	#endif
1457
1458	/*
1459	* Set ourselves up to be checksummed just before the packet
1460	* hits the wire.
1461	*/
1462	switch (af) {
1463	#ifdef INET
1464	case AF_INET:
1465	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1466	if (use_tso) {
1467	m->m_pkthdr.segsz = txsegsize;
1468	m->m_pkthdr.csum_flags = M_CSUM_TSOv4;
1469	} else {
1470	m->m_pkthdr.csum_flags = M_CSUM_TCPv4;
1471	if (len + optlen) {
1472	/ Fixup the pseudo-header checksum. /
1473	/ XXXJRT Not IP Jumbogram safe. /
1474	th->th_sum = in_cksum_addword(th->th_sum,
1475	htons((u_int16_t) (len + optlen)));
1476	}
1477	}
1478	break;
1479	#endif
1480	#ifdef INET6
1481	case AF_INET6:
1482	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1483	if (use_tso) {
1484	m->m_pkthdr.segsz = txsegsize;
1485	m->m_pkthdr.csum_flags = M_CSUM_TSOv6;
1486	} else {
1487	m->m_pkthdr.csum_flags = M_CSUM_TCPv6;
1488	if (len + optlen) {
1489	/ Fixup the pseudo-header checksum. /
1490	/ XXXJRT: Not IPv6 Jumbogram safe. /
1491	th->th_sum = in_cksum_addword(th->th_sum,
1492	htons((u_int16_t) (len + optlen)));
1493	}
1494	}
1495	break;
1496	#endif
1497	}
1498
1499	/*
1500	* In transmit state, time the transmission and arrange for
1501	* the retransmit. In persist state, just set snd_max.
1502	*/
1503	if (tp->t_force == `0` \|\| TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == `0`) {
1504	tcp_seq startseq = tp->snd_nxt;
1505
1506	/*
1507	* Advance snd_nxt over sequence space of this segment.
1508	* There are no states in which we send both a SYN and a FIN,
1509	* so we collapse the tests for these flags.
1510	*/
1511	if (flags & (TH_SYN\|TH_FIN))
1512	tp->snd_nxt++;
1513	if (sack_rxmit)
1514	goto timer;
1515	tp->snd_nxt += len;
1516	if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1517	tp->snd_max = tp->snd_nxt;
1518	/*
1519	* Time this transmission if not a retransmission and
1520	* not currently timing anything.
1521	*/
1522	if (tp->t_rtttime == `0`) {
1523	tp->t_rtttime = tcp_now;
1524	tp->t_rtseq = startseq;
1525	TCP_STATINC(TCP_STAT_SEGSTIMED);
1526	}
1527	}
1528
1529	/*
1530	* Set retransmit timer if not currently set,
1531	* and not doing an ack or a keep-alive probe.
1532	* Initial value for retransmit timer is smoothed
1533	* round-trip time + 2 * round-trip time variance.
1534	* Initialize shift counter which is used for backoff
1535	* of retransmit time.
1536	*/
1537	timer:
1538	if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == `0`) {
1539	if ((sack_rxmit && tp->snd_nxt != tp->snd_max)
1540	\|\| tp->snd_nxt != tp->snd_una) {
1541	if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
1542	TCP_TIMER_DISARM(tp, TCPT_PERSIST);
1543	tp->t_rxtshift = `0`;
1544	}
1545	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1546	} else if (len == `0` && so->so_snd.sb_cc > `0`
1547	&& TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == `0`) {
1548	/*
1549	* If we are sending a window probe and there's
1550	* unacked data in the socket, make sure at
1551	* least the persist timer is running.
1552	*/
1553	tp->t_rxtshift = `0`;
1554	tcp_setpersist(tp);
1555	}
1556	}
1557	} else
1558	if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
1559	tp->snd_max = tp->snd_nxt + len;
1560
1561	#ifdef TCP_DEBUG
1562	/*
1563	* Trace.
1564	*/
1565	if (so->so_options & SO_DEBUG)
1566	tcp_trace(TA_OUTPUT, tp->t_state, tp, m, `0`);
1567	#endif
1568
1569	/*
1570	* Fill in IP length and desired time to live and
1571	* send to IP level. There should be a better way
1572	* to handle ttl and tos; we could keep them in
1573	* the template, but need a way to checksum without them.
1574	*/
1575	m->m_pkthdr.len = hdrlen + len;
1576
1577	switch (af) {
1578	#ifdef INET
1579	case AF_INET:
1580	ip->ip_len = htons(m->m_pkthdr.len);
1581	packetlen = m->m_pkthdr.len;
1582	if (tp->t_inpcb) {
1583	ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
1584	ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos \| ecn_tos;
1585	}
1586	#ifdef INET6
1587	else if (tp->t_in6pcb) {
1588	ip->ip_ttl = in6_selecthlim(tp->t_in6pcb, NULL); /XXX/
1589	ip->ip_tos = ecn_tos; /XXX/
1590	}
1591	#endif
1592	break;
1593	#endif
1594	#ifdef INET6
1595	case AF_INET6:
1596	packetlen = m->m_pkthdr.len;
1597	ip6->ip6_nxt = IPPROTO_TCP;
1598	if (tp->t_in6pcb) {
1599	/*
1600	* we separately set hoplimit for every segment, since
1601	* the user might want to change the value via
1602	* setsockopt. Also, desired default hop limit might
1603	* be changed via Neighbor Discovery.
1604	*/
1605	ip6->ip6_hlim = in6_selecthlim_rt(tp->t_in6pcb);
1606	}
1607	ip6->ip6_flow \|= htonl(ecn_tos << `20`);
1608	/ ip6->ip6_flow = ??? (from template) /
1609	/ ip6_plen will be filled in ip6_output(). /
1610	break;
1611	#endif
1612	default: /pacify gcc/
1613	packetlen = `0`;
1614	break;
1615	}
1616
1617	switch (af) {
1618	#ifdef INET
1619	case AF_INET:
1620	{
1621	struct mbuf *opts;
1622
1623	if (tp->t_inpcb)
1624	opts = tp->t_inpcb->inp_options;
1625	else
1626	opts = NULL;
1627	error = ip_output(m, opts, ro,
1628	(tp->t_mtudisc ? IP_MTUDISC : `0`) \|
1629	(so->so_options & SO_DONTROUTE), NULL, so);
1630	break;
1631	}
1632	#endif
1633	#ifdef INET6
1634	case AF_INET6:
1635	{
1636	struct ip6_pktopts *opts;
1637
1638	if (tp->t_in6pcb)
1639	opts = tp->t_in6pcb->in6p_outputopts;
1640	else
1641	opts = NULL;
1642	error = ip6_output(m, opts, ro, so->so_options & SO_DONTROUTE,
1643	NULL, so, NULL);
1644	break;
1645	}
1646	#endif
1647	default:
1648	error = EAFNOSUPPORT;
1649	break;
1650	}
1651	if (error) {
1652	out:
1653	if (error == ENOBUFS) {
1654	TCP_STATINC(TCP_STAT_SELFQUENCH);
1655	#ifdef INET
1656	if (tp->t_inpcb)
1657	tcp_quench(tp->t_inpcb, `0`);
1658	#endif
1659	#ifdef INET6
1660	if (tp->t_in6pcb)
1661	tcp6_quench(tp->t_in6pcb, `0`);
1662	#endif
1663	error = `0`;
1664	} else if ((error == EHOSTUNREACH \|\| error == ENETDOWN) &&
1665	TCPS_HAVERCVDSYN(tp->t_state)) {
1666	tp->t_softerror = error;
1667	error = `0`;
1668	}
1669
1670	/* Back out the sequence number advance. */
1671	if (sack_rxmit)
1672	p->rxmit -= len;
1673
1674	/ Restart the delayed ACK timer, if necessary. /
1675	if (tp->t_flags & TF_DELACK)
1676	TCP_RESTART_DELACK(tp);
1677
1678	return (error);
1679	}
1680
1681	if (packetlen > tp->t_pmtud_mtu_sent)
1682	tp->t_pmtud_mtu_sent = packetlen;
1683
1684	tcps = TCP_STAT_GETREF();
1685	tcps[TCP_STAT_SNDTOTAL]++;
1686	if (tp->t_flags & TF_DELACK)
1687	tcps[TCP_STAT_DELACK]++;
1688	TCP_STAT_PUTREF();
1689
1690	/*
1691	* Data sent (as far as we can tell).
1692	* If this advertises a larger window than any other segment,
1693	* then remember the size of the advertised window.
1694	* Any pending ACK has now been sent.
1695	*/
1696	if (win > `0` && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
1697	tp->rcv_adv = tp->rcv_nxt + win;
1698	tp->last_ack_sent = tp->rcv_nxt;
1699	tp->t_flags &= ~TF_ACKNOW;
1700	TCP_CLEAR_DELACK(tp);
1701	#ifdef DIAGNOSTIC
1702	if (maxburst < `0`)
1703	printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
1704	#endif
1705	if (sendalot && (tp->t_congctl == &tcp_reno_ctl \|\| --maxburst))
1706	goto again;
1707	return (`0`);
1708	}
1709
1710	void
1711	tcp_setpersist(struct tcpcb *tp)
1712	{
1713	int t = ((tp->t_srtt >> `2`) + tp->t_rttvar) >> (`1` + `2`);
1714	int nticks;
1715
1716	if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
1717	panic("tcp_output REXMT");
1718	/*
1719	* Start/restart persistance timer.
1720	*/
1721	if (t < tp->t_rttmin)
1722	t = tp->t_rttmin;
1723	TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
1724	TCPTV_PERSMIN, TCPTV_PERSMAX);
1725	TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
1726	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1727	tp->t_rxtshift++;
1728	}
1729

Browse the source code of src/src/sys/netinet/tcp_output.c