tcp_input.c source code [src/src/sys/netinet/tcp_input.c]

1	/*	$NetBSD: tcp_input.c,v 1.349 2016/11/15 22:23:09 mrg Exp $	*/
2
3	/*
4	* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5	* All rights reserved.
6	*
7	* Redistribution and use in source and binary forms, with or without
8	* modification, are permitted provided that the following conditions
9	* are met:
10	* 1. Redistributions of source code must retain the above copyright
11	* notice, this list of conditions and the following disclaimer.
12	* 2. Redistributions in binary form must reproduce the above copyright
13	* notice, this list of conditions and the following disclaimer in the
14	* documentation and/or other materials provided with the distribution.
15	* 3. Neither the name of the project nor the names of its contributors
16	* may be used to endorse or promote products derived from this software
17	* without specific prior written permission.
18	*
19	* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22	* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29	* SUCH DAMAGE.
30	*/
31
32	/*
33	* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
34	*
35	* NRL grants permission for redistribution and use in source and binary
36	* forms, with or without modification, of the software and documentation
37	* created at NRL provided that the following conditions are met:
38	*
39	* 1. Redistributions of source code must retain the above copyright
40	* notice, this list of conditions and the following disclaimer.
41	* 2. Redistributions in binary form must reproduce the above copyright
42	* notice, this list of conditions and the following disclaimer in the
43	* documentation and/or other materials provided with the distribution.
44	* 3. All advertising materials mentioning features or use of this software
45	* must display the following acknowledgements:
46	* This product includes software developed by the University of
47	* California, Berkeley and its contributors.
48	* This product includes software developed at the Information
49	* Technology Division, US Naval Research Laboratory.
50	* 4. Neither the name of the NRL nor the names of its contributors
51	* may be used to endorse or promote products derived from this software
52	* without specific prior written permission.
53	*
54	* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55	* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57	* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
58	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62	* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65	*
66	* The views and conclusions contained in the software and documentation
67	* are those of the authors and should not be interpreted as representing
68	* official policies, either expressed or implied, of the US Naval
69	* Research Laboratory (NRL).
70	*/
71
72	/*-
73	* Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
74	* 2011 The NetBSD Foundation, Inc.
75	* All rights reserved.
76	*
77	* This code is derived from software contributed to The NetBSD Foundation
78	* by Coyote Point Systems, Inc.
79	* This code is derived from software contributed to The NetBSD Foundation
80	* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
81	* Facility, NASA Ames Research Center.
82	* This code is derived from software contributed to The NetBSD Foundation
83	* by Charles M. Hannum.
84	* This code is derived from software contributed to The NetBSD Foundation
85	* by Rui Paulo.
86	*
87	* Redistribution and use in source and binary forms, with or without
88	* modification, are permitted provided that the following conditions
89	* are met:
90	* 1. Redistributions of source code must retain the above copyright
91	* notice, this list of conditions and the following disclaimer.
92	* 2. Redistributions in binary form must reproduce the above copyright
93	* notice, this list of conditions and the following disclaimer in the
94	* documentation and/or other materials provided with the distribution.
95	*
96	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
97	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
98	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
99	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
100	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
101	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
102	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
103	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
104	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
105	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
106	* POSSIBILITY OF SUCH DAMAGE.
107	*/
108
109	/*
110	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
111	* The Regents of the University of California. All rights reserved.
112	*
113	* Redistribution and use in source and binary forms, with or without
114	* modification, are permitted provided that the following conditions
115	* are met:
116	* 1. Redistributions of source code must retain the above copyright
117	* notice, this list of conditions and the following disclaimer.
118	* 2. Redistributions in binary form must reproduce the above copyright
119	* notice, this list of conditions and the following disclaimer in the
120	* documentation and/or other materials provided with the distribution.
121	* 3. Neither the name of the University nor the names of its contributors
122	* may be used to endorse or promote products derived from this software
123	* without specific prior written permission.
124	*
125	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
126	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
127	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
128	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
129	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
130	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
131	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
132	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
133	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
134	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
135	* SUCH DAMAGE.
136	*
137	* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
138	*/
139
140	/*
141	* TODO list for SYN cache stuff:
142	*
143	* Find room for a "state" field, which is needed to keep a
144	* compressed state for TIME_WAIT TCBs. It's been noted already
145	* that this is fairly important for very high-volume web and
146	* mail servers, which use a large number of short-lived
147	* connections.
148	*/
149
150	#include <sys/cdefs.h>
151	__KERNEL_RCSID(`0`, "$NetBSD: tcp_input.c,v 1.349 2016/11/15 22:23:09 mrg Exp $");
152
153	#ifdef _KERNEL_OPT
154	#include "opt_inet.h"
155	#include "opt_ipsec.h"
156	#include "opt_inet_csum.h"
157	#include "opt_tcp_debug.h"
158	#endif
159
160	#include <sys/param.h>
161	#include <sys/systm.h>
162	#include <sys/malloc.h>
163	#include <sys/mbuf.h>
164	#include <sys/protosw.h>
165	#include <sys/socket.h>
166	#include <sys/socketvar.h>
167	#include <sys/errno.h>
168	#include <sys/syslog.h>
169	#include <sys/pool.h>
170	#include <sys/domain.h>
171	#include <sys/kernel.h>
172	#ifdef TCP_SIGNATURE
173	#include <sys/md5.h>
174	#endif
175	#include <sys/lwp.h> /* for lwp0 */
176	#include <sys/cprng.h>
177
178	#include <net/if.h>
179	#include <net/if_types.h>
180
181	#include <netinet/in.h>
182	#include <netinet/in_systm.h>
183	#include <netinet/ip.h>
184	#include <netinet/in_pcb.h>
185	#include <netinet/in_var.h>
186	#include <netinet/ip_var.h>
187	#include <netinet/in_offload.h>
188
189	#ifdef INET6
190	#ifndef INET
191	#include <netinet/in.h>
192	#endif
193	#include <netinet/ip6.h>
194	#include <netinet6/ip6_var.h>
195	#include <netinet6/in6_pcb.h>
196	#include <netinet6/ip6_var.h>
197	#include <netinet6/in6_var.h>
198	#include <netinet/icmp6.h>
199	#include <netinet6/nd6.h>
200	#ifdef TCP_SIGNATURE
201	#include <netinet6/scope6_var.h>
202	#endif
203	#endif
204
205	#ifndef INET6
206	/* always need ip6.h for IP6_EXTHDR_GET */
207	#include <netinet/ip6.h>
208	#endif
209
210	#include <netinet/tcp.h>
211	#include <netinet/tcp_fsm.h>
212	#include <netinet/tcp_seq.h>
213	#include <netinet/tcp_timer.h>
214	#include <netinet/tcp_var.h>
215	#include <netinet/tcp_private.h>
216	#include <netinet/tcpip.h>
217	#include <netinet/tcp_congctl.h>
218	#include <netinet/tcp_debug.h>
219
220	#ifdef INET6
221	#include "faith.h"
222	#if defined(NFAITH) && NFAITH > 0
223	#include <net/if_faith.h>
224	#endif
225	#endif /* INET6 */
226
227	#ifdef IPSEC
228	#include <netipsec/ipsec.h>
229	#include <netipsec/ipsec_var.h>
230	#include <netipsec/ipsec_private.h>
231	#include <netipsec/key.h>
232	#ifdef INET6
233	#include <netipsec/ipsec6.h>
234	#endif
235	#endif /* IPSEC*/
236
237	#include <netinet/tcp_vtw.h>
238
239	int tcprexmtthresh = `3`;
240	int tcp_log_refused;
241
242	int tcp_do_autorcvbuf = `1`;
243	int tcp_autorcvbuf_inc = `16` * `1024`;
244	int tcp_autorcvbuf_max = `256` * `1024`;
245	int tcp_msl = (TCPTV_MSL / PR_SLOWHZ);
246
247	static int tcp_rst_ppslim_count = `0`;
248	static struct timeval tcp_rst_ppslim_last;
249	static int tcp_ackdrop_ppslim_count = `0`;
250	static struct timeval tcp_ackdrop_ppslim_last;
251
/*
 * 24 days (24 * 24 * 60 * 60 seconds) scaled by PR_SLOWHZ.
 */
#define	TCP_PAWS_IDLE	(24U * 24 * 60 * 60 * PR_SLOWHZ)

/*
 * For modulo comparisons of timestamps: the difference is taken first and
 * then interpreted as a signed int, so the ordering stays correct when the
 * unsigned timestamp counter wraps around.
 */
#define	TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define	TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)
257
/*
 * Neighbor Discovery / Neighbor Unreachability Detection upper-layer hint.
 *
 * With INET6: if tp is an AF_INET6 connection that has an IPv6 PCB and its
 * cached route validates, pass that route to nd6_nud_hint().
 * Without INET6: compiles to an empty function.
 */
#ifdef INET6
static inline void
nd6_hint(struct tcpcb *tp)
{
	struct rtentry *route;

	/* Nothing to do unless this is an IPv6 connection with a PCB. */
	if (tp == NULL || tp->t_in6pcb == NULL || tp->t_family != AF_INET6)
		return;

	route = rtcache_validate(&tp->t_in6pcb->in6p_route);
	if (route != NULL)
		nd6_nud_hint(route);
}
#else
static inline void
nd6_hint(struct tcpcb *tp)
{
	/* No IPv6 support compiled in: nothing to hint. */
}
#endif
277
278	/*
279	* Compute ACK transmission behavior. Delay the ACK unless
280	* we have already delayed an ACK (must send an ACK every two segments).
281	* We also ACK immediately if we received a PUSH and the ACK-on-PUSH
282	* option is enabled.
283	*/
284	static void
285	tcp_setup_ack(struct tcpcb tp, const* struct tcphdr *th)
286	{
287
288	if (tp->t_flags & TF_DELACK \|\|
289	(tcp_ack_on_push && th->th_flags & TH_PUSH))
290	tp->t_flags \|= TF_ACKNOW;
291	else
292	TCP_SET_DELACK(tp);
293	}
294
295	static void
296	icmp_check(struct tcpcb tp, const* struct tcphdr th, int* acked)
297	{
298
299	/*
300	* If we had a pending ICMP message that refers to data that have
301	* just been acknowledged, disregard the recorded ICMP message.
302	*/
303	if ((tp->t_flags & TF_PMTUD_PEND) &&
304	SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
305	tp->t_flags &= ~TF_PMTUD_PEND;
306
307	/*
308	* Keep track of the largest chunk of data
309	* acknowledged since last PMTU update
310	*/
311	if (tp->t_pmtud_mss_acked < acked)
312	tp->t_pmtud_mss_acked = acked;
313	}
314
315	/*
316	* Convert TCP protocol fields to host order for easier processing.
317	*/
318	static void
319	tcp_fields_to_host(struct tcphdr *th)
320	{
321
322	NTOHL(th->th_seq);
323	NTOHL(th->th_ack);
324	NTOHS(th->th_win);
325	NTOHS(th->th_urp);
326	}
327
328	/*
329	* ... and reverse the above.
330	*/
331	static void
332	tcp_fields_to_net(struct tcphdr *th)
333	{
334
335	HTONL(th->th_seq);
336	HTONL(th->th_ack);
337	HTONS(th->th_win);
338	HTONS(th->th_urp);
339	}
340
/*
 * Optional checksum event counters.  When TCP_CSUM_COUNTERS is defined,
 * TCP_CSUM_COUNTER_INCR(ev) bumps ev->ev_count on the given struct evcnt;
 * otherwise it expands to nothing.
 */
#ifdef TCP_CSUM_COUNTERS
#include <sys/device.h>

#if defined(INET)
extern struct evcnt tcp_hwcsum_ok;
extern struct evcnt tcp_hwcsum_bad;
extern struct evcnt tcp_hwcsum_data;
extern struct evcnt tcp_swcsum;
#endif /* defined(INET) */
#if defined(INET6)
extern struct evcnt tcp6_hwcsum_ok;
extern struct evcnt tcp6_hwcsum_bad;
extern struct evcnt tcp6_hwcsum_data;
extern struct evcnt tcp6_swcsum;
#endif /* defined(INET6) */

/* Whole expansion parenthesised so it behaves as a single expression. */
#define	TCP_CSUM_COUNTER_INCR(ev)	((ev)->ev_count++)

#else

#define	TCP_CSUM_COUNTER_INCR(ev)	/* nothing */

#endif /* TCP_CSUM_COUNTERS */
364
/*
 * Optional reassembly event counters.  When TCP_REASS_COUNTERS is defined,
 * TCP_REASS_COUNTER_INCR(ev) bumps ev->ev_count on the given struct evcnt;
 * otherwise it expands to nothing.
 */
#ifdef TCP_REASS_COUNTERS
#include <sys/device.h>

extern struct evcnt tcp_reass_;
extern struct evcnt tcp_reass_empty;
extern struct evcnt tcp_reass_iteration[8];
extern struct evcnt tcp_reass_prependfirst;
extern struct evcnt tcp_reass_prepend;
extern struct evcnt tcp_reass_insert;
extern struct evcnt tcp_reass_inserttail;
extern struct evcnt tcp_reass_append;
extern struct evcnt tcp_reass_appendtail;
extern struct evcnt tcp_reass_overlaptail;
extern struct evcnt tcp_reass_overlapfront;
extern struct evcnt tcp_reass_segdup;
extern struct evcnt tcp_reass_fragdup;

/* Whole expansion parenthesised so it behaves as a single expression. */
#define	TCP_REASS_COUNTER_INCR(ev)	((ev)->ev_count++)

#else

#define	TCP_REASS_COUNTER_INCR(ev)	/* nothing */

#endif /* TCP_REASS_COUNTERS */
389
390	static int tcp_reass(struct tcpcb , const* struct tcphdr , struct* mbuf *,
391	int *);
392	static int tcp_dooptions(struct tcpcb , const* u_char , int*,
393	struct tcphdr , struct* mbuf , int, struct* tcp_opt_info *);
394
395	#ifdef INET
396	static void tcp4_log_refused(const struct ip , const* struct tcphdr *);
397	#endif
398	#ifdef INET6
399	static void tcp6_log_refused(const struct ip6_hdr , const* struct tcphdr *);
400	#endif
401
/*
 * Advance the pointer lvalue x along its m_next links until it refers to
 * the last element of the chain.  x is evaluated more than once, so it
 * must be free of side effects.  Wrapped in do/while(0) so the macro is a
 * single statement wherever it is used.
 */
#define	TRAVERSE(x)	do {						\
	while ((x)->m_next)						\
		(x) = (x)->m_next;					\
} while (/*CONSTCOND*/ 0)
403
404	#if defined(MBUFTRACE)
405	struct mowner tcp_reass_mowner = MOWNER_INIT("tcp", "reass");
406	#endif /* defined(MBUFTRACE) */
407
408	static struct pool tcpipqent_pool;
409
410	void
411	tcpipqent_init(void)
412	{
413
414	pool_init(&tcpipqent_pool, sizeof(struct ipqent), `0`, `0`, `0`, "tcpipqepl",
415	NULL, IPL_VM);
416	}
417
418	struct ipqent *
419	tcpipqent_alloc(void)
420	{
421	struct ipqent *ipqe;
422	int s;
423
424	s = splvm();
425	ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT);
426	splx(s);
427
428	return ipqe;
429	}
430
431	void
432	tcpipqent_free(struct ipqent *ipqe)
433	{
434	int s;
435
436	s = splvm();
437	pool_put(&tcpipqent_pool, ipqe);
438	splx(s);
439	}
440
441	static int
442	tcp_reass(struct tcpcb tp, const* struct tcphdr th, struct* mbuf m, int* *tlen)
443	{
444	struct ipqent p, q, nq, tiqe = NULL;
445	struct socket *so = NULL;
446	int pkt_flags;
447	tcp_seq pkt_seq;
448	unsigned pkt_len;
449	u_long rcvpartdupbyte = `0`;
450	u_long rcvoobyte;
451	#ifdef TCP_REASS_COUNTERS
452	u_int count = `0`;
453	#endif
454	uint64_t *tcps;
455
456	if (tp->t_inpcb)
457	so = tp->t_inpcb->inp_socket;
458	#ifdef INET6
459	else if (tp->t_in6pcb)
460	so = tp->t_in6pcb->in6p_socket;
461	#endif
462
463	TCP_REASS_LOCK_CHECK(tp);
464
465	/*
466	* Call with th==0 after become established to
467	* force pre-ESTABLISHED data up to user socket.
468	*/
469	if (th == `0`)
470	goto present;
471
472	m_claimm(m, &tcp_reass_mowner);
473
474	rcvoobyte = *tlen;
475	/*
476	* Copy these to local variables because the tcpiphdr
477	* gets munged while we are collapsing mbufs.
478	*/
479	pkt_seq = th->th_seq;
480	pkt_len = *tlen;
481	pkt_flags = th->th_flags;
482
483	TCP_REASS_COUNTER_INCR(&tcp_reass_);
484
485	if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) {
486	/*
487	* When we miss a packet, the vast majority of time we get
488	* packets that follow it in order. So optimize for that.
489	*/
490	if (pkt_seq == p->ipqe_seq + p->ipqe_len) {
491	p->ipqe_len += pkt_len;
492	p->ipqe_flags \|= pkt_flags;
493	m_cat(p->ipre_mlast, m);
494	TRAVERSE(p->ipre_mlast);
495	m = NULL;
496	tiqe = p;
497	TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq);
498	TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail);
499	goto skip_replacement;
500	}
501	/*
502	* While we're here, if the pkt is completely beyond
503	* anything we have, just insert it at the tail.
504	*/
505	if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) {
506	TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail);
507	goto insert_it;
508	}
509	}
510
511	q = TAILQ_FIRST(&tp->segq);
512
513	if (q != NULL) {
514	/*
515	* If this segment immediately precedes the first out-of-order
516	* block, simply slap the segment in front of it and (mostly)
517	* skip the complicated logic.
518	*/
519	if (pkt_seq + pkt_len == q->ipqe_seq) {
520	q->ipqe_seq = pkt_seq;
521	q->ipqe_len += pkt_len;
522	q->ipqe_flags \|= pkt_flags;
523	m_cat(m, q->ipqe_m);
524	q->ipqe_m = m;
525	q->ipre_mlast = m; / last mbuf may have changed /
526	TRAVERSE(q->ipre_mlast);
527	tiqe = q;
528	TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
529	TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst);
530	goto skip_replacement;
531	}
532	} else {
533	TCP_REASS_COUNTER_INCR(&tcp_reass_empty);
534	}
535
536	/*
537	* Find a segment which begins after this one does.
538	*/
539	for (p = NULL; q != NULL; q = nq) {
540	nq = TAILQ_NEXT(q, ipqe_q);
541	#ifdef TCP_REASS_COUNTERS
542	count++;
543	#endif
544	/*
545	* If the received segment is just right after this
546	* fragment, merge the two together and then check
547	* for further overlaps.
548	*/
549	if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
550	#ifdef TCPREASS_DEBUG
551	printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
552	tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
553	q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
554	#endif
555	pkt_len += q->ipqe_len;
556	pkt_flags \|= q->ipqe_flags;
557	pkt_seq = q->ipqe_seq;
558	m_cat(q->ipre_mlast, m);
559	TRAVERSE(q->ipre_mlast);
560	m = q->ipqe_m;
561	TCP_REASS_COUNTER_INCR(&tcp_reass_append);
562	goto free_ipqe;
563	}
564	/*
565	* If the received segment is completely past this
566	* fragment, we need to go the next fragment.
567	*/
568	if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
569	p = q;
570	continue;
571	}
572	/*
573	* If the fragment is past the received segment,
574	* it (or any following) can't be concatenated.
575	*/
576	if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) {
577	TCP_REASS_COUNTER_INCR(&tcp_reass_insert);
578	break;
579	}
580
581	/*
582	* We've received all the data in this segment before.
583	* mark it as a duplicate and return.
584	*/
585	if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
586	SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
587	tcps = TCP_STAT_GETREF();
588	tcps[TCP_STAT_RCVDUPPACK]++;
589	tcps[TCP_STAT_RCVDUPBYTE] += pkt_len;
590	TCP_STAT_PUTREF();
591	tcp_new_dsack(tp, pkt_seq, pkt_len);
592	m_freem(m);
593	if (tiqe != NULL) {
594	tcpipqent_free(tiqe);
595	}
596	TCP_REASS_COUNTER_INCR(&tcp_reass_segdup);
597	goto out;
598	}
599	/*
600	* Received segment completely overlaps this fragment
601	* so we drop the fragment (this keeps the temporal
602	* ordering of segments correct).
603	*/
604	if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
605	SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
606	rcvpartdupbyte += q->ipqe_len;
607	m_freem(q->ipqe_m);
608	TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup);
609	goto free_ipqe;
610	}
611	/*
612	* RX'ed segment extends past the end of the
613	* fragment. Drop the overlapping bytes. Then
614	* merge the fragment and segment then treat as
615	* a longer received packet.
616	*/
617	if (SEQ_LT(q->ipqe_seq, pkt_seq) &&
618	SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
619	int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
620	#ifdef TCPREASS_DEBUG
621	printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
622	tp, overlap,
623	pkt_seq, pkt_seq + pkt_len, pkt_len);
624	#endif
625	m_adj(m, overlap);
626	rcvpartdupbyte += overlap;
627	m_cat(q->ipre_mlast, m);
628	TRAVERSE(q->ipre_mlast);
629	m = q->ipqe_m;
630	pkt_seq = q->ipqe_seq;
631	pkt_len += q->ipqe_len - overlap;
632	rcvoobyte -= overlap;
633	TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail);
634	goto free_ipqe;
635	}
636	/*
637	* RX'ed segment extends past the front of the
638	* fragment. Drop the overlapping bytes on the
639	* received packet. The packet will then be
640	* contatentated with this fragment a bit later.
641	*/
642	if (SEQ_GT(q->ipqe_seq, pkt_seq) &&
643	SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) {
644	int overlap = pkt_seq + pkt_len - q->ipqe_seq;
645	#ifdef TCPREASS_DEBUG
646	printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
647	tp, overlap,
648	pkt_seq, pkt_seq + pkt_len, pkt_len);
649	#endif
650	m_adj(m, -overlap);
651	pkt_len -= overlap;
652	rcvpartdupbyte += overlap;
653	TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront);
654	rcvoobyte -= overlap;
655	}
656	/*
657	* If the received segment immediates precedes this
658	* fragment then tack the fragment onto this segment
659	* and reinsert the data.
660	*/
661	if (q->ipqe_seq == pkt_seq + pkt_len) {
662	#ifdef TCPREASS_DEBUG
663	printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
664	tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
665	pkt_seq, pkt_seq + pkt_len, pkt_len);
666	#endif
667	pkt_len += q->ipqe_len;
668	pkt_flags \|= q->ipqe_flags;
669	m_cat(m, q->ipqe_m);
670	TAILQ_REMOVE(&tp->segq, q, ipqe_q);
671	TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
672	tp->t_segqlen--;
673	KASSERT(tp->t_segqlen >= `0`);
674	KASSERT(tp->t_segqlen != `0` \|\|
675	(TAILQ_EMPTY(&tp->segq) &&
676	TAILQ_EMPTY(&tp->timeq)));
677	if (tiqe == NULL) {
678	tiqe = q;
679	} else {
680	tcpipqent_free(q);
681	}
682	TCP_REASS_COUNTER_INCR(&tcp_reass_prepend);
683	break;
684	}
685	/*
686	* If the fragment is before the segment, remember it.
687	* When this loop is terminated, p will contain the
688	* pointer to fragment that is right before the received
689	* segment.
690	*/
691	if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
692	p = q;
693
694	continue;
695
696	/*
697	* This is a common operation. It also will allow
698	* to save doing a malloc/free in most instances.
699	*/
700	free_ipqe:
701	TAILQ_REMOVE(&tp->segq, q, ipqe_q);
702	TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
703	tp->t_segqlen--;
704	KASSERT(tp->t_segqlen >= `0`);
705	KASSERT(tp->t_segqlen != `0` \|\|
706	(TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
707	if (tiqe == NULL) {
708	tiqe = q;
709	} else {
710	tcpipqent_free(q);
711	}
712	}
713
714	#ifdef TCP_REASS_COUNTERS
715	if (count > `7`)
716	TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[`0`]);
717	else if (count > `0`)
718	TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]);
719	#endif
720
721	insert_it:
722
723	/*
724	* Allocate a new queue entry since the received segment did not
725	* collapse onto any other out-of-order block; thus we are allocating
726	* a new block. If it had collapsed, tiqe would not be NULL and
727	* we would be reusing it.
728	* XXX If we can't, just drop the packet. XXX
729	*/
730	if (tiqe == NULL) {
731	tiqe = tcpipqent_alloc();
732	if (tiqe == NULL) {
733	TCP_STATINC(TCP_STAT_RCVMEMDROP);
734	m_freem(m);
735	goto out;
736	}
737	}
738
739	/*
740	* Update the counters.
741	*/
742	tp->t_rcvoopack++;
743	tcps = TCP_STAT_GETREF();
744	tcps[TCP_STAT_RCVOOPACK]++;
745	tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte;
746	if (rcvpartdupbyte) {
747	tcps[TCP_STAT_RCVPARTDUPPACK]++;
748	tcps[TCP_STAT_RCVPARTDUPBYTE] += rcvpartdupbyte;
749	}
750	TCP_STAT_PUTREF();
751
752	/*
753	* Insert the new fragment queue entry into both queues.
754	*/
755	tiqe->ipqe_m = m;
756	tiqe->ipre_mlast = m;
757	tiqe->ipqe_seq = pkt_seq;
758	tiqe->ipqe_len = pkt_len;
759	tiqe->ipqe_flags = pkt_flags;
760	if (p == NULL) {
761	TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
762	#ifdef TCPREASS_DEBUG
763	if (tiqe->ipqe_seq != tp->rcv_nxt)
764	printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
765	tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
766	#endif
767	} else {
768	TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q);
769	#ifdef TCPREASS_DEBUG
770	printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
771	tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
772	p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
773	#endif
774	}
775	tp->t_segqlen++;
776
777	skip_replacement:
778
779	TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);
780
781	present:
782	/*
783	* Present data to user, advancing rcv_nxt through
784	* completed sequence space.
785	*/
786	if (TCPS_HAVEESTABLISHED(tp->t_state) == `0`)
787	goto out;
788	q = TAILQ_FIRST(&tp->segq);
789	if (q == NULL \|\| q->ipqe_seq != tp->rcv_nxt)
790	goto out;
791	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
792	goto out;
793
794	tp->rcv_nxt += q->ipqe_len;
795	pkt_flags = q->ipqe_flags & TH_FIN;
796	nd6_hint(tp);
797
798	TAILQ_REMOVE(&tp->segq, q, ipqe_q);
799	TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
800	tp->t_segqlen--;
801	KASSERT(tp->t_segqlen >= `0`);
802	KASSERT(tp->t_segqlen != `0` \|\|
803	(TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
804	if (so->so_state & SS_CANTRCVMORE)
805	m_freem(q->ipqe_m);
806	else
807	sbappendstream(&so->so_rcv, q->ipqe_m);
808	tcpipqent_free(q);
809	TCP_REASS_UNLOCK(tp);
810	sorwakeup(so);
811	return (pkt_flags);
812	out:
813	TCP_REASS_UNLOCK(tp);
814	return (`0`);
815	}
816
#ifdef INET6
/*
 * IPv6 entry point for TCP input.
 *
 * mp:    address of the pointer to the received packet.
 * offp:  pointer to an offset into the packet; *offp is passed through to
 *        tcp_input().
 * proto: protocol number, passed through to tcp_input().
 *
 * Packets flagged M_ANYCAST6 are not handed to TCP: an ICMPv6 "address
 * unreachable" error pointing at the destination-address field is sent
 * instead.  Always returns IPPROTO_DONE.
 */
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	if (m->m_flags & M_ANYCAST6) {
		struct ip6_hdr *ip6;

		/* Make sure the whole IPv6 header is in the first mbuf. */
		if (m->m_len < sizeof(struct ip6_hdr)) {
			if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
				TCP_STATINC(TCP_STAT_RCVSHORT);
				return IPPROTO_DONE;
			}
		}
		ip6 = mtod(m, struct ip6_hdr *);
		/* Last argument: byte offset of ip6_dst within the header. */
		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
		    (char *)&ip6->ip6_dst - (char *)ip6);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif
845
#ifdef INET
/*
 * Log, at LOG_INFO, a connection attempt over IPv4.
 *
 * ip: IPv4 header supplying the addresses, or NULL, in which case both
 *     addresses are printed as "(unknown)".
 * th: TCP header supplying the ports; th_sport and th_dport are read in
 *     network byte order.
 */
static void
tcp4_log_refused(const struct ip *ip, const struct tcphdr *th)
{
	char src[INET_ADDRSTRLEN];
	char dst[INET_ADDRSTRLEN];

	if (ip) {
		in_print(src, sizeof(src), &ip->ip_src);
		in_print(dst, sizeof(dst), &ip->ip_dst);
	} else {
		strlcpy(src, "(unknown)", sizeof(src));
		strlcpy(dst, "(unknown)", sizeof(dst));
	}
	log(LOG_INFO,
	    "Connection attempt to TCP %s:%d from %s:%d\n",
	    dst, ntohs(th->th_dport),
	    src, ntohs(th->th_sport));
}
#endif
867
#ifdef INET6
/*
 * Log, at LOG_INFO, a connection attempt over IPv6.
 *
 * ip6: IPv6 header supplying the addresses, or NULL, in which case both
 *      addresses are printed as "(unknown v6)".
 * th:  TCP header supplying the ports; th_sport and th_dport are read in
 *      network byte order.
 */
static void
tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th)
{
	char src[INET6_ADDRSTRLEN];
	char dst[INET6_ADDRSTRLEN];

	if (ip6) {
		in6_print(src, sizeof(src), &ip6->ip6_src);
		in6_print(dst, sizeof(dst), &ip6->ip6_dst);
	} else {
		strlcpy(src, "(unknown v6)", sizeof(src));
		strlcpy(dst, "(unknown v6)", sizeof(dst));
	}
	log(LOG_INFO,
	    "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
	    dst, ntohs(th->th_dport),
	    src, ntohs(th->th_sport));
}
#endif
889
890	/*
891	* Checksum extended TCP header and data.
892	*/
893	int
894	tcp_input_checksum(int af, struct mbuf m, const* struct tcphdr *th,
895	int toff, int off, int tlen)
896	{
897	struct ifnet *rcvif;
898	int s;
899
900	/*
901	* XXX it's better to record and check if this mbuf is
902	* already checked.
903	*/
904
905	rcvif = m_get_rcvif(m, &s);
906
907	switch (af) {
908	#ifdef INET
909	case AF_INET:
910	switch (m->m_pkthdr.csum_flags &
911	((rcvif->if_csum_flags_rx & M_CSUM_TCPv4) \|
912	M_CSUM_TCP_UDP_BAD \| M_CSUM_DATA)) {
913	case M_CSUM_TCPv4\|M_CSUM_TCP_UDP_BAD:
914	TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
915	goto badcsum;
916
917	case M_CSUM_TCPv4\|M_CSUM_DATA: {
918	u_int32_t hw_csum = m->m_pkthdr.csum_data;
919
920	TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
921	if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) {
922	const struct ip *ip =
923	mtod(m, const struct ip *);
924
925	hw_csum = in_cksum_phdr(ip->ip_src.s_addr,
926	ip->ip_dst.s_addr,
927	htons(hw_csum + tlen + off + IPPROTO_TCP));
928	}
929	if ((hw_csum ^ `0xffff`) != `0`)
930	goto badcsum;
931	break;
932	}
933
934	case M_CSUM_TCPv4:
935	/ Checksum was okay. /
936	TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
937	break;
938
939	default:
940	/*
941	* Must compute it ourselves. Maybe skip checksum
942	* on loopback interfaces.
943	*/
944	if (__predict_true(!(rcvif->if_flags & IFF_LOOPBACK) \|\|
945	tcp_do_loopback_cksum)) {
946	TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
947	if (in4_cksum(m, IPPROTO_TCP, toff,
948	tlen + off) != `0`)
949	goto badcsum;
950	}
951	break;
952	}
953	break;
954	#endif /* INET4 */
955
956	#ifdef INET6
957	case AF_INET6:
958	switch (m->m_pkthdr.csum_flags &
959	((rcvif->if_csum_flags_rx & M_CSUM_TCPv6) \|
960	M_CSUM_TCP_UDP_BAD \| M_CSUM_DATA)) {
961	case M_CSUM_TCPv6\|M_CSUM_TCP_UDP_BAD:
962	TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad);
963	goto badcsum;
964
965	#if 0 /* notyet */
966	case M_CSUM_TCPv6\|M_CSUM_DATA:
967	#endif
968
969	case M_CSUM_TCPv6:
970	/ Checksum was okay. /
971	TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok);
972	break;
973
974	default:
975	/*
976	* Must compute it ourselves. Maybe skip checksum
977	* on loopback interfaces.
978	*/
979	if (__predict_true((m->m_flags & M_LOOP) == `0` \|\|
980	tcp_do_loopback_cksum)) {
981	TCP_CSUM_COUNTER_INCR(&tcp6_swcsum);
982	if (in6_cksum(m, IPPROTO_TCP, toff,
983	tlen + off) != `0`)
984	goto badcsum;
985	}
986	}
987	break;
988	#endif /* INET6 */
989	}
990	m_put_rcvif(rcvif, &s);
991
992	return `0`;
993
994	badcsum:
995	m_put_rcvif(rcvif, &s);
996	TCP_STATINC(TCP_STAT_RCVBADSUM);
997	return -`1`;
998	}
999
1000	/* When a packet arrives addressed to a vestigial tcpcb, we
1001	* nevertheless have to respond to it per the spec.
1002	*/
1003	static void tcp_vtw_input(struct tcphdr th, vestigial_inpcb_t vp,
1004	struct mbuf m, int* tlen, int multicast)
1005	{
1006	int tiflags;
1007	int todrop;
1008	uint32_t t_flags = `0`;
1009	uint64_t *tcps;
1010
1011	tiflags = th->th_flags;
1012	todrop = vp->rcv_nxt - th->th_seq;
1013
1014	if (todrop > `0`) {
1015	if (tiflags & TH_SYN) {
1016	tiflags &= ~TH_SYN;
1017	++th->th_seq;
1018	if (th->th_urp > `1`)
1019	--th->th_urp;
1020	else {
1021	tiflags &= ~TH_URG;
1022	th->th_urp = `0`;
1023	}
1024	--todrop;
1025	}
1026	if (todrop > tlen \|\|
1027	(todrop == tlen && (tiflags & TH_FIN) == `0`)) {
1028	/*
1029	* Any valid FIN or RST must be to the left of the
1030	* window. At this point the FIN or RST must be a
1031	* duplicate or out of sequence; drop it.
1032	*/
1033	if (tiflags & TH_RST)
1034	goto drop;
1035	tiflags &= ~(TH_FIN\|TH_RST);
1036	/*
1037	* Send an ACK to resynchronize and drop any data.
1038	* But keep on processing for RST or ACK.
1039	*/
1040	t_flags \|= TF_ACKNOW;
1041	todrop = tlen;
1042	tcps = TCP_STAT_GETREF();
1043	tcps[TCP_STAT_RCVDUPPACK] += `1`;
1044	tcps[TCP_STAT_RCVDUPBYTE] += todrop;
1045	TCP_STAT_PUTREF();
1046	} else if ((tiflags & TH_RST)
1047	&& th->th_seq != vp->rcv_nxt) {
1048	/*
1049	* Test for reset before adjusting the sequence
1050	* number for overlapping data.
1051	*/
1052	goto dropafterack_ratelim;
1053	} else {
1054	tcps = TCP_STAT_GETREF();
1055	tcps[TCP_STAT_RCVPARTDUPPACK] += `1`;
1056	tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop;
1057	TCP_STAT_PUTREF();
1058	}
1059
1060	// tcp_new_dsack(tp, th->th_seq, todrop);
1061	// hdroptlen += todrop; /drop from head afterwards/
1062
1063	th->th_seq += todrop;
1064	tlen -= todrop;
1065
1066	if (th->th_urp > todrop)
1067	th->th_urp -= todrop;
1068	else {
1069	tiflags &= ~TH_URG;
1070	th->th_urp = `0`;
1071	}
1072	}
1073
1074	/*
1075	* If new data are received on a connection after the
1076	* user processes are gone, then RST the other end.
1077	*/
1078	if (tlen) {
1079	TCP_STATINC(TCP_STAT_RCVAFTERCLOSE);
1080	goto dropwithreset;
1081	}
1082
1083	/*
1084	* If segment ends after window, drop trailing data
1085	* (and PUSH and FIN); if nothing left, just ACK.
1086	*/
1087	todrop = (th->th_seq + tlen) - (vp->rcv_nxt+vp->rcv_wnd);
1088
1089	if (todrop > `0`) {
1090	TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN);
1091	if (todrop >= tlen) {
1092	/*
1093	* The segment actually starts after the window.
1094	* th->th_seq + tlen - vp->rcv_nxt - vp->rcv_wnd >= tlen
1095	* th->th_seq - vp->rcv_nxt - vp->rcv_wnd >= 0
1096	* th->th_seq >= vp->rcv_nxt + vp->rcv_wnd
1097	*/
1098	TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen);
1099	/*
1100	* If a new connection request is received
1101	* while in TIME_WAIT, drop the old connection
1102	* and start over if the sequence numbers
1103	* are above the previous ones.
1104	*/
1105	if ((tiflags & TH_SYN)
1106	&& SEQ_GT(th->th_seq, vp->rcv_nxt)) {
1107	/ We only support this in the !NOFDREF case, which*
1108	* is to say: not here.
1109	*/
1110	goto dropwithreset;
1111	}
1112	/*
1113	* If window is closed can only take segments at
1114	* window edge, and have to drop data and PUSH from
1115	* incoming segments. Continue processing, but
1116	* remember to ack. Otherwise, drop segment
1117	* and (if not RST) ack.
1118	*/
1119	if (vp->rcv_wnd == `0` && th->th_seq == vp->rcv_nxt) {
1120	t_flags \|= TF_ACKNOW;
1121	TCP_STATINC(TCP_STAT_RCVWINPROBE);
1122	} else
1123	goto dropafterack;
1124	} else
1125	TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop);
1126	m_adj(m, -todrop);
1127	tlen -= todrop;
1128	tiflags &= ~(TH_PUSH\|TH_FIN);
1129	}
1130
1131	if (tiflags & TH_RST) {
1132	if (th->th_seq != vp->rcv_nxt)
1133	goto dropafterack_ratelim;
1134
1135	vtw_del(vp->ctl, vp->vtw);
1136	goto drop;
1137	}
1138
1139	/*
1140	* If the ACK bit is off we drop the segment and return.
1141	*/
1142	if ((tiflags & TH_ACK) == `0`) {
1143	if (t_flags & TF_ACKNOW)
1144	goto dropafterack;
1145	else
1146	goto drop;
1147	}
1148
1149	/*
1150	* In TIME_WAIT state the only thing that should arrive
1151	* is a retransmission of the remote FIN. Acknowledge
1152	* it and restart the finack timer.
1153	*/
1154	vtw_restart(vp);
1155	goto dropafterack;
1156
1157	dropafterack:
1158	/*
1159	* Generate an ACK dropping incoming segment if it occupies
1160	* sequence space, where the ACK reflects our state.
1161	*/
1162	if (tiflags & TH_RST)
1163	goto drop;
1164	goto dropafterack2;
1165
1166	dropafterack_ratelim:
1167	/*
1168	* We may want to rate-limit ACKs against SYN/RST attack.
1169	*/
1170	if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
1171	tcp_ackdrop_ppslim) == `0`) {
1172	/ XXX stat /
1173	goto drop;
1174	}
1175	/ ...fall into dropafterack2... /
1176
1177	dropafterack2:
1178	(void)tcp_respond(`0`, m, m, th, th->th_seq + tlen, th->th_ack,
1179	TH_ACK);
1180	return;
1181
1182	dropwithreset:
1183	/*
1184	* Generate a RST, dropping incoming segment.
1185	* Make ACK acceptable to originator of segment.
1186	*/
1187	if (tiflags & TH_RST)
1188	goto drop;
1189
1190	if (tiflags & TH_ACK)
1191	tcp_respond(`0`, m, m, th, (tcp_seq)`0`, th->th_ack, TH_RST);
1192	else {
1193	if (tiflags & TH_SYN)
1194	++tlen;
1195	(void)tcp_respond(`0`, m, m, th, th->th_seq + tlen, (tcp_seq)`0`,
1196	TH_RST\|TH_ACK);
1197	}
1198	return;
1199	drop:
1200	m_freem(m);
1201	}
1202
1203	/*
1204	* TCP input routine, follows pages 65-76 of RFC 793 very closely.
1205	*/
1206	void
1207	tcp_input(struct mbuf *m, ...)
1208	{
1209	struct tcphdr *th;
1210	struct ip *ip;
1211	struct inpcb *inp;
1212	#ifdef INET6
1213	struct ip6_hdr *ip6;
1214	struct in6pcb *in6p;
1215	#endif
1216	u_int8_t *optp = NULL;
1217	int optlen = `0`;
1218	int len, tlen, toff, hdroptlen = `0`;
1219	struct tcpcb *tp = `0`;
1220	int tiflags;
1221	struct socket *so = NULL;
1222	int todrop, acked, ourfinisacked, needoutput = `0`;
1223	bool dupseg;
1224	#ifdef TCP_DEBUG
1225	short ostate = `0`;
1226	#endif
1227	u_long tiwin;
1228	struct tcp_opt_info opti;
1229	int off, iphlen;
1230	va_list ap;
1231	int af; / af on the wire /
1232	struct mbuf *tcp_saveti = NULL;
1233	uint32_t ts_rtt;
1234	uint8_t iptos;
1235	uint64_t *tcps;
1236	vestigial_inpcb_t vestige;
1237
1238	vestige.valid = `0`;
1239
1240	MCLAIM(m, &tcp_rx_mowner);
1241	va_start(ap, m);
1242	toff = va_arg(ap, int);
1243	(void)va_arg(ap, int); / ignore value, advance ap /
1244	va_end(ap);
1245
1246	TCP_STATINC(TCP_STAT_RCVTOTAL);
1247
1248	memset(&opti, `0`, sizeof(opti));
1249	opti.ts_present = `0`;
1250	opti.maxseg = `0`;
1251
1252	/*
1253	* RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
1254	*
1255	* TCP is, by definition, unicast, so we reject all
1256	* multicast outright.
1257	*
1258	* Note, there are additional src/dst address checks in
1259	* the AF-specific code below.
1260	*/
1261	if (m->m_flags & (M_BCAST\|M_MCAST)) {
1262	/ XXX stat /
1263	goto drop;
1264	}
1265	#ifdef INET6
1266	if (m->m_flags & M_ANYCAST6) {
1267	/ XXX stat /
1268	goto drop;
1269	}
1270	#endif
1271
1272	/*
1273	* Get IP and TCP header.
1274	* Note: IP leaves IP header in first mbuf.
1275	*/
1276	ip = mtod(m, struct ip *);
1277	switch (ip->ip_v) {
1278	#ifdef INET
1279	case `4`:
1280	#ifdef INET6
1281	ip6 = NULL;
1282	#endif
1283	af = AF_INET;
1284	iphlen = sizeof(struct ip);
1285	IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
1286	sizeof(struct tcphdr));
1287	if (th == NULL) {
1288	TCP_STATINC(TCP_STAT_RCVSHORT);
1289	return;
1290	}
1291	/ We do the checksum after PCB lookup... /
1292	len = ntohs(ip->ip_len);
1293	tlen = len - toff;
1294	iptos = ip->ip_tos;
1295	break;
1296	#endif
1297	#ifdef INET6
1298	case `6`:
1299	ip = NULL;
1300	iphlen = sizeof(struct ip6_hdr);
1301	af = AF_INET6;
1302	ip6 = mtod(m, struct ip6_hdr *);
1303	IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
1304	sizeof(struct tcphdr));
1305	if (th == NULL) {
1306	TCP_STATINC(TCP_STAT_RCVSHORT);
1307	return;
1308	}
1309
1310	/ Be proactive about malicious use of IPv4 mapped address /
1311	if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) \|\|
1312	IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
1313	/ XXX stat /
1314	goto drop;
1315	}
1316
1317	/*
1318	* Be proactive about unspecified IPv6 address in source.
1319	* As we use all-zero to indicate unbounded/unconnected pcb,
1320	* unspecified IPv6 address can be used to confuse us.
1321	*
1322	* Note that packets with an unspecified IPv6 destination are
1323	* already dropped in ip6_input.
1324	*/
1325	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1326	/ XXX stat /
1327	goto drop;
1328	}
1329
1330	/*
1331	* Make sure destination address is not multicast.
1332	* Source address checked in ip6_input().
1333	*/
1334	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
1335	/ XXX stat /
1336	goto drop;
1337	}
1338
1339	/ We do the checksum after PCB lookup... /
1340	len = m->m_pkthdr.len;
1341	tlen = len - toff;
1342	iptos = (ntohl(ip6->ip6_flow) >> `20`) & `0xff`;
1343	break;
1344	#endif
1345	default:
1346	m_freem(m);
1347	return;
1348	}
1349	/*
1350	* Enforce alignment requirements that are violated in
1351	* some cases, see kern/50766 for details.
1352	*/
1353	if (TCP_HDR_ALIGNED_P(th) == `0`) {
1354	m = m_copyup(m, toff + sizeof(struct tcphdr), `0`);
1355	if (m == NULL) {
1356	TCP_STATINC(TCP_STAT_RCVSHORT);
1357	return;
1358	}
1359	ip = mtod(m, struct ip *);
1360	#ifdef INET6
1361	ip6 = mtod(m, struct ip6_hdr *);
1362	#endif
1363	th = (struct tcphdr )(mtod(m, char* *) + toff);
1364	}
1365	KASSERT(TCP_HDR_ALIGNED_P(th));
1366
1367	/*
1368	* Check that TCP offset makes sense,
1369	* pull out TCP options and adjust length. XXX
1370	*/
1371	off = th->th_off << `2`;
1372	if (off < sizeof (struct tcphdr) \|\| off > tlen) {
1373	TCP_STATINC(TCP_STAT_RCVBADOFF);
1374	goto drop;
1375	}
1376	tlen -= off;
1377
1378	/*
1379	* tcp_input() has been modified to use tlen to mean the TCP data
1380	* length throughout the function. Other functions can use
1381	* m->m_pkthdr.len as the basis for calculating the TCP data length.
1382	* rja
1383	*/
1384
1385	if (off > sizeof (struct tcphdr)) {
1386	IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off);
1387	if (th == NULL) {
1388	TCP_STATINC(TCP_STAT_RCVSHORT);
1389	return;
1390	}
1391	/*
1392	* NOTE: ip/ip6 will not be affected by m_pulldown()
1393	* (as they're before toff) and we don't need to update those.
1394	*/
1395	KASSERT(TCP_HDR_ALIGNED_P(th));
1396	optlen = off - sizeof (struct tcphdr);
1397	optp = ((u_int8_t )th) + sizeof(struct* tcphdr);
1398	/*
1399	* Do quick retrieval of timestamp options ("options
1400	* prediction?"). If timestamp is the only option and it's
1401	* formatted as recommended in RFC 1323 appendix A, we
1402	* quickly get the values now and not bother calling
1403	* tcp_dooptions(), etc.
1404	*/
1405	if ((optlen == TCPOLEN_TSTAMP_APPA \|\|
1406	(optlen > TCPOLEN_TSTAMP_APPA &&
1407	optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1408	(u_int32_t )optp == htonl(TCPOPT_TSTAMP_HDR) &&
1409	(th->th_flags & TH_SYN) == `0`) {
1410	opti.ts_present = `1`;
1411	opti.ts_val = ntohl((u_int32_t )(optp + `4`));
1412	opti.ts_ecr = ntohl((u_int32_t )(optp + `8`));
1413	optp = NULL; / we've parsed the options /
1414	}
1415	}
1416	tiflags = th->th_flags;
1417
1418	/*
1419	* Checksum extended TCP header and data
1420	*/
1421	if (tcp_input_checksum(af, m, th, toff, off, tlen))
1422	goto badcsum;
1423
1424	/*
1425	* Locate pcb for segment.
1426	*/
1427	findpcb:
1428	inp = NULL;
1429	#ifdef INET6
1430	in6p = NULL;
1431	#endif
1432	switch (af) {
1433	#ifdef INET
1434	case AF_INET:
1435	inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport,
1436	ip->ip_dst, th->th_dport,
1437	&vestige);
1438	if (inp == `0` && !vestige.valid) {
1439	TCP_STATINC(TCP_STAT_PCBHASHMISS);
1440	inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport);
1441	}
1442	#ifdef INET6
1443	if (inp == `0` && !vestige.valid) {
1444	struct in6_addr s, d;
1445
1446	/ mapped addr case /
1447	in6_in_2_v4mapin6(&ip->ip_src, &s);
1448	in6_in_2_v4mapin6(&ip->ip_dst, &d);
1449	in6p = in6_pcblookup_connect(&tcbtable, &s,
1450	th->th_sport, &d, th->th_dport,
1451	`0`, &vestige);
1452	if (in6p == `0` && !vestige.valid) {
1453	TCP_STATINC(TCP_STAT_PCBHASHMISS);
1454	in6p = in6_pcblookup_bind(&tcbtable, &d,
1455	th->th_dport, `0`);
1456	}
1457	}
1458	#endif
1459	#ifndef INET6
1460	if (inp == `0` && !vestige.valid)
1461	#else
1462	if (inp == `0` && in6p == `0` && !vestige.valid)
1463	#endif
1464	{
1465	TCP_STATINC(TCP_STAT_NOPORT);
1466	if (tcp_log_refused &&
1467	(tiflags & (TH_RST\|TH_ACK\|TH_SYN)) == TH_SYN) {
1468	tcp4_log_refused(ip, th);
1469	}
1470	tcp_fields_to_host(th);
1471	goto dropwithreset_ratelim;
1472	}
1473	#if defined(IPSEC)
1474	if (ipsec_used) {
1475	if (inp &&
1476	(inp->inp_socket->so_options & SO_ACCEPTCONN) == `0`
1477	&& ipsec4_in_reject(m, inp)) {
1478	IPSEC_STATINC(IPSEC_STAT_IN_POLVIO);
1479	goto drop;
1480	}
1481	#ifdef INET6
1482	else if (in6p &&
1483	(in6p->in6p_socket->so_options & SO_ACCEPTCONN) == `0`
1484	&& ipsec6_in_reject_so(m, in6p->in6p_socket)) {
1485	IPSEC_STATINC(IPSEC_STAT_IN_POLVIO);
1486	goto drop;
1487	}
1488	#endif
1489	}
1490	#endif /IPSEC/
1491	break;
1492	#endif /INET/
1493	#ifdef INET6
1494	case AF_INET6:
1495	{
1496	int faith;
1497
1498	#if defined(NFAITH) && NFAITH > 0
1499	faith = faithprefix(&ip6->ip6_dst);
1500	#else
1501	faith = `0`;
1502	#endif
1503	in6p = in6_pcblookup_connect(&tcbtable, &ip6->ip6_src,
1504	th->th_sport, &ip6->ip6_dst, th->th_dport, faith, &vestige);
1505	if (!in6p && !vestige.valid) {
1506	TCP_STATINC(TCP_STAT_PCBHASHMISS);
1507	in6p = in6_pcblookup_bind(&tcbtable, &ip6->ip6_dst,
1508	th->th_dport, faith);
1509	}
1510	if (!in6p && !vestige.valid) {
1511	TCP_STATINC(TCP_STAT_NOPORT);
1512	if (tcp_log_refused &&
1513	(tiflags & (TH_RST\|TH_ACK\|TH_SYN)) == TH_SYN) {
1514	tcp6_log_refused(ip6, th);
1515	}
1516	tcp_fields_to_host(th);
1517	goto dropwithreset_ratelim;
1518	}
1519	#if defined(IPSEC)
1520	if (ipsec_used && in6p
1521	&& (in6p->in6p_socket->so_options & SO_ACCEPTCONN) == `0`
1522	&& ipsec6_in_reject(m, in6p)) {
1523	IPSEC6_STATINC(IPSEC_STAT_IN_POLVIO);
1524	goto drop;
1525	}
1526	#endif /IPSEC/
1527	break;
1528	}
1529	#endif
1530	}
1531
1532	/*
1533	* If the state is CLOSED (i.e., TCB does not exist) then
1534	* all data in the incoming segment is discarded.
1535	* If the TCB exists but is in CLOSED state, it is embryonic,
1536	* but should either do a listen or a connect soon.
1537	*/
1538	tp = NULL;
1539	so = NULL;
1540	if (inp) {
1541	/ Check the minimum TTL for socket. /
1542	if (ip->ip_ttl < inp->inp_ip_minttl)
1543	goto drop;
1544
1545	tp = intotcpcb(inp);
1546	so = inp->inp_socket;
1547	}
1548	#ifdef INET6
1549	else if (in6p) {
1550	tp = in6totcpcb(in6p);
1551	so = in6p->in6p_socket;
1552	}
1553	#endif
1554	else if (vestige.valid) {
1555	int mc = `0`;
1556
1557	/* We do not support the resurrection of vtw tcpcbs.
1558	 */
1559	if (tcp_input_checksum(af, m, th, toff, off, tlen))
1560	goto badcsum;
1561
1562	switch (af) {
1563	#ifdef INET6
1564	case AF_INET6:
1565	mc = IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst);
1566	break;
1567	#endif
1568
1569	case AF_INET:
1570	mc = (IN_MULTICAST(ip->ip_dst.s_addr)
1571	\|\| in_broadcast(ip->ip_dst,
1572	m_get_rcvif_NOMPSAFE(m)));
1573	break;
1574	}
1575
1576	tcp_fields_to_host(th);
1577	tcp_vtw_input(th, &vestige, m, tlen, mc);
1578	m = `0`;
1579	goto drop;
1580	}
1581
1582	if (tp == `0`) {
1583	tcp_fields_to_host(th);
1584	goto dropwithreset_ratelim;
1585	}
1586	if (tp->t_state == TCPS_CLOSED)
1587	goto drop;
1588
1589	KASSERT(so->so_lock == softnet_lock);
1590	KASSERT(solocked(so));
1591
1592	tcp_fields_to_host(th);
1593
1594	/ Unscale the window into a 32-bit value. /
1595	if ((tiflags & TH_SYN) == `0`)
1596	tiwin = th->th_win << tp->snd_scale;
1597	else
1598	tiwin = th->th_win;
1599
1600	#ifdef INET6
1601	/ save packet options if user wanted /
1602	if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) {
1603	if (in6p->in6p_options) {
1604	m_freem(in6p->in6p_options);
1605	in6p->in6p_options = `0`;
1606	}
1607	KASSERT(ip6 != NULL);
1608	ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m);
1609	}
1610	#endif
1611
1612	if (so->so_options & (SO_DEBUG\|SO_ACCEPTCONN)) {
1613	union syn_cache_sa src;
1614	union syn_cache_sa dst;
1615
1616	memset(&src, `0`, sizeof(src));
1617	memset(&dst, `0`, sizeof(dst));
1618	switch (af) {
1619	#ifdef INET
1620	case AF_INET:
1621	src.sin.sin_len = sizeof(struct sockaddr_in);
1622	src.sin.sin_family = AF_INET;
1623	src.sin.sin_addr = ip->ip_src;
1624	src.sin.sin_port = th->th_sport;
1625
1626	dst.sin.sin_len = sizeof(struct sockaddr_in);
1627	dst.sin.sin_family = AF_INET;
1628	dst.sin.sin_addr = ip->ip_dst;
1629	dst.sin.sin_port = th->th_dport;
1630	break;
1631	#endif
1632	#ifdef INET6
1633	case AF_INET6:
1634	src.sin6.sin6_len = sizeof(struct sockaddr_in6);
1635	src.sin6.sin6_family = AF_INET6;
1636	src.sin6.sin6_addr = ip6->ip6_src;
1637	src.sin6.sin6_port = th->th_sport;
1638
1639	dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
1640	dst.sin6.sin6_family = AF_INET6;
1641	dst.sin6.sin6_addr = ip6->ip6_dst;
1642	dst.sin6.sin6_port = th->th_dport;
1643	break;
1644	#endif /* INET6 */
1645	default:
1646	goto badsyn; /sanity/
1647	}
1648
1649	if (so->so_options & SO_DEBUG) {
1650	#ifdef TCP_DEBUG
1651	ostate = tp->t_state;
1652	#endif
1653
1654	tcp_saveti = NULL;
1655	if (iphlen + sizeof(struct tcphdr) > MHLEN)
1656	goto nosave;
1657
1658	if (m->m_len > iphlen && (m->m_flags & M_EXT) == `0`) {
1659	tcp_saveti = m_copym(m, `0`, iphlen, M_DONTWAIT);
1660	if (!tcp_saveti)
1661	goto nosave;
1662	} else {
1663	MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER);
1664	if (!tcp_saveti)
1665	goto nosave;
1666	MCLAIM(m, &tcp_mowner);
1667	tcp_saveti->m_len = iphlen;
1668	m_copydata(m, `0`, iphlen,
1669	mtod(tcp_saveti, void *));
1670	}
1671
1672	if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) {
1673	m_freem(tcp_saveti);
1674	tcp_saveti = NULL;
1675	} else {
1676	tcp_saveti->m_len += sizeof(struct tcphdr);
1677	memcpy(mtod(tcp_saveti, char *) + iphlen, th,
1678	sizeof(struct tcphdr));
1679	}
1680	nosave:;
1681	}
1682	if (so->so_options & SO_ACCEPTCONN) {
1683	if ((tiflags & (TH_RST\|TH_ACK\|TH_SYN)) != TH_SYN) {
1684	if (tiflags & TH_RST) {
1685	syn_cache_reset(&src.sa, &dst.sa, th);
1686	} else if ((tiflags & (TH_ACK\|TH_SYN)) ==
1687	(TH_ACK\|TH_SYN)) {
1688	/*
1689	* Received a SYN,ACK. This should
1690	* never happen while we are in
1691	* LISTEN. Send an RST.
1692	*/
1693	goto badsyn;
1694	} else if (tiflags & TH_ACK) {
1695	so = syn_cache_get(&src.sa, &dst.sa,
1696	th, toff, tlen, so, m);
1697	if (so == NULL) {
1698	/*
1699	* We don't have a SYN for
1700	* this ACK; send an RST.
1701	*/
1702	goto badsyn;
1703	} else if (so ==
1704	(struct socket *)(-`1`)) {
1705	/*
1706	* We were unable to create
1707	* the connection. If the
1708	* 3-way handshake was
1709	* completed, an RST has
1710	* been sent to the peer.
1711	* Since the mbuf might be
1712	* in use for the reply,
1713	* do not free it.
1714	*/
1715	m = NULL;
1716	} else {
1717	/*
1718	* We have created a
1719	* full-blown connection.
1720	*/
1721	tp = NULL;
1722	inp = NULL;
1723	#ifdef INET6
1724	in6p = NULL;
1725	#endif
1726	switch (so->so_proto->pr_domain->dom_family) {
1727	#ifdef INET
1728	case AF_INET:
1729	inp = sotoinpcb(so);
1730	tp = intotcpcb(inp);
1731	break;
1732	#endif
1733	#ifdef INET6
1734	case AF_INET6:
1735	in6p = sotoin6pcb(so);
1736	tp = in6totcpcb(in6p);
1737	break;
1738	#endif
1739	}
1740	if (tp == NULL)
1741	goto badsyn; /XXX/
1742	tiwin <<= tp->snd_scale;
1743	goto after_listen;
1744	}
1745	} else {
1746	/*
1747	* None of RST, SYN or ACK was set.
1748	* This is an invalid packet for a
1749	* TCB in LISTEN state. Send a RST.
1750	*/
1751	goto badsyn;
1752	}
1753	} else {
1754	/*
1755	* Received a SYN.
1756	*
1757	* RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1758	*/
1759	if (m->m_flags & (M_BCAST\|M_MCAST))
1760	goto drop;
1761
1762	switch (af) {
1763	#ifdef INET6
1764	case AF_INET6:
1765	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
1766	goto drop;
1767	break;
1768	#endif /* INET6 */
1769	case AF_INET:
1770	if (IN_MULTICAST(ip->ip_dst.s_addr) \|\|
1771	in_broadcast(ip->ip_dst,
1772	m_get_rcvif_NOMPSAFE(m)))
1773	goto drop;
1774	break;
1775	}
1776
1777	#ifdef INET6
1778	/*
1779	* If deprecated address is forbidden, we do
1780	* not accept SYN to deprecated interface
1781	* address to prevent any new inbound
1782	* connection from getting established.
1783	* When we do not accept SYN, we send a TCP
1784	* RST, with deprecated source address (instead
1785	* of dropping it). We compromise it as it is
1786	* much better for peer to send a RST, and
1787	* RST will be the final packet for the
1788	* exchange.
1789	*
1790	* If we do not forbid deprecated addresses, we
1791	* accept the SYN packet. RFC2462 does not
1792	* suggest dropping SYN in this case.
1793	* If we decipher RFC2462 5.5.4, it says like
1794	* this:
1795	* 1. use of deprecated addr with existing
1796	* communication is okay - "SHOULD continue
1797	* to be used"
1798	* 2. use of it with new communication:
1799	* (2a) "SHOULD NOT be used if alternate
1800	* address with sufficient scope is
1801	* available"
1802	* (2b) nothing mentioned otherwise.
1803	* Here we fall into (2b) case as we have no
1804	* choice in our source address selection - we
1805	* must obey the peer.
1806	*
1807	* The wording in RFC2462 is confusing, and
1808	* there are multiple descriptions of
1809	* deprecated address handling - worse, they
1810	* are not exactly the same. I believe 5.5.4
1811	* is the best one, so we follow 5.5.4.
1812	*/
1813	if (af == AF_INET6 && !ip6_use_deprecated) {
1814	struct in6_ifaddr *ia6;
1815	int s;
1816	struct ifnet *rcvif = m_get_rcvif(m, &s);
1817	if (rcvif == NULL)
1818	goto dropwithreset; / XXX /
1819	if ((ia6 = in6ifa_ifpwithaddr(rcvif,
1820	&ip6->ip6_dst)) &&
1821	(ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
1822	tp = NULL;
1823	m_put_rcvif(rcvif, &s);
1824	goto dropwithreset;
1825	}
1826	m_put_rcvif(rcvif, &s);
1827	}
1828	#endif
1829
1830	#if defined(IPSEC)
1831	if (ipsec_used) {
1832	switch (af) {
1833	#ifdef INET
1834	case AF_INET:
1835	if (!ipsec4_in_reject_so(m, so))
1836	break;
1837	IPSEC_STATINC(
1838	IPSEC_STAT_IN_POLVIO);
1839	tp = NULL;
1840	goto dropwithreset;
1841	#endif
1842	#ifdef INET6
1843	case AF_INET6:
1844	if (!ipsec6_in_reject_so(m, so))
1845	break;
1846	IPSEC6_STATINC(
1847	IPSEC_STAT_IN_POLVIO);
1848	tp = NULL;
1849	goto dropwithreset;
1850	#endif /INET6/
1851	}
1852	}
1853	#endif /IPSEC/
1854
1855	/*
1856	* LISTEN socket received a SYN
1857	* from itself? This can't possibly
1858	* be valid; drop the packet.
1859	*/
1860	if (th->th_sport == th->th_dport) {
1861	int i;
1862
1863	switch (af) {
1864	#ifdef INET
1865	case AF_INET:
1866	i = in_hosteq(ip->ip_src, ip->ip_dst);
1867	break;
1868	#endif
1869	#ifdef INET6
1870	case AF_INET6:
1871	i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst);
1872	break;
1873	#endif
1874	default:
1875	i = `1`;
1876	}
1877	if (i) {
1878	TCP_STATINC(TCP_STAT_BADSYN);
1879	goto drop;
1880	}
1881	}
1882
1883	/*
1884	* SYN looks ok; create compressed TCP
1885	* state for it.
1886	*/
1887	if (so->so_qlen <= so->so_qlimit &&
1888	syn_cache_add(&src.sa, &dst.sa, th, tlen,
1889	so, m, optp, optlen, &opti))
1890	m = NULL;
1891	}
1892	goto drop;
1893	}
1894	}
1895
1896	after_listen:
1897	#ifdef DIAGNOSTIC
1898	/*
1899	* Should not happen now that all embryonic connections
1900	* are handled with compressed state.
1901	*/
1902	if (tp->t_state == TCPS_LISTEN)
1903	panic("tcp_input: TCPS_LISTEN");
1904	#endif
1905
1906	/*
1907	* Segment received on connection.
1908	* Reset idle time and keep-alive timer.
1909	*/
1910	tp->t_rcvtime = tcp_now;
1911	if (TCPS_HAVEESTABLISHED(tp->t_state))
1912	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
1913
1914	/*
1915	* Process options.
1916	*/
1917	#ifdef TCP_SIGNATURE
1918	if (optp \|\| (tp->t_flags & TF_SIGNATURE))
1919	#else
1920	if (optp)
1921	#endif
1922	if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < `0`)
1923	goto drop;
1924
1925	if (TCP_SACK_ENABLED(tp)) {
1926	tcp_del_sackholes(tp, th);
1927	}
1928
1929	if (TCP_ECN_ALLOWED(tp)) {
1930	if (tiflags & TH_CWR) {
1931	tp->t_flags &= ~TF_ECN_SND_ECE;
1932	}
1933	switch (iptos & IPTOS_ECN_MASK) {
1934	case IPTOS_ECN_CE:
1935	tp->t_flags \|= TF_ECN_SND_ECE;
1936	TCP_STATINC(TCP_STAT_ECN_CE);
1937	break;
1938	case IPTOS_ECN_ECT0:
1939	TCP_STATINC(TCP_STAT_ECN_ECT);
1940	break;
1941	case IPTOS_ECN_ECT1:
1942	/ XXX: ignore for now -- rpaulo /
1943	break;
1944	}
1945	/*
1946	* Congestion experienced.
1947	* Ignore if we are already trying to recover.
1948	*/
1949	if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover))
1950	tp->t_congctl->cong_exp(tp);
1951	}
1952
1953	if (opti.ts_present && opti.ts_ecr) {
1954	/*
1955	* Calculate the RTT from the returned time stamp and the
1956	* connection's time base. If the time stamp is later than
1957	* the current time, or is extremely old, fall back to non-1323
1958	* RTT calculation. Since ts_rtt is unsigned, we can test both
1959	* at the same time.
1960	*
1961	* Note that ts_rtt is in units of slow ticks (500
1962	* ms). Since most earthbound RTTs are < 500 ms,
1963	* observed values will have large quantization noise.
1964	* Our smoothed RTT is then the fraction of observed
1965	* samples that are 1 tick instead of 0 (times 500
1966	* ms).
1967	*
1968	* ts_rtt is increased by 1 to denote a valid sample,
1969	* with 0 indicating an invalid measurement. This
1970	* extra 1 must be removed when ts_rtt is used, or
1971	* else an erroneous extra 500 ms will result.
1972	*/
1973	ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + `1`;
1974	if (ts_rtt > TCP_PAWS_IDLE)
1975	ts_rtt = `0`;
1976	} else {
1977	ts_rtt = `0`;
1978	}
1979
1980	/*
1981	* Header prediction: check for the two common cases
1982	* of a uni-directional data xfer. If the packet has
1983	* no control flags, is in-sequence, the window didn't
1984	* change and we're not retransmitting, it's a
1985	* candidate. If the length is zero and the ack moved
1986	* forward, we're the sender side of the xfer. Just
1987	* free the data acked & wake any higher level process
1988	* that was blocked waiting for space. If the length
1989	* is non-zero and the ack didn't move, we're the
1990	* receiver side. If we're getting packets in-order
1991	* (the reassembly queue is empty), add the data to
1992	* the socket buffer and note that we need a delayed ack.
1993	*/
1994	if (tp->t_state == TCPS_ESTABLISHED &&
1995	(tiflags & (TH_SYN\|TH_FIN\|TH_RST\|TH_URG\|TH_ECE\|TH_CWR\|TH_ACK))
1996	== TH_ACK &&
1997	(!opti.ts_present \|\| TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
1998	th->th_seq == tp->rcv_nxt &&
1999	tiwin && tiwin == tp->snd_wnd &&
2000	tp->snd_nxt == tp->snd_max) {
2001
2002	/*
2003	* If last ACK falls within this segment's sequence numbers,
2004	* record the timestamp.
2005	* NOTE that the test is modified according to the latest
2006	* proposal of the tcplw@cray.com list (Braden 1993/04/26).
2007	*
2008	* note that we already know
2009	* TSTMP_GEQ(opti.ts_val, tp->ts_recent)
2010	*/
2011	if (opti.ts_present &&
2012	SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2013	tp->ts_recent_age = tcp_now;
2014	tp->ts_recent = opti.ts_val;
2015	}
2016
2017	if (tlen == `0`) {
2018	/ Ack prediction. /
2019	if (SEQ_GT(th->th_ack, tp->snd_una) &&
2020	SEQ_LEQ(th->th_ack, tp->snd_max) &&
2021	tp->snd_cwnd >= tp->snd_wnd &&
2022	tp->t_partialacks < `0`) {
2023	/*
2024	* this is a pure ack for outstanding data.
2025	*/
2026	if (ts_rtt)
2027	tcp_xmit_timer(tp, ts_rtt - `1`);
2028	else if (tp->t_rtttime &&
2029	SEQ_GT(th->th_ack, tp->t_rtseq))
2030	tcp_xmit_timer(tp,
2031	tcp_now - tp->t_rtttime);
2032	acked = th->th_ack - tp->snd_una;
2033	tcps = TCP_STAT_GETREF();
2034	tcps[TCP_STAT_PREDACK]++;
2035	tcps[TCP_STAT_RCVACKPACK]++;
2036	tcps[TCP_STAT_RCVACKBYTE] += acked;
2037	TCP_STAT_PUTREF();
2038	nd6_hint(tp);
2039
2040	if (acked > (tp->t_lastoff - tp->t_inoff))
2041	tp->t_lastm = NULL;
2042	sbdrop(&so->so_snd, acked);
2043	tp->t_lastoff -= acked;
2044
2045	icmp_check(tp, th, acked);
2046
2047	tp->snd_una = th->th_ack;
2048	tp->snd_fack = tp->snd_una;
2049	if (SEQ_LT(tp->snd_high, tp->snd_una))
2050	tp->snd_high = tp->snd_una;
2051	m_freem(m);
2052
2053	/*
2054	* If all outstanding data are acked, stop
2055	* retransmit timer, otherwise restart timer
2056	* using current (possibly backed-off) value.
2057	* If process is waiting for space,
2058	* wakeup/selnotify/signal. If data
2059	* are ready to send, let tcp_output
2060	* decide between more output or persist.
2061	*/
2062	if (tp->snd_una == tp->snd_max)
2063	TCP_TIMER_DISARM(tp, TCPT_REXMT);
2064	else if (TCP_TIMER_ISARMED(tp,
2065	TCPT_PERSIST) == `0`)
2066	TCP_TIMER_ARM(tp, TCPT_REXMT,
2067	tp->t_rxtcur);
2068
2069	sowwakeup(so);
2070	if (so->so_snd.sb_cc) {
2071	KERNEL_LOCK(`1`, NULL);
2072	(void) tcp_output(tp);
2073	KERNEL_UNLOCK_ONE(NULL);
2074	}
2075	if (tcp_saveti)
2076	m_freem(tcp_saveti);
2077	return;
2078	}
2079	} else if (th->th_ack == tp->snd_una &&
2080	TAILQ_FIRST(&tp->segq) == NULL &&
2081	tlen <= sbspace(&so->so_rcv)) {
2082	int newsize = `0`; / automatic sockbuf scaling /
2083
2084	/*
2085	* this is a pure, in-sequence data packet
2086	* with nothing on the reassembly queue and
2087	* we have enough buffer space to take it.
2088	*/
2089	tp->rcv_nxt += tlen;
2090	tcps = TCP_STAT_GETREF();
2091	tcps[TCP_STAT_PREDDAT]++;
2092	tcps[TCP_STAT_RCVPACK]++;
2093	tcps[TCP_STAT_RCVBYTE] += tlen;
2094	TCP_STAT_PUTREF();
2095	nd6_hint(tp);
2096
2097	/*
2098	* Automatic sizing enables the performance of large buffers
2099	* and most of the efficiency of small ones by only allocating
2100	* space when it is needed.
2101	*
2102	* On the receive side the socket buffer memory is only rarely
2103	* used to any significant extent. This allows us to be much
2104	* more aggressive in scaling the receive socket buffer. For
2105	* the case that the buffer space is actually used to a large
2106	* extent and we run out of kernel memory we can simply drop
2107	* the new segments; TCP on the sender will just retransmit it
2108	* later. Setting the buffer size too big may only consume too
2109	* much kernel memory if the application doesn't read() from
2110	* the socket or packet loss or reordering makes use of the
2111	* reassembly queue.
2112	*
2113	* The criteria to step up the receive buffer one notch are:
2114	* 1. the number of bytes received during the time it takes
2115	* one timestamp to be reflected back to us (the RTT);
2116	* 2. received bytes per RTT is within seven eighths of the
2117	* current socket buffer size;
2118	* 3. receive buffer size has not hit maximal automatic size;
2119	*
2120	* This algorithm does one step per RTT at most and only if
2121	* we receive a bulk stream w/o packet losses or reorderings.
2122	* Shrinking the buffer during idle times is not necessary as
2123	* it doesn't consume any memory when idle.
2124	*
2125	* TODO: Only step up if the application is actually serving
2126	* the buffer to better manage the socket buffer resources.
2127	*/
2128	if (tcp_do_autorcvbuf &&
2129	opti.ts_ecr &&
2130	(so->so_rcv.sb_flags & SB_AUTOSIZE)) {
2131	if (opti.ts_ecr > tp->rfbuf_ts &&
2132	opti.ts_ecr - tp->rfbuf_ts < PR_SLOWHZ) {
2133	if (tp->rfbuf_cnt >
2134	(so->so_rcv.sb_hiwat / `8` * `7`) &&
2135	so->so_rcv.sb_hiwat <
2136	tcp_autorcvbuf_max) {
2137	newsize =
2138	min(so->so_rcv.sb_hiwat +
2139	tcp_autorcvbuf_inc,
2140	tcp_autorcvbuf_max);
2141	}
2142	/ Start over with next RTT. /
2143	tp->rfbuf_ts = `0`;
2144	tp->rfbuf_cnt = `0`;
2145	} else
2146	tp->rfbuf_cnt += tlen; / add up /
2147	}
2148
2149	/*
2150	* Drop TCP, IP headers and TCP options then add data
2151	* to socket buffer.
2152	*/
2153	if (so->so_state & SS_CANTRCVMORE)
2154	m_freem(m);
2155	else {
2156	/*
2157	* Set new socket buffer size.
2158	* Give up when limit is reached.
2159	*/
2160	if (newsize)
2161	if (!sbreserve(&so->so_rcv,
2162	newsize, so))
2163	so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
2164	m_adj(m, toff + off);
2165	sbappendstream(&so->so_rcv, m);
2166	}
2167	sorwakeup(so);
2168	tcp_setup_ack(tp, th);
2169	if (tp->t_flags & TF_ACKNOW) {
2170	KERNEL_LOCK(`1`, NULL);
2171	(void) tcp_output(tp);
2172	KERNEL_UNLOCK_ONE(NULL);
2173	}
2174	if (tcp_saveti)
2175	m_freem(tcp_saveti);
2176	return;
2177	}
2178	}
2179
2180	/*
2181	* Compute mbuf offset to TCP data segment.
2182	*/
2183	hdroptlen = toff + off;
2184
2185	/*
2186	* Calculate amount of space in receive window,
2187	* and then do TCP input processing.
2188	* Receive window is amount of space in rcv queue,
2189	* but not less than advertised window.
2190	*/
2191	{ int win;
2192
2193	win = sbspace(&so->so_rcv);
2194	if (win < `0`)
2195	win = `0`;
2196	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
2197	}
2198
2199	/ Reset receive buffer auto scaling when not in bulk receive mode. /
2200	tp->rfbuf_ts = `0`;
2201	tp->rfbuf_cnt = `0`;
2202
2203	switch (tp->t_state) {
2204	/*
2205	* If the state is SYN_SENT:
2206	* if seg contains an ACK, but not for our SYN, drop the input.
2207	* if seg contains a RST, then drop the connection.
2208	* if seg does not contain SYN, then drop it.
2209	* Otherwise this is an acceptable SYN segment
2210	* initialize tp->rcv_nxt and tp->irs
2211	* if seg contains ack then advance tp->snd_una
2212	* if seg contains a ECE and ECN support is enabled, the stream
2213	* is ECN capable.
2214	* if SYN has been acked change to ESTABLISHED else SYN_RCVD state
2215	* arrange for segment to be acked (eventually)
2216	* continue processing rest of data/controls, beginning with URG
2217	*/
2218	case TCPS_SYN_SENT:
2219	if ((tiflags & TH_ACK) &&
2220	(SEQ_LEQ(th->th_ack, tp->iss) \|\|
2221	SEQ_GT(th->th_ack, tp->snd_max)))
2222	goto dropwithreset;
2223	if (tiflags & TH_RST) {
2224	if (tiflags & TH_ACK)
2225	tp = tcp_drop(tp, ECONNREFUSED);
2226	goto drop;
2227	}
2228	if ((tiflags & TH_SYN) == `0`)
2229	goto drop;
2230	if (tiflags & TH_ACK) {
2231	tp->snd_una = th->th_ack;
2232	if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2233	tp->snd_nxt = tp->snd_una;
2234	if (SEQ_LT(tp->snd_high, tp->snd_una))
2235	tp->snd_high = tp->snd_una;
2236	TCP_TIMER_DISARM(tp, TCPT_REXMT);
2237
2238	if ((tiflags & TH_ECE) && tcp_do_ecn) {
2239	tp->t_flags \|= TF_ECN_PERMIT;
2240	TCP_STATINC(TCP_STAT_ECN_SHS);
2241	}
2242
2243	}
2244	tp->irs = th->th_seq;
2245	tcp_rcvseqinit(tp);
2246	tp->t_flags \|= TF_ACKNOW;
2247	tcp_mss_from_peer(tp, opti.maxseg);
2248
2249	/*
2250	* Initialize the initial congestion window. If we
2251	* had to retransmit the SYN, we must initialize cwnd
2252	* to 1 segment (i.e. the Loss Window).
2253	*/
2254	if (tp->t_flags & TF_SYN_REXMT)
2255	tp->snd_cwnd = tp->t_peermss;
2256	else {
2257	int ss = tcp_init_win;
2258	#ifdef INET
2259	if (inp != NULL && in_localaddr(inp->inp_faddr))
2260	ss = tcp_init_win_local;
2261	#endif
2262	#ifdef INET6
2263	if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
2264	ss = tcp_init_win_local;
2265	#endif
2266	tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
2267	}
2268
2269	tcp_rmx_rtt(tp);
2270	if (tiflags & TH_ACK) {
2271	TCP_STATINC(TCP_STAT_CONNECTS);
2272	/*
2273	* move tcp_established before soisconnected
2274	* because upcall handler can drive tcp_output
2275	* functionality.
2276	* XXX we might call soisconnected at the end of
2277	* all processing
2278	*/
2279	tcp_established(tp);
2280	soisconnected(so);
2281	/ Do window scaling on this connection? /
2282	if ((tp->t_flags & (TF_RCVD_SCALE\|TF_REQ_SCALE)) ==
2283	(TF_RCVD_SCALE\|TF_REQ_SCALE)) {
2284	tp->snd_scale = tp->requested_s_scale;
2285	tp->rcv_scale = tp->request_r_scale;
2286	}
2287	TCP_REASS_LOCK(tp);
2288	(void) tcp_reass(tp, NULL, NULL, &tlen);
2289	/*
2290	* if we didn't have to retransmit the SYN,
2291	* use its rtt as our initial srtt & rtt var.
2292	*/
2293	if (tp->t_rtttime)
2294	tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2295	} else
2296	tp->t_state = TCPS_SYN_RECEIVED;
2297
2298	/*
2299	* Advance th->th_seq to correspond to first data byte.
2300	* If data, trim to stay within window,
2301	* dropping FIN if necessary.
2302	*/
2303	th->th_seq++;
2304	if (tlen > tp->rcv_wnd) {
2305	todrop = tlen - tp->rcv_wnd;
2306	m_adj(m, -todrop);
2307	tlen = tp->rcv_wnd;
2308	tiflags &= ~TH_FIN;
2309	tcps = TCP_STAT_GETREF();
2310	tcps[TCP_STAT_RCVPACKAFTERWIN]++;
2311	tcps[TCP_STAT_RCVBYTEAFTERWIN] += todrop;
2312	TCP_STAT_PUTREF();
2313	}
2314	tp->snd_wl1 = th->th_seq - `1`;
2315	tp->rcv_up = th->th_seq;
2316	goto step6;
2317
2318	/*
2319	* If the state is SYN_RECEIVED:
2320	* If seg contains an ACK, but not for our SYN, drop the input
2321	* and generate an RST. See page 36, rfc793
2322	*/
2323	case TCPS_SYN_RECEIVED:
2324	if ((tiflags & TH_ACK) &&
2325	(SEQ_LEQ(th->th_ack, tp->iss) \|\|
2326	SEQ_GT(th->th_ack, tp->snd_max)))
2327	goto dropwithreset;
2328	break;
2329	}
2330
2331	/*
2332	* States other than LISTEN or SYN_SENT.
2333	* First check timestamp, if present.
2334	* Then check that at least some bytes of segment are within
2335	* receive window. If segment begins before rcv_nxt,
2336	* drop leading data (and SYN); if nothing left, just ack.
2337	*
2338	* RFC 1323 PAWS: If we have a timestamp reply on this segment
2339	* and it's less than ts_recent, drop it.
2340	*/
2341	if (opti.ts_present && (tiflags & TH_RST) == `0` && tp->ts_recent &&
2342	TSTMP_LT(opti.ts_val, tp->ts_recent)) {
2343
2344	/ Check to see if ts_recent is over 24 days old. /
2345	if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) {
2346	/*
2347	* Invalidate ts_recent. If this segment updates
2348	* ts_recent, the age will be reset later and ts_recent
2349	* will get a valid value. If it does not, setting
2350	* ts_recent to zero will at least satisfy the
2351	* requirement that zero be placed in the timestamp
2352	* echo reply when ts_recent isn't valid. The
2353	* age isn't reset until we get a valid ts_recent
2354	* because we don't want out-of-order segments to be
2355	* dropped when ts_recent is old.
2356	*/
2357	tp->ts_recent = `0`;
2358	} else {
2359	tcps = TCP_STAT_GETREF();
2360	tcps[TCP_STAT_RCVDUPPACK]++;
2361	tcps[TCP_STAT_RCVDUPBYTE] += tlen;
2362	tcps[TCP_STAT_PAWSDROP]++;
2363	TCP_STAT_PUTREF();
2364	tcp_new_dsack(tp, th->th_seq, tlen);
2365	goto dropafterack;
2366	}
2367	}
2368
2369	todrop = tp->rcv_nxt - th->th_seq;
2370	dupseg = false;
2371	if (todrop > `0`) {
2372	if (tiflags & TH_SYN) {
2373	tiflags &= ~TH_SYN;
2374	th->th_seq++;
2375	if (th->th_urp > `1`)
2376	th->th_urp--;
2377	else {
2378	tiflags &= ~TH_URG;
2379	th->th_urp = `0`;
2380	}
2381	todrop--;
2382	}
2383	if (todrop > tlen \|\|
2384	(todrop == tlen && (tiflags & TH_FIN) == `0`)) {
2385	/*
2386	* Any valid FIN or RST must be to the left of the
2387	* window. At this point the FIN or RST must be a
2388	* duplicate or out of sequence; drop it.
2389	*/
2390	if (tiflags & TH_RST)
2391	goto drop;
2392	tiflags &= ~(TH_FIN\|TH_RST);
2393	/*
2394	* Send an ACK to resynchronize and drop any data.
2395	* But keep on processing for RST or ACK.
2396	*/
2397	tp->t_flags \|= TF_ACKNOW;
2398	todrop = tlen;
2399	dupseg = true;
2400	tcps = TCP_STAT_GETREF();
2401	tcps[TCP_STAT_RCVDUPPACK]++;
2402	tcps[TCP_STAT_RCVDUPBYTE] += todrop;
2403	TCP_STAT_PUTREF();
2404	} else if ((tiflags & TH_RST) &&
2405	th->th_seq != tp->rcv_nxt) {
2406	/*
2407	* Test for reset before adjusting the sequence
2408	* number for overlapping data.
2409	*/
2410	goto dropafterack_ratelim;
2411	} else {
2412	tcps = TCP_STAT_GETREF();
2413	tcps[TCP_STAT_RCVPARTDUPPACK]++;
2414	tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop;
2415	TCP_STAT_PUTREF();
2416	}
2417	tcp_new_dsack(tp, th->th_seq, todrop);
2418	hdroptlen += todrop; /drop from head afterwards/
2419	th->th_seq += todrop;
2420	tlen -= todrop;
2421	if (th->th_urp > todrop)
2422	th->th_urp -= todrop;
2423	else {
2424	tiflags &= ~TH_URG;
2425	th->th_urp = `0`;
2426	}
2427	}
2428
2429	/*
2430	* If new data are received on a connection after the
2431	* user processes are gone, then RST the other end.
2432	*/
2433	if ((so->so_state & SS_NOFDREF) &&
2434	tp->t_state > TCPS_CLOSE_WAIT && tlen) {
2435	tp = tcp_close(tp);
2436	TCP_STATINC(TCP_STAT_RCVAFTERCLOSE);
2437	goto dropwithreset;
2438	}
2439
2440	/*
2441	* If segment ends after window, drop trailing data
2442	* (and PUSH and FIN); if nothing left, just ACK.
2443	*/
2444	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
2445	if (todrop > `0`) {
2446	TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN);
2447	if (todrop >= tlen) {
2448	/*
2449	* The segment actually starts after the window.
2450	* th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen
2451	* th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0
2452	* th->th_seq >= tp->rcv_nxt + tp->rcv_wnd
2453	*/
2454	TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen);
2455	/*
2456	* If a new connection request is received
2457	* while in TIME_WAIT, drop the old connection
2458	* and start over if the sequence numbers
2459	* are above the previous ones.
2460	*
2461	* NOTE: We will checksum the packet again, and
2462	* so we need to put the header fields back into
2463	* network order!
2464	* XXX This kind of sucks, but we don't expect
2465	* XXX this to happen very often, so maybe it
2466	* XXX doesn't matter so much.
2467	*/
2468	if (tiflags & TH_SYN &&
2469	tp->t_state == TCPS_TIME_WAIT &&
2470	SEQ_GT(th->th_seq, tp->rcv_nxt)) {
2471	tp = tcp_close(tp);
2472	tcp_fields_to_net(th);
2473	goto findpcb;
2474	}
2475	/*
2476	* If window is closed can only take segments at
2477	* window edge, and have to drop data and PUSH from
2478	* incoming segments. Continue processing, but
2479	* remember to ack. Otherwise, drop segment
2480	* and (if not RST) ack.
2481	*/
2482	if (tp->rcv_wnd == `0` && th->th_seq == tp->rcv_nxt) {
2483	tp->t_flags \|= TF_ACKNOW;
2484	TCP_STATINC(TCP_STAT_RCVWINPROBE);
2485	} else
2486	goto dropafterack;
2487	} else
2488	TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop);
2489	m_adj(m, -todrop);
2490	tlen -= todrop;
2491	tiflags &= ~(TH_PUSH\|TH_FIN);
2492	}
2493
2494	/*
2495	* If last ACK falls within this segment's sequence numbers,
2496	* record the timestamp.
2497	* NOTE:
2498	* 1) That the test incorporates suggestions from the latest
2499	* proposal of the tcplw@cray.com list (Braden 1993/04/26).
2500	* 2) That updating only on newer timestamps interferes with
2501	* our earlier PAWS tests, so this check should be solely
2502	* predicated on the sequence space of this segment.
2503	* 3) That we modify the segment boundary check to be
2504	* Last.ACK.Sent <= SEG.SEQ + SEG.Len
2505	* instead of RFC1323's
2506	* Last.ACK.Sent < SEG.SEQ + SEG.Len,
2507	* This modified check allows us to overcome RFC1323's
2508	* limitations as described in Stevens TCP/IP Illustrated
2509	* Vol. 2 p.869. In such cases, we can still calculate the
2510	* RTT correctly when RCV.NXT == Last.ACK.Sent.
2511	*/
2512	if (opti.ts_present &&
2513	SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
2514	SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
2515	((tiflags & (TH_SYN\|TH_FIN)) != `0`))) {
2516	tp->ts_recent_age = tcp_now;
2517	tp->ts_recent = opti.ts_val;
2518	}
2519
2520	/*
2521	* If the RST bit is set examine the state:
2522	* SYN_RECEIVED STATE:
2523	* If passive open, return to LISTEN state.
2524	* If active open, inform user that connection was refused.
2525	* ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
2526	* Inform user that connection was reset, and close tcb.
2527	* CLOSING, LAST_ACK, TIME_WAIT STATES
2528	* Close the tcb.
2529	*/
2530	if (tiflags & TH_RST) {
2531	if (th->th_seq != tp->rcv_nxt)
2532	goto dropafterack_ratelim;
2533
2534	switch (tp->t_state) {
2535	case TCPS_SYN_RECEIVED:
2536	so->so_error = ECONNREFUSED;
2537	goto close;
2538
2539	case TCPS_ESTABLISHED:
2540	case TCPS_FIN_WAIT_1:
2541	case TCPS_FIN_WAIT_2:
2542	case TCPS_CLOSE_WAIT:
2543	so->so_error = ECONNRESET;
2544	close:
2545	tp->t_state = TCPS_CLOSED;
2546	TCP_STATINC(TCP_STAT_DROPS);
2547	tp = tcp_close(tp);
2548	goto drop;
2549
2550	case TCPS_CLOSING:
2551	case TCPS_LAST_ACK:
2552	case TCPS_TIME_WAIT:
2553	tp = tcp_close(tp);
2554	goto drop;
2555	}
2556	}
2557
2558	/*
2559	* Since we've covered the SYN-SENT and SYN-RECEIVED states above
2560	* we must be in a synchronized state. RFC793 states (under RST
2561	* generation) that any unacceptable segment (an out-of-order SYN
2562	* qualifies) received in a synchronized state must elicit only an
2563	* empty acknowledgment segment ... and the connection remains in
2564	* the same state.
2565	*/
2566	if (tiflags & TH_SYN) {
2567	if (tp->rcv_nxt == th->th_seq) {
2568	tcp_respond(tp, m, m, th, (tcp_seq)`0`, th->th_ack - `1`,
2569	TH_ACK);
2570	if (tcp_saveti)
2571	m_freem(tcp_saveti);
2572	return;
2573	}
2574
2575	goto dropafterack_ratelim;
2576	}
2577
2578	/*
2579	* If the ACK bit is off we drop the segment and return.
2580	*/
2581	if ((tiflags & TH_ACK) == `0`) {
2582	if (tp->t_flags & TF_ACKNOW)
2583	goto dropafterack;
2584	else
2585	goto drop;
2586	}
2587
2588	/*
2589	* Ack processing.
2590	*/
2591	switch (tp->t_state) {
2592
2593	/*
2594	* In SYN_RECEIVED state if the ack ACKs our SYN then enter
2595	* ESTABLISHED state and continue processing, otherwise
2596	* send an RST.
2597	*/
2598	case TCPS_SYN_RECEIVED:
2599	if (SEQ_GT(tp->snd_una, th->th_ack) \|\|
2600	SEQ_GT(th->th_ack, tp->snd_max))
2601	goto dropwithreset;
2602	TCP_STATINC(TCP_STAT_CONNECTS);
2603	soisconnected(so);
2604	tcp_established(tp);
2605	/ Do window scaling? /
2606	if ((tp->t_flags & (TF_RCVD_SCALE\|TF_REQ_SCALE)) ==
2607	(TF_RCVD_SCALE\|TF_REQ_SCALE)) {
2608	tp->snd_scale = tp->requested_s_scale;
2609	tp->rcv_scale = tp->request_r_scale;
2610	}
2611	TCP_REASS_LOCK(tp);
2612	(void) tcp_reass(tp, NULL, NULL, &tlen);
2613	tp->snd_wl1 = th->th_seq - `1`;
2614	/ fall into ... /
2615
2616	/*
2617	* In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2618	* ACKs. If the ack is in the range
2619	* tp->snd_una < th->th_ack <= tp->snd_max
2620	* then advance tp->snd_una to th->th_ack and drop
2621	* data from the retransmission queue. If this ACK reflects
2622	* more up to date window information we update our window information.
2623	*/
2624	case TCPS_ESTABLISHED:
2625	case TCPS_FIN_WAIT_1:
2626	case TCPS_FIN_WAIT_2:
2627	case TCPS_CLOSE_WAIT:
2628	case TCPS_CLOSING:
2629	case TCPS_LAST_ACK:
2630	case TCPS_TIME_WAIT:
2631
2632	if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2633	if (tlen == `0` && !dupseg && tiwin == tp->snd_wnd) {
2634	TCP_STATINC(TCP_STAT_RCVDUPACK);
2635	/*
2636	* If we have outstanding data (other than
2637	* a window probe), this is a completely
2638	* duplicate ack (ie, window info didn't
2639	* change), the ack is the biggest we've
2640	* seen and we've seen exactly our rexmt
2641	* threshold of them, assume a packet
2642	* has been dropped and retransmit it.
2643	* Kludge snd_nxt & the congestion
2644	* window so we send only this one
2645	* packet.
2646	*/
2647	if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == `0` \|\|
2648	th->th_ack != tp->snd_una)
2649	tp->t_dupacks = `0`;
2650	else if (tp->t_partialacks < `0` &&
2651	(++tp->t_dupacks == tcprexmtthresh \|\|
2652	TCP_FACK_FASTRECOV(tp))) {
2653	/*
2654	* Do the fast retransmit, and adjust
2655	* congestion control parameters.
2656	*/
2657	if (tp->t_congctl->fast_retransmit(tp, th)) {
2658	/ False fast retransmit /
2659	break;
2660	} else
2661	goto drop;
2662	} else if (tp->t_dupacks > tcprexmtthresh) {
2663	tp->snd_cwnd += tp->t_segsz;
2664	KERNEL_LOCK(`1`, NULL);
2665	(void) tcp_output(tp);
2666	KERNEL_UNLOCK_ONE(NULL);
2667	goto drop;
2668	}
2669	} else {
2670	/*
2671	* If the ack appears to be very old, only
2672	* allow data that is in-sequence. This
2673	* makes it somewhat more difficult to insert
2674	* forged data by guessing sequence numbers.
2675	* Sent an ack to try to update the send
2676	* sequence number on the other side.
2677	*/
2678	if (tlen && th->th_seq != tp->rcv_nxt &&
2679	SEQ_LT(th->th_ack,
2680	tp->snd_una - tp->max_sndwnd))
2681	goto dropafterack;
2682	}
2683	break;
2684	}
2685	/*
2686	* If the congestion window was inflated to account
2687	* for the other side's cached packets, retract it.
2688	*/
2689	tp->t_congctl->fast_retransmit_newack(tp, th);
2690
2691	if (SEQ_GT(th->th_ack, tp->snd_max)) {
2692	TCP_STATINC(TCP_STAT_RCVACKTOOMUCH);
2693	goto dropafterack;
2694	}
2695	acked = th->th_ack - tp->snd_una;
2696	tcps = TCP_STAT_GETREF();
2697	tcps[TCP_STAT_RCVACKPACK]++;
2698	tcps[TCP_STAT_RCVACKBYTE] += acked;
2699	TCP_STAT_PUTREF();
2700
2701	/*
2702	* If we have a timestamp reply, update smoothed
2703	* round trip time. If no timestamp is present but
2704	* transmit timer is running and timed sequence
2705	* number was acked, update smoothed round trip time.
2706	* Since we now have an rtt measurement, cancel the
2707	* timer backoff (cf., Phil Karn's retransmit alg.).
2708	* Recompute the initial retransmit timer.
2709	*/
2710	if (ts_rtt)
2711	tcp_xmit_timer(tp, ts_rtt - `1`);
2712	else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2713	tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2714
2715	/*
2716	* If all outstanding data is acked, stop retransmit
2717	* timer and remember to restart (more output or persist).
2718	* If there is more data to be acked, restart retransmit
2719	* timer, using current (possibly backed-off) value.
2720	*/
2721	if (th->th_ack == tp->snd_max) {
2722	TCP_TIMER_DISARM(tp, TCPT_REXMT);
2723	needoutput = `1`;
2724	} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == `0`)
2725	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
2726
2727	/*
2728	* New data has been acked, adjust the congestion window.
2729	*/
2730	tp->t_congctl->newack(tp, th);
2731
2732	nd6_hint(tp);
2733	if (acked > so->so_snd.sb_cc) {
2734	tp->snd_wnd -= so->so_snd.sb_cc;
2735	sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2736	ourfinisacked = `1`;
2737	} else {
2738	if (acked > (tp->t_lastoff - tp->t_inoff))
2739	tp->t_lastm = NULL;
2740	sbdrop(&so->so_snd, acked);
2741	tp->t_lastoff -= acked;
2742	if (tp->snd_wnd > acked)
2743	tp->snd_wnd -= acked;
2744	else
2745	tp->snd_wnd = `0`;
2746	ourfinisacked = `0`;
2747	}
2748	sowwakeup(so);
2749
2750	icmp_check(tp, th, acked);
2751
2752	tp->snd_una = th->th_ack;
2753	if (SEQ_GT(tp->snd_una, tp->snd_fack))
2754	tp->snd_fack = tp->snd_una;
2755	if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2756	tp->snd_nxt = tp->snd_una;
2757	if (SEQ_LT(tp->snd_high, tp->snd_una))
2758	tp->snd_high = tp->snd_una;
2759
2760	switch (tp->t_state) {
2761
2762	/*
2763	* In FIN_WAIT_1 STATE in addition to the processing
2764	* for the ESTABLISHED state if our FIN is now acknowledged
2765	* then enter FIN_WAIT_2.
2766	*/
2767	case TCPS_FIN_WAIT_1:
2768	if (ourfinisacked) {
2769	/*
2770	* If we can't receive any more
2771	* data, then closing user can proceed.
2772	* Starting the timer is contrary to the
2773	* specification, but if we don't get a FIN
2774	* we'll hang forever.
2775	*/
2776	if (so->so_state & SS_CANTRCVMORE) {
2777	soisdisconnected(so);
2778	if (tp->t_maxidle > `0`)
2779	TCP_TIMER_ARM(tp, TCPT_2MSL,
2780	tp->t_maxidle);
2781	}
2782	tp->t_state = TCPS_FIN_WAIT_2;
2783	}
2784	break;
2785
2786	/*
2787	* In CLOSING STATE in addition to the processing for
2788	* the ESTABLISHED state if the ACK acknowledges our FIN
2789	* then enter the TIME-WAIT state, otherwise ignore
2790	* the segment.
2791	*/
2792	case TCPS_CLOSING:
2793	if (ourfinisacked) {
2794	tp->t_state = TCPS_TIME_WAIT;
2795	tcp_canceltimers(tp);
2796	TCP_TIMER_ARM(tp, TCPT_2MSL, `2` * tp->t_msl);
2797	soisdisconnected(so);
2798	}
2799	break;
2800
2801	/*
2802	* In LAST_ACK, we may still be waiting for data to drain
2803	* and/or to be acked, as well as for the ack of our FIN.
2804	* If our FIN is now acknowledged, delete the TCB,
2805	* enter the closed state and return.
2806	*/
2807	case TCPS_LAST_ACK:
2808	if (ourfinisacked) {
2809	tp = tcp_close(tp);
2810	goto drop;
2811	}
2812	break;
2813
2814	/*
2815	* In TIME_WAIT state the only thing that should arrive
2816	* is a retransmission of the remote FIN. Acknowledge
2817	* it and restart the finack timer.
2818	*/
2819	case TCPS_TIME_WAIT:
2820	TCP_TIMER_ARM(tp, TCPT_2MSL, `2` * tp->t_msl);
2821	goto dropafterack;
2822	}
2823	}
2824
2825	step6:
2826	/*
2827	* Update window information.
2828	* Don't look at window if no ACK: TAC's send garbage on first SYN.
2829	*/
2830	if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) \|\|
2831	(tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) \|\|
2832	(tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2833	/ keep track of pure window updates /
2834	if (tlen == `0` &&
2835	tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2836	TCP_STATINC(TCP_STAT_RCVWINUPD);
2837	tp->snd_wnd = tiwin;
2838	tp->snd_wl1 = th->th_seq;
2839	tp->snd_wl2 = th->th_ack;
2840	if (tp->snd_wnd > tp->max_sndwnd)
2841	tp->max_sndwnd = tp->snd_wnd;
2842	needoutput = `1`;
2843	}
2844
2845	/*
2846	* Process segments with URG.
2847	*/
2848	if ((tiflags & TH_URG) && th->th_urp &&
2849	TCPS_HAVERCVDFIN(tp->t_state) == `0`) {
2850	/*
2851	* This is a kludge, but if we receive and accept
2852	* random urgent pointers, we'll crash in
2853	* soreceive. It's hard to imagine someone
2854	* actually wanting to send this much urgent data.
2855	*/
2856	if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2857	th->th_urp = `0`; / XXX /
2858	tiflags &= ~TH_URG; / XXX /
2859	goto dodata; / XXX /
2860	}
2861	/*
2862	* If this segment advances the known urgent pointer,
2863	* then mark the data stream. This should not happen
2864	* in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2865	* a FIN has been received from the remote side.
2866	* In these states we ignore the URG.
2867	*
2868	* According to RFC961 (Assigned Protocols),
2869	* the urgent pointer points to the last octet
2870	* of urgent data. We continue, however,
2871	* to consider it to indicate the first octet
2872	* of data past the urgent section as the original
2873	* spec states (in one of two places).
2874	*/
2875	if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2876	tp->rcv_up = th->th_seq + th->th_urp;
2877	so->so_oobmark = so->so_rcv.sb_cc +
2878	(tp->rcv_up - tp->rcv_nxt) - `1`;
2879	if (so->so_oobmark == `0`)
2880	so->so_state \|= SS_RCVATMARK;
2881	sohasoutofband(so);
2882	tp->t_oobflags &= ~(TCPOOB_HAVEDATA \| TCPOOB_HADDATA);
2883	}
2884	/*
2885	* Remove out of band data so doesn't get presented to user.
2886	* This can happen independent of advancing the URG pointer,
2887	* but if two URG's are pending at once, some out-of-band
2888	* data may creep in... ick.
2889	*/
2890	if (th->th_urp <= (u_int16_t) tlen
2891	#ifdef SO_OOBINLINE
2892	&& (so->so_options & SO_OOBINLINE) == `0`
2893	#endif
2894	)
2895	tcp_pulloutofband(so, th, m, hdroptlen);
2896	} else
2897	/*
2898	* If no out of band data is expected,
2899	* pull receive urgent pointer along
2900	* with the receive window.
2901	*/
2902	if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2903	tp->rcv_up = tp->rcv_nxt;
2904	dodata: / XXX /
2905
2906	/*
2907	* Process the segment text, merging it into the TCP sequencing queue,
2908	* and arranging for acknowledgement of receipt if necessary.
2909	* This process logically involves adjusting tp->rcv_wnd as data
2910	* is presented to the user (this happens in tcp_usrreq.c,
2911	* tcp_rcvd()). If a FIN has already been received on this
2912	* connection then we just ignore the text.
2913	*/
2914	if ((tlen \|\| (tiflags & TH_FIN)) &&
2915	TCPS_HAVERCVDFIN(tp->t_state) == `0`) {
2916	/*
2917	* Insert segment ti into reassembly queue of tcp with
2918	* control block tp. Return TH_FIN if reassembly now includes
2919	* a segment with FIN. The macro form does the common case
2920	* inline (segment is the next to be received on an
2921	* established connection, and the queue is empty),
2922	* avoiding linkage into and removal from the queue and
2923	* repetition of various conversions.
2924	* Set DELACK for segments received in order, but ack
2925	* immediately when segments are out of order
2926	* (so fast retransmit can work).
2927	*/
2928	/ NOTE: this was TCP_REASS() macro, but used only once /
2929	TCP_REASS_LOCK(tp);
2930	if (th->th_seq == tp->rcv_nxt &&
2931	TAILQ_FIRST(&tp->segq) == NULL &&
2932	tp->t_state == TCPS_ESTABLISHED) {
2933	tcp_setup_ack(tp, th);
2934	tp->rcv_nxt += tlen;
2935	tiflags = th->th_flags & TH_FIN;
2936	tcps = TCP_STAT_GETREF();
2937	tcps[TCP_STAT_RCVPACK]++;
2938	tcps[TCP_STAT_RCVBYTE] += tlen;
2939	TCP_STAT_PUTREF();
2940	nd6_hint(tp);
2941	if (so->so_state & SS_CANTRCVMORE)
2942	m_freem(m);
2943	else {
2944	m_adj(m, hdroptlen);
2945	sbappendstream(&(so)->so_rcv, m);
2946	}
2947	TCP_REASS_UNLOCK(tp);
2948	sorwakeup(so);
2949	} else {
2950	m_adj(m, hdroptlen);
2951	tiflags = tcp_reass(tp, th, m, &tlen);
2952	tp->t_flags \|= TF_ACKNOW;
2953	}
2954
2955	/*
2956	* Note the amount of data that peer has sent into
2957	* our window, in order to estimate the sender's
2958	* buffer size.
2959	*/
2960	len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2961	} else {
2962	m_freem(m);
2963	m = NULL;
2964	tiflags &= ~TH_FIN;
2965	}
2966
2967	/*
2968	* If FIN is received ACK the FIN and let the user know
2969	* that the connection is closing. Ignore a FIN received before
2970	* the connection is fully established.
2971	*/
2972	if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2973	if (TCPS_HAVERCVDFIN(tp->t_state) == `0`) {
2974	socantrcvmore(so);
2975	tp->t_flags \|= TF_ACKNOW;
2976	tp->rcv_nxt++;
2977	}
2978	switch (tp->t_state) {
2979
2980	/*
2981	* In ESTABLISHED STATE enter the CLOSE_WAIT state.
2982	*/
2983	case TCPS_ESTABLISHED:
2984	tp->t_state = TCPS_CLOSE_WAIT;
2985	break;
2986
2987	/*
2988	* If still in FIN_WAIT_1 STATE FIN has not been acked so
2989	* enter the CLOSING state.
2990	*/
2991	case TCPS_FIN_WAIT_1:
2992	tp->t_state = TCPS_CLOSING;
2993	break;
2994
2995	/*
2996	* In FIN_WAIT_2 state enter the TIME_WAIT state,
2997	* starting the time-wait timer, turning off the other
2998	* standard timers.
2999	*/
3000	case TCPS_FIN_WAIT_2:
3001	tp->t_state = TCPS_TIME_WAIT;
3002	tcp_canceltimers(tp);
3003	TCP_TIMER_ARM(tp, TCPT_2MSL, `2` * tp->t_msl);
3004	soisdisconnected(so);
3005	break;
3006
3007	/*
3008	* In TIME_WAIT state restart the 2 MSL time_wait timer.
3009	*/
3010	case TCPS_TIME_WAIT:
3011	TCP_TIMER_ARM(tp, TCPT_2MSL, `2` * tp->t_msl);
3012	break;
3013	}
3014	}
3015	#ifdef TCP_DEBUG
3016	if (so->so_options & SO_DEBUG)
3017	tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, `0`);
3018	#endif
3019
3020	/*
3021	* Return any desired output.
3022	*/
3023	if (needoutput \|\| (tp->t_flags & TF_ACKNOW)) {
3024	KERNEL_LOCK(`1`, NULL);
3025	(void) tcp_output(tp);
3026	KERNEL_UNLOCK_ONE(NULL);
3027	}
3028	if (tcp_saveti)
3029	m_freem(tcp_saveti);
3030
3031	if (tp->t_state == TCPS_TIME_WAIT
3032	&& (so->so_state & SS_NOFDREF)
3033	&& (tp->t_inpcb \|\| af != AF_INET)
3034	&& (tp->t_in6pcb \|\| af != AF_INET6)
3035	&& ((af == AF_INET ? tcp4_vtw_enable : tcp6_vtw_enable) & `1`) != `0`
3036	&& TAILQ_EMPTY(&tp->segq)
3037	&& vtw_add(af, tp)) {
3038	;
3039	}
3040	return;
3041
3042	badsyn:
3043	/*
3044	* Received a bad SYN. Increment counters and dropwithreset.
3045	*/
3046	TCP_STATINC(TCP_STAT_BADSYN);
3047	tp = NULL;
3048	goto dropwithreset;
3049
3050	dropafterack:
3051	/*
3052	* Generate an ACK dropping incoming segment if it occupies
3053	* sequence space, where the ACK reflects our state.
3054	*/
3055	if (tiflags & TH_RST)
3056	goto drop;
3057	goto dropafterack2;
3058
3059	dropafterack_ratelim:
3060	/*
3061	* We may want to rate-limit ACKs against SYN/RST attack.
3062	*/
3063	if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
3064	tcp_ackdrop_ppslim) == `0`) {
3065	/ XXX stat /
3066	goto drop;
3067	}
3068	/ ...fall into dropafterack2... /
3069
3070	dropafterack2:
3071	m_freem(m);
3072	tp->t_flags \|= TF_ACKNOW;
3073	KERNEL_LOCK(`1`, NULL);
3074	(void) tcp_output(tp);
3075	KERNEL_UNLOCK_ONE(NULL);
3076	if (tcp_saveti)
3077	m_freem(tcp_saveti);
3078	return;
3079
3080	dropwithreset_ratelim:
3081	/*
3082	* We may want to rate-limit RSTs in certain situations,
3083	* particularly if we are sending an RST in response to
3084	* an attempt to connect to or otherwise communicate with
3085	* a port for which we have no socket.
3086	*/
3087	if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
3088	tcp_rst_ppslim) == `0`) {
3089	/ XXX stat /
3090	goto drop;
3091	}
3092	/ ...fall into dropwithreset... /
3093
3094	dropwithreset:
3095	/*
3096	* Generate a RST, dropping incoming segment.
3097	* Make ACK acceptable to originator of segment.
3098	*/
3099	if (tiflags & TH_RST)
3100	goto drop;
3101
3102	switch (af) {
3103	#ifdef INET6
3104	case AF_INET6:
3105	/ For following calls to tcp_respond /
3106	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
3107	goto drop;
3108	break;
3109	#endif /* INET6 */
3110	case AF_INET:
3111	if (IN_MULTICAST(ip->ip_dst.s_addr) \|\|
3112	in_broadcast(ip->ip_dst, m_get_rcvif_NOMPSAFE(m)))
3113	goto drop;
3114	}
3115
3116	if (tiflags & TH_ACK)
3117	(void)tcp_respond(tp, m, m, th, (tcp_seq)`0`, th->th_ack, TH_RST);
3118	else {
3119	if (tiflags & TH_SYN)
3120	tlen++;
3121	(void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)`0`,
3122	TH_RST\|TH_ACK);
3123	}
3124	if (tcp_saveti)
3125	m_freem(tcp_saveti);
3126	return;
3127
3128	badcsum:
3129	drop:
3130	/*
3131	* Drop space held by incoming segment and return.
3132	*/
3133	if (tp) {
3134	if (tp->t_inpcb)
3135	so = tp->t_inpcb->inp_socket;
3136	#ifdef INET6
3137	else if (tp->t_in6pcb)
3138	so = tp->t_in6pcb->in6p_socket;
3139	#endif
3140	else
3141	so = NULL;
3142	#ifdef TCP_DEBUG
3143	if (so && (so->so_options & SO_DEBUG) != `0`)
3144	tcp_trace(TA_DROP, ostate, tp, tcp_saveti, `0`);
3145	#endif
3146	}
3147	if (tcp_saveti)
3148	m_freem(tcp_saveti);
3149	m_freem(m);
3150	return;
3151	}
3152
3153	#ifdef TCP_SIGNATURE
3154	int
3155	tcp_signature_apply(void fstate, void* *data, u_int len)
3156	{
3157
3158	MD5Update(fstate, (u_char *)data, len);
3159	return (`0`);
3160	}
3161
3162	struct secasvar *
3163	tcp_signature_getsav(struct mbuf m, struct* tcphdr *th)
3164	{
3165	struct ip *ip;
3166	struct ip6_hdr *ip6;
3167
3168	ip = mtod(m, struct ip *);
3169	switch (ip->ip_v) {
3170	case `4`:
3171	ip = mtod(m, struct ip *);
3172	ip6 = NULL;
3173	break;
3174	case `6`:
3175	ip = NULL;
3176	ip6 = mtod(m, struct ip6_hdr *);
3177	break;
3178	default:
3179	return (NULL);
3180	}
3181
3182	#ifdef IPSEC
3183	if (ipsec_used) {
3184	union sockaddr_union dst;
3185	/ Extract the destination from the IP header in the mbuf. /
3186	memset(&dst, `0`, sizeof(union sockaddr_union));
3187	if (ip != NULL) {
3188	dst.sa.sa_len = sizeof(struct sockaddr_in);
3189	dst.sa.sa_family = AF_INET;
3190	dst.sin.sin_addr = ip->ip_dst;
3191	} else {
3192	dst.sa.sa_len = sizeof(struct sockaddr_in6);
3193	dst.sa.sa_family = AF_INET6;
3194	dst.sin6.sin6_addr = ip6->ip6_dst;
3195	}
3196
3197	/*
3198	* Look up an SADB entry which matches the address of the peer.
3199	*/
3200	return KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI), `0`, `0`);
3201	}
3202	return NULL;
3203	#else
3204	if (ip)
3205	return key_allocsa(AF_INET, (void *)&ip->ip_src,
3206	(void *)&ip->ip_dst, IPPROTO_TCP,
3207	htonl(TCP_SIG_SPI), `0`, `0`);
3208	else
3209	return key_allocsa(AF_INET6, (void *)&ip6->ip6_src,
3210	(void *)&ip6->ip6_dst, IPPROTO_TCP,
3211	htonl(TCP_SIG_SPI), `0`, `0`);
3212	#endif
3213	}
3214
3215	int
3216	tcp_signature(struct mbuf m, struct* tcphdr th, int* thoff,
3217	struct secasvar sav, char* *sig)
3218	{
3219	MD5_CTX ctx;
3220	struct ip *ip;
3221	struct ipovly *ipovly;
3222	#ifdef INET6
3223	struct ip6_hdr *ip6;
3224	struct ip6_hdr_pseudo ip6pseudo;
3225	#endif /* INET6 */
3226	struct ippseudo ippseudo;
3227	struct tcphdr th0;
3228	int l, tcphdrlen;
3229
3230	if (sav == NULL)
3231	return (-`1`);
3232
3233	tcphdrlen = th->th_off * `4`;
3234
3235	switch (mtod(m, struct ip *)->ip_v) {
3236	case `4`:
3237	MD5Init(&ctx);
3238	ip = mtod(m, struct ip *);
3239	memset(&ippseudo, `0`, sizeof(ippseudo));
3240	ipovly = (struct ipovly *)ip;
3241	ippseudo.ippseudo_src = ipovly->ih_src;
3242	ippseudo.ippseudo_dst = ipovly->ih_dst;
3243	ippseudo.ippseudo_pad = `0`;
3244	ippseudo.ippseudo_p = IPPROTO_TCP;
3245	ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff);
3246	MD5Update(&ctx, (char )&ippseudo, sizeof*(ippseudo));
3247	break;
3248	#if INET6
3249	case `6`:
3250	MD5Init(&ctx);
3251	ip6 = mtod(m, struct ip6_hdr *);
3252	memset(&ip6pseudo, `0`, sizeof(ip6pseudo));
3253	ip6pseudo.ip6ph_src = ip6->ip6_src;
3254	in6_clearscope(&ip6pseudo.ip6ph_src);
3255	ip6pseudo.ip6ph_dst = ip6->ip6_dst;
3256	in6_clearscope(&ip6pseudo.ip6ph_dst);
3257	ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff);
3258	ip6pseudo.ip6ph_nxt = IPPROTO_TCP;
3259	MD5Update(&ctx, (char )&ip6pseudo, sizeof*(ip6pseudo));
3260	break;
3261	#endif /* INET6 */
3262	default:
3263	return (-`1`);
3264	}
3265
3266	th0 = *th;
3267	th0.th_sum = `0`;
3268	MD5Update(&ctx, (char )&th0, sizeof*(th0));
3269
3270	l = m->m_pkthdr.len - thoff - tcphdrlen;
3271	if (l > `0`)
3272	m_apply(m, thoff + tcphdrlen,
3273	m->m_pkthdr.len - thoff - tcphdrlen,
3274	tcp_signature_apply, &ctx);
3275
3276	MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
3277	MD5Final(sig, &ctx);
3278
3279	return (`0`);
3280	}
3281	#endif
3282
3283	/*
3284	* tcp_dooptions: parse and process tcp options.
3285	*
3286	* Returns -1 if this segment should be dropped (e.g. wrong signature),
3287	* otherwise returns 0.
3288	*/
3289
3290	static int
3291	tcp_dooptions(struct tcpcb tp, const* u_char cp, int* cnt,
3292	struct tcphdr *th,
3293	struct mbuf m, int* toff, struct tcp_opt_info *oi)
3294	{
3295	u_int16_t mss;
3296	int opt, optlen = `0`;
3297	#ifdef TCP_SIGNATURE
3298	void *sigp = NULL;
3299	char sigbuf[TCP_SIGLEN];
3300	struct secasvar *sav = NULL;
3301	#endif
3302
3303	for (; cp && cnt > `0`; cnt -= optlen, cp += optlen) {
3304	opt = cp[`0`];
3305	if (opt == TCPOPT_EOL)
3306	break;
3307	if (opt == TCPOPT_NOP)
3308	optlen = `1`;
3309	else {
3310	if (cnt < `2`)
3311	break;
3312	optlen = cp[`1`];
3313	if (optlen < `2` \|\| optlen > cnt)
3314	break;
3315	}
3316	switch (opt) {
3317
3318	default:
3319	continue;
3320
3321	case TCPOPT_MAXSEG:
3322	if (optlen != TCPOLEN_MAXSEG)
3323	continue;
3324	if (!(th->th_flags & TH_SYN))
3325	continue;
3326	if (TCPS_HAVERCVDSYN(tp->t_state))
3327	continue;
3328	bcopy(cp + `2`, &mss, sizeof(mss));
3329	oi->maxseg = ntohs(mss);
3330	break;
3331
3332	case TCPOPT_WINDOW:
3333	if (optlen != TCPOLEN_WINDOW)
3334	continue;
3335	if (!(th->th_flags & TH_SYN))
3336	continue;
3337	if (TCPS_HAVERCVDSYN(tp->t_state))
3338	continue;
3339	tp->t_flags \|= TF_RCVD_SCALE;
3340	tp->requested_s_scale = cp[`2`];
3341	if (tp->requested_s_scale > TCP_MAX_WINSHIFT) {
3342	char buf[INET6_ADDRSTRLEN];
3343	struct ip ip = mtod(m, struct* ip *);
3344	#ifdef INET6
3345	struct ip6_hdr ip6 = mtod(m, struct* ip6_hdr *);
3346	#endif
3347	if (ip)
3348	in_print(buf, sizeof(buf),
3349	&ip->ip_src);
3350	#ifdef INET6
3351	else if (ip6)
3352	in6_print(buf, sizeof(buf),
3353	&ip6->ip6_src);
3354	#endif
3355	else
3356	strlcpy(buf, "(unknown)", sizeof(buf));
3357	log(LOG_ERR, "TCP: invalid wscale %d from %s, "
3358	"assuming %d\n",
3359	tp->requested_s_scale, buf,
3360	TCP_MAX_WINSHIFT);
3361	tp->requested_s_scale = TCP_MAX_WINSHIFT;
3362	}
3363	break;
3364
3365	case TCPOPT_TIMESTAMP:
3366	if (optlen != TCPOLEN_TIMESTAMP)
3367	continue;
3368	oi->ts_present = `1`;
3369	bcopy(cp + `2`, &oi->ts_val, sizeof(oi->ts_val));
3370	NTOHL(oi->ts_val);
3371	bcopy(cp + `6`, &oi->ts_ecr, sizeof(oi->ts_ecr));
3372	NTOHL(oi->ts_ecr);
3373
3374	if (!(th->th_flags & TH_SYN))
3375	continue;
3376	if (TCPS_HAVERCVDSYN(tp->t_state))
3377	continue;
3378	/*
3379	* A timestamp received in a SYN makes
3380	* it ok to send timestamp requests and replies.
3381	*/
3382	tp->t_flags \|= TF_RCVD_TSTMP;
3383	tp->ts_recent = oi->ts_val;
3384	tp->ts_recent_age = tcp_now;
3385	break;
3386
3387	case TCPOPT_SACK_PERMITTED:
3388	if (optlen != TCPOLEN_SACK_PERMITTED)
3389	continue;
3390	if (!(th->th_flags & TH_SYN))
3391	continue;
3392	if (TCPS_HAVERCVDSYN(tp->t_state))
3393	continue;
3394	if (tcp_do_sack) {
3395	tp->t_flags \|= TF_SACK_PERMIT;
3396	tp->t_flags \|= TF_WILL_SACK;
3397	}
3398	break;
3399
3400	case TCPOPT_SACK:
3401	tcp_sack_option(tp, th, cp, optlen);
3402	break;
3403	#ifdef TCP_SIGNATURE
3404	case TCPOPT_SIGNATURE:
3405	if (optlen != TCPOLEN_SIGNATURE)
3406	continue;
3407	if (sigp && memcmp(sigp, cp + `2`, TCP_SIGLEN))
3408	return (-`1`);
3409
3410	sigp = sigbuf;
3411	memcpy(sigbuf, cp + `2`, TCP_SIGLEN);
3412	tp->t_flags \|= TF_SIGNATURE;
3413	break;
3414	#endif
3415	}
3416	}
3417
3418	#ifndef TCP_SIGNATURE
3419	return `0`;
3420	#else
3421	if (tp->t_flags & TF_SIGNATURE) {
3422
3423	sav = tcp_signature_getsav(m, th);
3424
3425	if (sav == NULL && tp->t_state == TCPS_LISTEN)
3426	return (-`1`);
3427	}
3428
3429	if ((sigp ? TF_SIGNATURE : `0`) ^ (tp->t_flags & TF_SIGNATURE))
3430	goto out;
3431
3432	if (sigp) {
3433	char sig[TCP_SIGLEN];
3434
3435	tcp_fields_to_net(th);
3436	if (tcp_signature(m, th, toff, sav, sig) < `0`) {
3437	tcp_fields_to_host(th);
3438	goto out;
3439	}
3440	tcp_fields_to_host(th);
3441
3442	if (memcmp(sig, sigp, TCP_SIGLEN)) {
3443	TCP_STATINC(TCP_STAT_BADSIG);
3444	goto out;
3445	} else
3446	TCP_STATINC(TCP_STAT_GOODSIG);
3447
3448	key_sa_recordxfer(sav, m);
3449	KEY_FREESAV(&sav);
3450	}
3451	return `0`;
3452	out:
3453	if (sav != NULL)
3454	KEY_FREESAV(&sav);
3455	return -`1`;
3456	#endif
3457	}
3458
3459	/*
3460	* Pull out of band byte out of a segment so
3461	* it doesn't appear in the user's data queue.
3462	* It is still reflected in the segment length for
3463	* sequencing purposes.
3464	*/
3465	void
3466	tcp_pulloutofband(struct socket so, struct* tcphdr *th,
3467	struct mbuf m, int* off)
3468	{
3469	int cnt = off + th->th_urp - `1`;
3470
3471	while (cnt >= `0`) {
3472	if (m->m_len > cnt) {
3473	char cp = mtod(m, char* *) + cnt;
3474	struct tcpcb *tp = sototcpcb(so);
3475
3476	tp->t_iobc = *cp;
3477	tp->t_oobflags \|= TCPOOB_HAVEDATA;
3478	bcopy(cp+`1`, cp, (unsigned)(m->m_len - cnt - `1`));
3479	m->m_len--;
3480	return;
3481	}
3482	cnt -= m->m_len;
3483	m = m->m_next;
3484	if (m == `0`)
3485	break;
3486	}
3487	panic("tcp_pulloutofband");
3488	}
3489
3490	/*
3491	* Collect new round-trip time estimate
3492	* and update averages and current timeout.
3493	*
3494	* rtt is in units of slow ticks (typically 500 ms) -- essentially the
3495	* difference of two timestamps.
3496	*/
3497	void
3498	tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt)
3499	{
3500	int32_t delta;
3501
3502	TCP_STATINC(TCP_STAT_RTTUPDATED);
3503	if (tp->t_srtt != `0`) {
3504	/*
3505	* Compute the amount to add to srtt for smoothing,
3506	* *alpha, or 2^(-TCP_RTT_SHIFT). Because
3507	* srtt is stored in 1/32 slow ticks, we conceptually
3508	* shift left 5 bits, subtract srtt to get the
3509	* diference, and then shift right by TCP_RTT_SHIFT
3510	* (3) to obtain 1/8 of the difference.
3511	*/
3512	delta = (rtt << `2`) - (tp->t_srtt >> TCP_RTT_SHIFT);
3513	/*
3514	* This can never happen, because delta's lowest
3515	* possible value is 1/8 of t_srtt. But if it does,
3516	* set srtt to some reasonable value, here chosen
3517	* as 1/8 tick.
3518	*/
3519	if ((tp->t_srtt += delta) <= `0`)
3520	tp->t_srtt = `1` << `2`;
3521	/*
3522	* RFC2988 requires that rttvar be updated first.
3523	* This code is compliant because "delta" is the old
3524	* srtt minus the new observation (scaled).
3525	*
3526	* RFC2988 says:
3527	* rttvar = (1-beta) * rttvar + beta * \|srtt-observed\|
3528	*
3529	* delta is in units of 1/32 ticks, and has then been
3530	* divided by 8. This is equivalent to being in 1/16s
3531	* units and divided by 4. Subtract from it 1/4 of
3532	* the existing rttvar to form the (signed) amount to
3533	* adjust.
3534	*/
3535	if (delta < `0`)
3536	delta = -delta;
3537	delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
3538	/*
3539	* As with srtt, this should never happen. There is
3540	* no support in RFC2988 for this operation. But 1/4s
3541	* as rttvar when faced with something arguably wrong
3542	* is ok.
3543	*/
3544	if ((tp->t_rttvar += delta) <= `0`)
3545	tp->t_rttvar = `1` << `2`;
3546
3547	/*
3548	* If srtt exceeds .01 second, ensure we use the 'remote' MSL
3549	* Problem is: it doesn't work. Disabled by defaulting
3550	* tcp_rttlocal to 0; see corresponding code in
3551	* tcp_subr that selects local vs remote in a different way.
3552	*
3553	* The static branch prediction hint here should be removed
3554	* when the rtt estimator is fixed and the rtt_enable code
3555	* is turned back on.
3556	*/
3557	if (__predict_false(tcp_rttlocal) && tcp_msl_enable
3558	&& tp->t_srtt > tcp_msl_remote_threshold
3559	&& tp->t_msl < tcp_msl_remote) {
3560	tp->t_msl = tcp_msl_remote;
3561	}
3562	} else {
3563	/*
3564	* This is the first measurement. Per RFC2988, 2.2,
3565	* set rtt=R and srtt=R/2.
3566	* For srtt, storage representation is 1/32 ticks,
3567	* so shift left by 5.
3568	* For rttvar, storage representation is 1/16 ticks,
3569	* So shift left by 4, but then right by 1 to halve.
3570	*/
3571	tp->t_srtt = rtt << (TCP_RTT_SHIFT + `2`);
3572	tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + `2` - `1`);
3573	}
3574	tp->t_rtttime = `0`;
3575	tp->t_rxtshift = `0`;
3576
3577	/*
3578	* the retransmit should happen at rtt + 4 * rttvar.
3579	* Because of the way we do the smoothing, srtt and rttvar
3580	* will each average +1/2 tick of bias. When we compute
3581	* the retransmit timer, we want 1/2 tick of rounding and
3582	* 1 extra tick because of +-1/2 tick uncertainty in the
3583	* firing of the timer. The bias will give us exactly the
3584	* 1.5 tick we need. But, because the bias is
3585	* statistical, we have to test that we don't drop below
3586	* the minimum feasible timer (which is 2 ticks).
3587	*/
3588	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3589	max(tp->t_rttmin, rtt + `2`), TCPTV_REXMTMAX);
3590
3591	/*
3592	* We received an ack for a packet that wasn't retransmitted;
3593	* it is probably safe to discard any error indications we've
3594	* received recently. This isn't quite right, but close enough
3595	* for now (a route might have failed after we sent a segment,
3596	* and the return path might not be symmetrical).
3597	*/
3598	tp->t_softerror = `0`;
3599	}
3600
3601
3602	/*
3603	* TCP compressed state engine. Currently used to hold compressed
3604	* state for SYN_RECEIVED.
3605	*/
3606
3607	u_long syn_cache_count;
3608	u_int32_t syn_hash1, syn_hash2;
3609
3610	#define SYN_HASH(sa, sp, dp) \
3611	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
3612	((u_int32_t)(sp)))^syn_hash2)))
3613	#ifndef INET6
3614	#define SYN_HASHALL(hash, src, dst) \
3615	do { \
3616	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
3617	((const struct sockaddr_in *)(src))->sin_port, \
3618	((const struct sockaddr_in *)(dst))->sin_port); \
3619	} while (/CONSTCOND/ 0)
3620	#else
3621	#define SYN_HASH6(sa, sp, dp) \
3622	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
3623	(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
3624	& 0x7fffffff)
3625
3626	#define SYN_HASHALL(hash, src, dst) \
3627	do { \
3628	switch ((src)->sa_family) { \
3629	case AF_INET: \
3630	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
3631	((const struct sockaddr_in *)(src))->sin_port, \
3632	((const struct sockaddr_in *)(dst))->sin_port); \
3633	break; \
3634	case AF_INET6: \
3635	hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
3636	((const struct sockaddr_in6 *)(src))->sin6_port, \
3637	((const struct sockaddr_in6 *)(dst))->sin6_port); \
3638	break; \
3639	default: \
3640	hash = 0; \
3641	} \
3642	} while (/CONSTCOND/0)
3643	#endif /* INET6 */
3644
3645	static struct pool syn_cache_pool;
3646
3647	/*
3648	* We don't estimate RTT with SYNs, so each packet starts with the default
3649	* RTT and each timer step has a fixed timeout value.
3650	*/
3651	#define SYN_CACHE_TIMER_ARM(sc) \
3652	do { \
3653	TCPT_RANGESET((sc)->sc_rxtcur, \
3654	TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
3655	TCPTV_REXMTMAX); \
3656	callout_reset(&(sc)->sc_timer, \
3657	(sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \
3658	} while (/CONSTCOND/0)
3659
3660	#define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
3661
3662	static inline void
3663	syn_cache_rm(struct syn_cache *sc)
3664	{
3665	TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
3666	sc, sc_bucketq);
3667	sc->sc_tp = NULL;
3668	LIST_REMOVE(sc, sc_tpq);
3669	tcp_syn_cache[sc->sc_bucketidx].sch_length--;
3670	callout_stop(&sc->sc_timer);
3671	syn_cache_count--;
3672	}
3673
3674	static inline void
3675	syn_cache_put(struct syn_cache *sc)
3676	{
3677	if (sc->sc_ipopts)
3678	(void) m_free(sc->sc_ipopts);
3679	rtcache_free(&sc->sc_route);
3680	sc->sc_flags \|= SCF_DEAD;
3681	if (!callout_invoking(&sc->sc_timer))
3682	callout_schedule(&(sc)->sc_timer, `1`);
3683	}
3684
3685	void
3686	syn_cache_init(void)
3687	{
3688	int i;
3689
3690	pool_init(&syn_cache_pool, sizeof(struct syn_cache), `0`, `0`, `0`,
3691	"synpl", NULL, IPL_SOFTNET);
3692
3693	/ Initialize the hash buckets. /
3694	for (i = `0`; i < tcp_syn_cache_size; i++)
3695	TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
3696	}
3697
3698	void
3699	syn_cache_insert(struct syn_cache sc, struct* tcpcb *tp)
3700	{
3701	struct syn_cache_head *scp;
3702	struct syn_cache *sc2;
3703	int s;
3704
3705	/*
3706	* If there are no entries in the hash table, reinitialize
3707	* the hash secrets.
3708	*/
3709	if (syn_cache_count == `0`) {
3710	syn_hash1 = cprng_fast32();
3711	syn_hash2 = cprng_fast32();
3712	}
3713
3714	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
3715	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
3716	scp = &tcp_syn_cache[sc->sc_bucketidx];
3717
3718	/*
3719	* Make sure that we don't overflow the per-bucket
3720	* limit or the total cache size limit.
3721	*/
3722	s = splsoftnet();
3723	if (scp->sch_length >= tcp_syn_bucket_limit) {
3724	TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
3725	/*
3726	* The bucket is full. Toss the oldest element in the
3727	* bucket. This will be the first entry in the bucket.
3728	*/
3729	sc2 = TAILQ_FIRST(&scp->sch_bucket);
3730	#ifdef DIAGNOSTIC
3731	/*
3732	* This should never happen; we should always find an
3733	* entry in our bucket.
3734	*/
3735	if (sc2 == NULL)
3736	panic("syn_cache_insert: bucketoverflow: impossible");
3737	#endif
3738	syn_cache_rm(sc2);
3739	syn_cache_put(sc2); / calls pool_put but see spl above /
3740	} else if (syn_cache_count >= tcp_syn_cache_limit) {
3741	struct syn_cache_head scp2, sce;
3742
3743	TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
3744	/*
3745	* The cache is full. Toss the oldest entry in the
3746	* first non-empty bucket we can find.
3747	*
3748	* XXX We would really like to toss the oldest
3749	* entry in the cache, but we hope that this
3750	* condition doesn't happen very often.
3751	*/
3752	scp2 = scp;
3753	if (TAILQ_EMPTY(&scp2->sch_bucket)) {
3754	sce = &tcp_syn_cache[tcp_syn_cache_size];
3755	for (++scp2; scp2 != scp; scp2++) {
3756	if (scp2 >= sce)
3757	scp2 = &tcp_syn_cache[`0`];
3758	if (! TAILQ_EMPTY(&scp2->sch_bucket))
3759	break;
3760	}
3761	#ifdef DIAGNOSTIC
3762	/*
3763	* This should never happen; we should always find a
3764	* non-empty bucket.
3765	*/
3766	if (scp2 == scp)
3767	panic("syn_cache_insert: cacheoverflow: "
3768	"impossible");
3769	#endif
3770	}
3771	sc2 = TAILQ_FIRST(&scp2->sch_bucket);
3772	syn_cache_rm(sc2);
3773	syn_cache_put(sc2); / calls pool_put but see spl above /
3774	}
3775
3776	/*
3777	* Initialize the entry's timer.
3778	*/
3779	sc->sc_rxttot = `0`;
3780	sc->sc_rxtshift = `0`;
3781	SYN_CACHE_TIMER_ARM(sc);
3782
3783	/ Link it from tcpcb entry /
3784	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
3785
3786	/ Put it into the bucket. /
3787	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
3788	scp->sch_length++;
3789	syn_cache_count++;
3790
3791	TCP_STATINC(TCP_STAT_SC_ADDED);
3792	splx(s);
3793	}
3794
3795	/*
3796	* Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
3797	* If we have retransmitted an entry the maximum number of times, expire
3798	* that entry.
3799	*/
3800	void
3801	syn_cache_timer(void *arg)
3802	{
3803	struct syn_cache *sc = arg;
3804
3805	mutex_enter(softnet_lock);
3806	KERNEL_LOCK(`1`, NULL);
3807	callout_ack(&sc->sc_timer);
3808
3809	if (__predict_false(sc->sc_flags & SCF_DEAD)) {
3810	TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
3811	callout_destroy(&sc->sc_timer);
3812	pool_put(&syn_cache_pool, sc);
3813	KERNEL_UNLOCK_ONE(NULL);
3814	mutex_exit(softnet_lock);
3815	return;
3816	}
3817
3818	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
3819	/ Drop it -- too many retransmissions. /
3820	goto dropit;
3821	}
3822
3823	/*
3824	* Compute the total amount of time this entry has
3825	* been on a queue. If this entry has been on longer
3826	* than the keep alive timer would allow, expire it.
3827	*/
3828	sc->sc_rxttot += sc->sc_rxtcur;
3829	if (sc->sc_rxttot >= tcp_keepinit)
3830	goto dropit;
3831
3832	TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
3833	(void) syn_cache_respond(sc, NULL);
3834
3835	/ Advance the timer back-off. /
3836	sc->sc_rxtshift++;
3837	SYN_CACHE_TIMER_ARM(sc);
3838
3839	KERNEL_UNLOCK_ONE(NULL);
3840	mutex_exit(softnet_lock);
3841	return;
3842
3843	dropit:
3844	TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
3845	syn_cache_rm(sc);
3846	if (sc->sc_ipopts)
3847	(void) m_free(sc->sc_ipopts);
3848	rtcache_free(&sc->sc_route);
3849	callout_destroy(&sc->sc_timer);
3850	pool_put(&syn_cache_pool, sc);
3851	KERNEL_UNLOCK_ONE(NULL);
3852	mutex_exit(softnet_lock);
3853	}
3854
3855	/*
3856	* Remove syn cache created by the specified tcb entry,
3857	* because this does not make sense to keep them
3858	* (if there's no tcb entry, syn cache entry will never be used)
3859	*/
3860	void
3861	syn_cache_cleanup(struct tcpcb *tp)
3862	{
3863	struct syn_cache sc, nsc;
3864	int s;
3865
3866	s = splsoftnet();
3867
3868	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3869	nsc = LIST_NEXT(sc, sc_tpq);
3870
3871	#ifdef DIAGNOSTIC
3872	if (sc->sc_tp != tp)
3873	panic("invalid sc_tp in syn_cache_cleanup");
3874	#endif
3875	syn_cache_rm(sc);
3876	syn_cache_put(sc); / calls pool_put but see spl above /
3877	}
3878	/ just for safety /
3879	LIST_INIT(&tp->t_sc);
3880
3881	splx(s);
3882	}
3883
3884	/*
3885	* Find an entry in the syn cache.
3886	*/
3887	struct syn_cache *
3888	syn_cache_lookup(const struct sockaddr src, const* struct sockaddr *dst,
3889	struct syn_cache_head **headp)
3890	{
3891	struct syn_cache *sc;
3892	struct syn_cache_head *scp;
3893	u_int32_t hash;
3894	int s;
3895
3896	SYN_HASHALL(hash, src, dst);
3897
3898	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3899	*headp = scp;
3900	s = splsoftnet();
3901	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3902	sc = TAILQ_NEXT(sc, sc_bucketq)) {
3903	if (sc->sc_hash != hash)
3904	continue;
3905	if (!memcmp(&sc->sc_src, src, src->sa_len) &&
3906	!memcmp(&sc->sc_dst, dst, dst->sa_len)) {
3907	splx(s);
3908	return (sc);
3909	}
3910	}
3911	splx(s);
3912	return (NULL);
3913	}
3914
3915	/*
3916	* This function gets called when we receive an ACK for a
3917	* socket in the LISTEN state. We look up the connection
3918	* in the syn cache, and if its there, we pull it out of
3919	* the cache and turn it into a full-blown connection in
3920	* the SYN-RECEIVED state.
3921	*
3922	* The return values may not be immediately obvious, and their effects
3923	* can be subtle, so here they are:
3924	*
3925	* NULL SYN was not found in cache; caller should drop the
3926	* packet and send an RST.
3927	*
3928	* -1 We were unable to create the new connection, and are
3929	* aborting it. An ACK,RST is being sent to the peer
3930	* (unless we got screwey sequence numbners; see below),
3931	* because the 3-way handshake has been completed. Caller
3932	* should not free the mbuf, since we may be using it. If
3933	* we are not, we will free it.
3934	*
3935	* Otherwise, the return value is a pointer to the new socket
3936	* associated with the connection.
3937	*/
3938	struct socket *
3939	syn_cache_get(struct sockaddr src, struct* sockaddr *dst,
3940	struct tcphdr th, unsigned* int hlen, unsigned int tlen,
3941	struct socket so, struct* mbuf *m)
3942	{
3943	struct syn_cache *sc;
3944	struct syn_cache_head *scp;
3945	struct inpcb *inp = NULL;
3946	#ifdef INET6
3947	struct in6pcb *in6p = NULL;
3948	#endif
3949	struct tcpcb *tp = `0`;
3950	int s;
3951	struct socket *oso;
3952
3953	s = splsoftnet();
3954	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3955	splx(s);
3956	return (NULL);
3957	}
3958
3959	/*
3960	* Verify the sequence and ack numbers. Try getting the correct
3961	* response again.
3962	*/
3963	if ((th->th_ack != sc->sc_iss + `1`) \|\|
3964	SEQ_LEQ(th->th_seq, sc->sc_irs) \|\|
3965	SEQ_GT(th->th_seq, sc->sc_irs + `1` + sc->sc_win)) {
3966	(void) syn_cache_respond(sc, m);
3967	splx(s);
3968	return ((struct socket *)(-`1`));
3969	}
3970
3971	/ Remove this cache entry /
3972	syn_cache_rm(sc);
3973	splx(s);
3974
3975	/*
3976	* Ok, create the full blown connection, and set things up
3977	* as they would have been set up if we had created the
3978	* connection when the SYN arrived. If we can't create
3979	* the connection, abort it.
3980	*/
3981	/*
3982	* inp still has the OLD in_pcb stuff, set the
3983	* v6-related flags on the new guy, too. This is
3984	* done particularly for the case where an AF_INET6
3985	* socket is bound only to a port, and a v4 connection
3986	* comes in on that port.
3987	* we also copy the flowinfo from the original pcb
3988	* to the new one.
3989	*/
3990	oso = so;
3991	so = sonewconn(so, true);
3992	if (so == NULL)
3993	goto resetandabort;
3994
3995	switch (so->so_proto->pr_domain->dom_family) {
3996	#ifdef INET
3997	case AF_INET:
3998	inp = sotoinpcb(so);
3999	break;
4000	#endif
4001	#ifdef INET6
4002	case AF_INET6:
4003	in6p = sotoin6pcb(so);
4004	break;
4005	#endif
4006	}
4007	switch (src->sa_family) {
4008	#ifdef INET
4009	case AF_INET:
4010	if (inp) {
4011	inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
4012	inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
4013	inp->inp_options = ip_srcroute();
4014	in_pcbstate(inp, INP_BOUND);
4015	if (inp->inp_options == NULL) {
4016	inp->inp_options = sc->sc_ipopts;
4017	sc->sc_ipopts = NULL;
4018	}
4019	}
4020	#ifdef INET6
4021	else if (in6p) {
4022	/ IPv4 packet to AF_INET6 socket /
4023	memset(&in6p->in6p_laddr, `0`, sizeof(in6p->in6p_laddr));
4024	in6p->in6p_laddr.s6_addr16[`5`] = htons(`0xffff`);
4025	bcopy(&((struct sockaddr_in *)dst)->sin_addr,
4026	&in6p->in6p_laddr.s6_addr32[`3`],
4027	sizeof(((struct sockaddr_in *)dst)->sin_addr));
4028	in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
4029	in6totcpcb(in6p)->t_family = AF_INET;
4030	if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
4031	in6p->in6p_flags \|= IN6P_IPV6_V6ONLY;
4032	else
4033	in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
4034	in6_pcbstate(in6p, IN6P_BOUND);
4035	}
4036	#endif
4037	break;
4038	#endif
4039	#ifdef INET6
4040	case AF_INET6:
4041	if (in6p) {
4042	in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
4043	in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
4044	in6_pcbstate(in6p, IN6P_BOUND);
4045	}
4046	break;
4047	#endif
4048	}
4049	#ifdef INET6
4050	if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
4051	struct in6pcb *oin6p = sotoin6pcb(oso);
4052	/ inherit socket options from the listening socket /
4053	in6p->in6p_flags \|= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
4054	if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
4055	m_freem(in6p->in6p_options);
4056	in6p->in6p_options = `0`;
4057	}
4058	ip6_savecontrol(in6p, &in6p->in6p_options,
4059	mtod(m, struct ip6_hdr *), m);
4060	}
4061	#endif
4062
4063	#if defined(IPSEC)
4064	if (ipsec_used) {
4065	/*
4066	* we make a copy of policy, instead of sharing the policy, for
4067	* better behavior in terms of SA lookup and dead SA removal.
4068	*/
4069	if (inp) {
4070	/ copy old policy into new socket's /
4071	if (ipsec_copy_pcbpolicy(sotoinpcb(oso)->inp_sp,
4072	inp->inp_sp))
4073	printf("tcp_input: could not copy policy\n");
4074	}
4075	#ifdef INET6
4076	else if (in6p) {
4077	/ copy old policy into new socket's /
4078	if (ipsec_copy_pcbpolicy(sotoin6pcb(oso)->in6p_sp,
4079	in6p->in6p_sp))
4080	printf("tcp_input: could not copy policy\n");
4081	}
4082	#endif
4083	}
4084	#endif
4085
4086	/*
4087	* Give the new socket our cached route reference.
4088	*/
4089	if (inp) {
4090	rtcache_copy(&inp->inp_route, &sc->sc_route);
4091	rtcache_free(&sc->sc_route);
4092	}
4093	#ifdef INET6
4094	else {
4095	rtcache_copy(&in6p->in6p_route, &sc->sc_route);
4096	rtcache_free(&sc->sc_route);
4097	}
4098	#endif
4099
4100	if (inp) {
4101	struct sockaddr_in sin;
4102	memcpy(&sin, src, src->sa_len);
4103	if (in_pcbconnect(inp, &sin, &lwp0)) {
4104	goto resetandabort;
4105	}
4106	}
4107	#ifdef INET6
4108	else if (in6p) {
4109	struct sockaddr_in6 sin6;
4110	memcpy(&sin6, src, src->sa_len);
4111	if (src->sa_family == AF_INET) {
4112	/ IPv4 packet to AF_INET6 socket /
4113	in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
4114	}
4115	if (in6_pcbconnect(in6p, &sin6, NULL)) {
4116	goto resetandabort;
4117	}
4118	}
4119	#endif
4120	else {
4121	goto resetandabort;
4122	}
4123
4124	if (inp)
4125	tp = intotcpcb(inp);
4126	#ifdef INET6
4127	else if (in6p)
4128	tp = in6totcpcb(in6p);
4129	#endif
4130	else
4131	tp = NULL;
4132	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
4133	if (sc->sc_request_r_scale != `15`) {
4134	tp->requested_s_scale = sc->sc_requested_s_scale;
4135	tp->request_r_scale = sc->sc_request_r_scale;
4136	tp->snd_scale = sc->sc_requested_s_scale;
4137	tp->rcv_scale = sc->sc_request_r_scale;
4138	tp->t_flags \|= TF_REQ_SCALE\|TF_RCVD_SCALE;
4139	}
4140	if (sc->sc_flags & SCF_TIMESTAMP)
4141	tp->t_flags \|= TF_REQ_TSTMP\|TF_RCVD_TSTMP;
4142	tp->ts_timebase = sc->sc_timebase;
4143
4144	tp->t_template = tcp_template(tp);
4145	if (tp->t_template == `0`) {
4146	tp = tcp_drop(tp, ENOBUFS); / destroys socket /
4147	so = NULL;
4148	m_freem(m);
4149	goto abort;
4150	}
4151
4152	tp->iss = sc->sc_iss;
4153	tp->irs = sc->sc_irs;
4154	tcp_sendseqinit(tp);
4155	tcp_rcvseqinit(tp);
4156	tp->t_state = TCPS_SYN_RECEIVED;
4157	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
4158	TCP_STATINC(TCP_STAT_ACCEPTS);
4159
4160	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
4161	tp->t_flags \|= TF_WILL_SACK;
4162
4163	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
4164	tp->t_flags \|= TF_ECN_PERMIT;
4165
4166	#ifdef TCP_SIGNATURE
4167	if (sc->sc_flags & SCF_SIGNATURE)
4168	tp->t_flags \|= TF_SIGNATURE;
4169	#endif
4170
4171	/ Initialize tp->t_ourmss before we deal with the peer's! /
4172	tp->t_ourmss = sc->sc_ourmaxseg;
4173	tcp_mss_from_peer(tp, sc->sc_peermaxseg);
4174
4175	/*
4176	* Initialize the initial congestion window. If we
4177	* had to retransmit the SYN,ACK, we must initialize cwnd
4178	* to 1 segment (i.e. the Loss Window).
4179	*/
4180	if (sc->sc_rxtshift)
4181	tp->snd_cwnd = tp->t_peermss;
4182	else {
4183	int ss = tcp_init_win;
4184	#ifdef INET
4185	if (inp != NULL && in_localaddr(inp->inp_faddr))
4186	ss = tcp_init_win_local;
4187	#endif
4188	#ifdef INET6
4189	if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
4190	ss = tcp_init_win_local;
4191	#endif
4192	tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
4193	}
4194
4195	tcp_rmx_rtt(tp);
4196	tp->snd_wl1 = sc->sc_irs;
4197	tp->rcv_up = sc->sc_irs + `1`;
4198
4199	/*
4200	* This is what whould have happened in tcp_output() when
4201	* the SYN,ACK was sent.
4202	*/
4203	tp->snd_up = tp->snd_una;
4204	tp->snd_max = tp->snd_nxt = tp->iss+`1`;
4205	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
4206	if (sc->sc_win > `0` && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
4207	tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
4208	tp->last_ack_sent = tp->rcv_nxt;
4209	tp->t_partialacks = -`1`;
4210	tp->t_dupacks = `0`;
4211
4212	TCP_STATINC(TCP_STAT_SC_COMPLETED);
4213	s = splsoftnet();
4214	syn_cache_put(sc);
4215	splx(s);
4216	return (so);
4217
4218	resetandabort:
4219	(void)tcp_respond(NULL, m, m, th, (tcp_seq)`0`, th->th_ack, TH_RST);
4220	abort:
4221	if (so != NULL) {
4222	(void) soqremque(so, `1`);
4223	(void) soabort(so);
4224	mutex_enter(softnet_lock);
4225	}
4226	s = splsoftnet();
4227	syn_cache_put(sc);
4228	splx(s);
4229	TCP_STATINC(TCP_STAT_SC_ABORTED);
4230	return ((struct socket *)(-`1`));
4231	}
4232
4233	/*
4234	* This function is called when we get a RST for a
4235	* non-existent connection, so that we can see if the
4236	* connection is in the syn cache. If it is, zap it.
4237	*/
4238
4239	void
4240	syn_cache_reset(struct sockaddr src, struct* sockaddr dst, struct* tcphdr *th)
4241	{
4242	struct syn_cache *sc;
4243	struct syn_cache_head *scp;
4244	int s = splsoftnet();
4245
4246	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
4247	splx(s);
4248	return;
4249	}
4250	if (SEQ_LT(th->th_seq, sc->sc_irs) \|\|
4251	SEQ_GT(th->th_seq, sc->sc_irs+`1`)) {
4252	splx(s);
4253	return;
4254	}
4255	syn_cache_rm(sc);
4256	TCP_STATINC(TCP_STAT_SC_RESET);
4257	syn_cache_put(sc); / calls pool_put but see spl above /
4258	splx(s);
4259	}
4260
4261	void
4262	syn_cache_unreach(const struct sockaddr src, const* struct sockaddr *dst,
4263	struct tcphdr *th)
4264	{
4265	struct syn_cache *sc;
4266	struct syn_cache_head *scp;
4267	int s;
4268
4269	s = splsoftnet();
4270	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
4271	splx(s);
4272	return;
4273	}
4274	/ If the sequence number != sc_iss, then it's a bogus ICMP msg /
4275	if (ntohl (th->th_seq) != sc->sc_iss) {
4276	splx(s);
4277	return;
4278	}
4279
4280	/*
4281	* If we've retransmitted 3 times and this is our second error,
4282	* we remove the entry. Otherwise, we allow it to continue on.
4283	* This prevents us from incorrectly nuking an entry during a
4284	* spurious network outage.
4285	*
4286	* See tcp_notify().
4287	*/
4288	if ((sc->sc_flags & SCF_UNREACH) == `0` \|\| sc->sc_rxtshift < `3`) {
4289	sc->sc_flags \|= SCF_UNREACH;
4290	splx(s);
4291	return;
4292	}
4293
4294	syn_cache_rm(sc);
4295	TCP_STATINC(TCP_STAT_SC_UNREACH);
4296	syn_cache_put(sc); / calls pool_put but see spl above /
4297	splx(s);
4298	}
4299
4300	/*
4301	* Given a LISTEN socket and an inbound SYN request, add
4302	* this to the syn cache, and send back a segment:
4303	* <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
4304	* to the source.
4305	*
4306	* IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
4307	* Doing so would require that we hold onto the data and deliver it
4308	* to the application. However, if we are the target of a SYN-flood
4309	* DoS attack, an attacker could send data which would eventually
4310	* consume all available buffer space if it were ACKed. By not ACKing
4311	* the data, we avoid this DoS scenario.
4312	*/
4313
4314	int
4315	syn_cache_add(struct sockaddr src, struct* sockaddr dst, struct* tcphdr *th,
4316	unsigned int hlen, struct socket so, struct* mbuf m, u_char optp,
4317	int optlen, struct tcp_opt_info *oi)
4318	{
4319	struct tcpcb tb, *tp;
4320	long win;
4321	struct syn_cache *sc;
4322	struct syn_cache_head *scp;
4323	struct mbuf *ipopts;
4324	struct tcp_opt_info opti;
4325	int s;
4326
4327	tp = sototcpcb(so);
4328
4329	memset(&opti, `0`, sizeof(opti));
4330
4331	/*
4332	* RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
4333	*
4334	* Note this check is performed in tcp_input() very early on.
4335	*/
4336
4337	/*
4338	* Initialize some local state.
4339	*/
4340	win = sbspace(&so->so_rcv);
4341	if (win > TCP_MAXWIN)
4342	win = TCP_MAXWIN;
4343
4344	switch (src->sa_family) {
4345	#ifdef INET
4346	case AF_INET:
4347	/*
4348	* Remember the IP options, if any.
4349	*/
4350	ipopts = ip_srcroute();
4351	break;
4352	#endif
4353	default:
4354	ipopts = NULL;
4355	}
4356
4357	#ifdef TCP_SIGNATURE
4358	if (optp \|\| (tp->t_flags & TF_SIGNATURE))
4359	#else
4360	if (optp)
4361	#endif
4362	{
4363	tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE\|TF_REQ_TSTMP) : `0`;
4364	#ifdef TCP_SIGNATURE
4365	tb.t_flags \|= (tp->t_flags & TF_SIGNATURE);
4366	#endif
4367	tb.t_state = TCPS_LISTEN;
4368	if (tcp_dooptions(&tb, optp, optlen, th, m, m->m_pkthdr.len -
4369	sizeof(struct tcphdr) - optlen - hlen, oi) < `0`)
4370	return (`0`);
4371	} else
4372	tb.t_flags = `0`;
4373
4374	/*
4375	* See if we already have an entry for this connection.
4376	* If we do, resend the SYN,ACK. We do not count this
4377	* as a retransmission (XXX though maybe we should).
4378	*/
4379	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
4380	TCP_STATINC(TCP_STAT_SC_DUPESYN);
4381	if (ipopts) {
4382	/*
4383	* If we were remembering a previous source route,
4384	* forget it and use the new one we've been given.
4385	*/
4386	if (sc->sc_ipopts)
4387	(void) m_free(sc->sc_ipopts);
4388	sc->sc_ipopts = ipopts;
4389	}
4390	sc->sc_timestamp = tb.ts_recent;
4391	if (syn_cache_respond(sc, m) == `0`) {
4392	uint64_t *tcps = TCP_STAT_GETREF();
4393	tcps[TCP_STAT_SNDACKS]++;
4394	tcps[TCP_STAT_SNDTOTAL]++;
4395	TCP_STAT_PUTREF();
4396	}
4397	return (`1`);
4398	}
4399
4400	s = splsoftnet();
4401	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
4402	splx(s);
4403	if (sc == NULL) {
4404	if (ipopts)
4405	(void) m_free(ipopts);
4406	return (`0`);
4407	}
4408
4409	/*
4410	* Fill in the cache, and put the necessary IP and TCP
4411	* options into the reply.
4412	*/
4413	memset(sc, `0`, sizeof(struct syn_cache));
4414	callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
4415	bcopy(src, &sc->sc_src, src->sa_len);
4416	bcopy(dst, &sc->sc_dst, dst->sa_len);
4417	sc->sc_flags = `0`;
4418	sc->sc_ipopts = ipopts;
4419	sc->sc_irs = th->th_seq;
4420	switch (src->sa_family) {
4421	#ifdef INET
4422	case AF_INET:
4423	{
4424	struct sockaddr_in srcin = (void* *) src;
4425	struct sockaddr_in dstin = (void* *) dst;
4426
4427	sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
4428	&srcin->sin_addr, dstin->sin_port,
4429	srcin->sin_port, sizeof(dstin->sin_addr), `0`);
4430	break;
4431	}
4432	#endif /* INET */
4433	#ifdef INET6
4434	case AF_INET6:
4435	{
4436	struct sockaddr_in6 srcin6 = (void* *) src;
4437	struct sockaddr_in6 dstin6 = (void* *) dst;
4438
4439	sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
4440	&srcin6->sin6_addr, dstin6->sin6_port,
4441	srcin6->sin6_port, sizeof(dstin6->sin6_addr), `0`);
4442	break;
4443	}
4444	#endif /* INET6 */
4445	}
4446	sc->sc_peermaxseg = oi->maxseg;
4447	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
4448	m_get_rcvif_NOMPSAFE(m) : NULL,
4449	sc->sc_src.sa.sa_family);
4450	sc->sc_win = win;
4451	sc->sc_timebase = tcp_now - `1`; / see tcp_newtcpcb() /
4452	sc->sc_timestamp = tb.ts_recent;
4453	if ((tb.t_flags & (TF_REQ_TSTMP\|TF_RCVD_TSTMP)) ==
4454	(TF_REQ_TSTMP\|TF_RCVD_TSTMP))
4455	sc->sc_flags \|= SCF_TIMESTAMP;
4456	if ((tb.t_flags & (TF_RCVD_SCALE\|TF_REQ_SCALE)) ==
4457	(TF_RCVD_SCALE\|TF_REQ_SCALE)) {
4458	sc->sc_requested_s_scale = tb.requested_s_scale;
4459	sc->sc_request_r_scale = `0`;
4460	/*
4461	* Pick the smallest possible scaling factor that
4462	* will still allow us to scale up to sb_max.
4463	*
4464	* We do this because there are broken firewalls that
4465	* will corrupt the window scale option, leading to
4466	* the other endpoint believing that our advertised
4467	* window is unscaled. At scale factors larger than
4468	* 5 the unscaled window will drop below 1500 bytes,
4469	* leading to serious problems when traversing these
4470	* broken firewalls.
4471	*
4472	* With the default sbmax of 256K, a scale factor
4473	* of 3 will be chosen by this algorithm. Those who
4474	* choose a larger sbmax should watch out
4475	* for the compatibility problems mentioned above.
4476	*
4477	* RFC1323: The Window field in a SYN (i.e., a <SYN>
4478	* or <SYN,ACK>) segment itself is never scaled.
4479	*/
4480	while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
4481	(TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
4482	sc->sc_request_r_scale++;
4483	} else {
4484	sc->sc_requested_s_scale = `15`;
4485	sc->sc_request_r_scale = `15`;
4486	}
4487	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
4488	sc->sc_flags \|= SCF_SACK_PERMIT;
4489
4490	/*
4491	* ECN setup packet received.
4492	*/
4493	if ((th->th_flags & (TH_ECE\|TH_CWR)) && tcp_do_ecn)
4494	sc->sc_flags \|= SCF_ECN_PERMIT;
4495
4496	#ifdef TCP_SIGNATURE
4497	if (tb.t_flags & TF_SIGNATURE)
4498	sc->sc_flags \|= SCF_SIGNATURE;
4499	#endif
4500	sc->sc_tp = tp;
4501	if (syn_cache_respond(sc, m) == `0`) {
4502	uint64_t *tcps = TCP_STAT_GETREF();
4503	tcps[TCP_STAT_SNDACKS]++;
4504	tcps[TCP_STAT_SNDTOTAL]++;
4505	TCP_STAT_PUTREF();
4506	syn_cache_insert(sc, tp);
4507	} else {
4508	s = splsoftnet();
4509	/*
4510	* syn_cache_put() will try to schedule the timer, so
4511	* we need to initialize it
4512	*/
4513	SYN_CACHE_TIMER_ARM(sc);
4514	syn_cache_put(sc);
4515	splx(s);
4516	TCP_STATINC(TCP_STAT_SC_DROPPED);
4517	}
4518	return (`1`);
4519	}
4520
4521	/*
4522	* syn_cache_respond: (re)send SYN+ACK.
4523	*
4524	* returns 0 on success. otherwise returns an errno, typically ENOBUFS.
4525	*/
4526
4527	int
4528	syn_cache_respond(struct syn_cache sc, struct* mbuf *m)
4529	{
4530	#ifdef INET6
4531	struct rtentry *rt;
4532	#endif
4533	struct route *ro;
4534	u_int8_t *optp;
4535	int optlen, error;
4536	u_int16_t tlen;
4537	struct ip *ip = NULL;
4538	#ifdef INET6
4539	struct ip6_hdr *ip6 = NULL;
4540	#endif
4541	struct tcpcb *tp = NULL;
4542	struct tcphdr *th;
4543	u_int hlen;
4544	struct socket *so;
4545
4546	ro = &sc->sc_route;
4547	switch (sc->sc_src.sa.sa_family) {
4548	case AF_INET:
4549	hlen = sizeof(struct ip);
4550	break;
4551	#ifdef INET6
4552	case AF_INET6:
4553	hlen = sizeof(struct ip6_hdr);
4554	break;
4555	#endif
4556	default:
4557	if (m)
4558	m_freem(m);
4559	return (EAFNOSUPPORT);
4560	}
4561
4562	/ Compute the size of the TCP options. /
4563	optlen = `4` + (sc->sc_request_r_scale != `15` ? `4` : `0`) +
4564	((sc->sc_flags & SCF_SACK_PERMIT) ? (TCPOLEN_SACK_PERMITTED + `2`) : `0`) +
4565	#ifdef TCP_SIGNATURE
4566	((sc->sc_flags & SCF_SIGNATURE) ? (TCPOLEN_SIGNATURE + `2`) : `0`) +
4567	#endif
4568	((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : `0`);
4569
4570	tlen = hlen + sizeof(struct tcphdr) + optlen;
4571
4572	/*
4573	* Create the IP+TCP header from scratch.
4574	*/
4575	if (m)
4576	m_freem(m);
4577	#ifdef DIAGNOSTIC
4578	if (max_linkhdr + tlen > MCLBYTES)
4579	return (ENOBUFS);
4580	#endif
4581	MGETHDR(m, M_DONTWAIT, MT_DATA);
4582	if (m && (max_linkhdr + tlen) > MHLEN) {
4583	MCLGET(m, M_DONTWAIT);
4584	if ((m->m_flags & M_EXT) == `0`) {
4585	m_freem(m);
4586	m = NULL;
4587	}
4588	}
4589	if (m == NULL)
4590	return (ENOBUFS);
4591	MCLAIM(m, &tcp_tx_mowner);
4592
4593	/ Fixup the mbuf. /
4594	m->m_data += max_linkhdr;
4595	m->m_len = m->m_pkthdr.len = tlen;
4596	if (sc->sc_tp) {
4597	tp = sc->sc_tp;
4598	if (tp->t_inpcb)
4599	so = tp->t_inpcb->inp_socket;
4600	#ifdef INET6
4601	else if (tp->t_in6pcb)
4602	so = tp->t_in6pcb->in6p_socket;
4603	#endif
4604	else
4605	so = NULL;
4606	} else
4607	so = NULL;
4608	m_reset_rcvif(m);
4609	memset(mtod(m, u_char *), `0`, tlen);
4610
4611	switch (sc->sc_src.sa.sa_family) {
4612	case AF_INET:
4613	ip = mtod(m, struct ip *);
4614	ip->ip_v = `4`;
4615	ip->ip_dst = sc->sc_src.sin.sin_addr;
4616	ip->ip_src = sc->sc_dst.sin.sin_addr;
4617	ip->ip_p = IPPROTO_TCP;
4618	th = (struct tcphdr *)(ip + `1`);
4619	th->th_dport = sc->sc_src.sin.sin_port;
4620	th->th_sport = sc->sc_dst.sin.sin_port;
4621	break;
4622	#ifdef INET6
4623	case AF_INET6:
4624	ip6 = mtod(m, struct ip6_hdr *);
4625	ip6->ip6_vfc = IPV6_VERSION;
4626	ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
4627	ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
4628	ip6->ip6_nxt = IPPROTO_TCP;
4629	/ ip6_plen will be updated in ip6_output() /
4630	th = (struct tcphdr *)(ip6 + `1`);
4631	th->th_dport = sc->sc_src.sin6.sin6_port;
4632	th->th_sport = sc->sc_dst.sin6.sin6_port;
4633	break;
4634	#endif
4635	default:
4636	th = NULL;
4637	}
4638
4639	th->th_seq = htonl(sc->sc_iss);
4640	th->th_ack = htonl(sc->sc_irs + `1`);
4641	th->th_off = (sizeof(struct tcphdr) + optlen) >> `2`;
4642	th->th_flags = TH_SYN\|TH_ACK;
4643	th->th_win = htons(sc->sc_win);
4644	/ th_sum already 0 /
4645	/ th_urp already 0 /
4646
4647	/ Tack on the TCP options. /
4648	optp = (u_int8_t *)(th + `1`);
4649	*optp++ = TCPOPT_MAXSEG;
4650	*optp++ = `4`;
4651	*optp++ = (sc->sc_ourmaxseg >> `8`) & `0xff`;
4652	*optp++ = sc->sc_ourmaxseg & `0xff`;
4653
4654	if (sc->sc_request_r_scale != `15`) {
4655	((u_int32_t )optp) = htonl(TCPOPT_NOP << `24` \|
4656	TCPOPT_WINDOW << `16` \| TCPOLEN_WINDOW << `8` \|
4657	sc->sc_request_r_scale);
4658	optp += `4`;
4659	}
4660
4661	if (sc->sc_flags & SCF_TIMESTAMP) {
4662	u_int32_t lp = (u_int32_t )(optp);
4663	/ Form timestamp option as shown in appendix A of RFC 1323. /
4664	*lp++ = htonl(TCPOPT_TSTAMP_HDR);
4665	*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
4666	*lp = htonl(sc->sc_timestamp);
4667	optp += TCPOLEN_TSTAMP_APPA;
4668	}
4669
4670	if (sc->sc_flags & SCF_SACK_PERMIT) {
4671	u_int8_t *p = optp;
4672
4673	/ Let the peer know that we will SACK. /
4674	p[`0`] = TCPOPT_SACK_PERMITTED;
4675	p[`1`] = `2`;
4676	p[`2`] = TCPOPT_NOP;
4677	p[`3`] = TCPOPT_NOP;
4678	optp += `4`;
4679	}
4680
4681	/*
4682	* Send ECN SYN-ACK setup packet.
4683	* Routes can be asymetric, so, even if we receive a packet
4684	* with ECE and CWR set, we must not assume no one will block
4685	* the ECE packet we are about to send.
4686	*/
4687	if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
4688	SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
4689	th->th_flags \|= TH_ECE;
4690	TCP_STATINC(TCP_STAT_ECN_SHS);
4691
4692	/*
4693	* draft-ietf-tcpm-ecnsyn-00.txt
4694	*
4695	* "[...] a TCP node MAY respond to an ECN-setup
4696	* SYN packet by setting ECT in the responding
4697	* ECN-setup SYN/ACK packet, indicating to routers
4698	* that the SYN/ACK packet is ECN-Capable.
4699	* This allows a congested router along the path
4700	* to mark the packet instead of dropping the
4701	* packet as an indication of congestion."
4702	*
4703	* "[...] There can be a great benefit in setting
4704	* an ECN-capable codepoint in SYN/ACK packets [...]
4705	* Congestion is most likely to occur in
4706	* the server-to-client direction. As a result,
4707	* setting an ECN-capable codepoint in SYN/ACK
4708	* packets can reduce the occurence of three-second
4709	* retransmit timeouts resulting from the drop
4710	* of SYN/ACK packets."
4711	*
4712	* Page 4 and 6, January 2006.
4713	*/
4714
4715	switch (sc->sc_src.sa.sa_family) {
4716	#ifdef INET
4717	case AF_INET:
4718	ip->ip_tos \|= IPTOS_ECN_ECT0;
4719	break;
4720	#endif
4721	#ifdef INET6
4722	case AF_INET6:
4723	ip6->ip6_flow \|= htonl(IPTOS_ECN_ECT0 << `20`);
4724	break;
4725	#endif
4726	}
4727	TCP_STATINC(TCP_STAT_ECN_ECT);
4728	}
4729
4730	#ifdef TCP_SIGNATURE
4731	if (sc->sc_flags & SCF_SIGNATURE) {
4732	struct secasvar *sav;
4733	u_int8_t *sigp;
4734
4735	sav = tcp_signature_getsav(m, th);
4736
4737	if (sav == NULL) {
4738	if (m)
4739	m_freem(m);
4740	return (EPERM);
4741	}
4742
4743	*optp++ = TCPOPT_SIGNATURE;
4744	*optp++ = TCPOLEN_SIGNATURE;
4745	sigp = optp;
4746	memset(optp, `0`, TCP_SIGLEN);
4747	optp += TCP_SIGLEN;
4748	*optp++ = TCPOPT_NOP;
4749	*optp++ = TCPOPT_EOL;
4750
4751	(void)tcp_signature(m, th, hlen, sav, sigp);
4752
4753	key_sa_recordxfer(sav, m);
4754	KEY_FREESAV(&sav);
4755	}
4756	#endif
4757
4758	/ Compute the packet's checksum. /
4759	switch (sc->sc_src.sa.sa_family) {
4760	case AF_INET:
4761	ip->ip_len = htons(tlen - hlen);
4762	th->th_sum = `0`;
4763	th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
4764	break;
4765	#ifdef INET6
4766	case AF_INET6:
4767	ip6->ip6_plen = htons(tlen - hlen);
4768	th->th_sum = `0`;
4769	th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
4770	break;
4771	#endif
4772	}
4773
4774	/*
4775	* Fill in some straggling IP bits. Note the stack expects
4776	* ip_len to be in host order, for convenience.
4777	*/
4778	switch (sc->sc_src.sa.sa_family) {
4779	#ifdef INET
4780	case AF_INET:
4781	ip->ip_len = htons(tlen);
4782	ip->ip_ttl = ip_defttl;
4783	/ XXX tos? /
4784	break;
4785	#endif
4786	#ifdef INET6
4787	case AF_INET6:
4788	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
4789	ip6->ip6_vfc \|= IPV6_VERSION;
4790	ip6->ip6_plen = htons(tlen - hlen);
4791	/ ip6_hlim will be initialized afterwards /
4792	/ XXX flowlabel? /
4793	break;
4794	#endif
4795	}
4796
4797	/ XXX use IPsec policy on listening socket, on SYN ACK /
4798	tp = sc->sc_tp;
4799
4800	switch (sc->sc_src.sa.sa_family) {
4801	#ifdef INET
4802	case AF_INET:
4803	error = ip_output(m, sc->sc_ipopts, ro,
4804	(ip_mtudisc ? IP_MTUDISC : `0`),
4805	NULL, so);
4806	break;
4807	#endif
4808	#ifdef INET6
4809	case AF_INET6:
4810	ip6->ip6_hlim = in6_selecthlim(NULL,
4811	(rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
4812
4813	error = ip6_output(m, NULL /XXX/, ro, `0`, NULL, so, NULL);
4814	break;
4815	#endif
4816	default:
4817	error = EAFNOSUPPORT;
4818	break;
4819	}
4820	return (error);
4821	}
4822

Browse the source code of src/src/sys/netinet/tcp_input.c