1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 *					request_sock handling and moved
40 *					most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/tcp.h>
68#include <net/ipv6.h>
69#include <net/inet_common.h>
70#include <net/xfrm.h>
71
72#include <linux/inet.h>
73#include <linux/ipv6.h>
74#include <linux/stddef.h>
75#include <linux/proc_fs.h>
76#include <linux/seq_file.h>
77
78extern int sysctl_ip_dynaddr;
79int sysctl_tcp_tw_reuse;
80int sysctl_tcp_low_latency;
81
82/* Check TCP sequence numbers in ICMP packets. */
83#define ICMP_MIN_LENGTH 8
84
85/* Socket used for sending RSTs */
86static struct socket *tcp_socket;
87
88void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89 struct sk_buff *skb);
90
91struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
93 .__tcp_lhash_users = ATOMIC_INIT(0),
94 .__tcp_lhash_wait
95 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
97};
98
99/*
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
103 */
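/* For example, the range can be widened at run time through the usual
 * sysctl interface (a sketch of the userspace side, assuming the standard
 * procfs path for this tunable):
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 */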
104int sysctl_local_port_range[2] = { 1024, 4999 };
105int tcp_port_rover = 1024 - 1;
106
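/* Fold the connection 4-tuple down to an index into the established hash
 * table; tcp_ehash_size is a power of two, so the final mask picks the
 * bucket.
 */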
107static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108 __u32 faddr, __u16 fport)
109{
110 int h = (laddr ^ lport) ^ (faddr ^ fport);
111 h ^= h >> 16;
112 h ^= h >> 8;
113 return h & (tcp_ehash_size - 1);
114}
115
116static __inline__ int tcp_sk_hashfn(struct sock *sk)
117{
118 struct inet_sock *inet = inet_sk(sk);
119 __u32 laddr = inet->rcv_saddr;
120 __u16 lport = inet->num;
121 __u32 faddr = inet->daddr;
122 __u16 fport = inet->dport;
123
124 return tcp_hashfn(laddr, lport, faddr, fport);
125}
126
127/* Allocate and initialize a new TCP local port bind bucket.
128 * The bindhash mutex for snum's hash chain must be held here.
129 */
130struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131 unsigned short snum)
132{
133 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134 SLAB_ATOMIC);
135 if (tb) {
136 tb->port = snum;
137 tb->fastreuse = 0;
138 INIT_HLIST_HEAD(&tb->owners);
139 hlist_add_head(&tb->node, &head->chain);
140 }
141 return tb;
142}
143
144/* Caller must hold hashbucket lock for this tb with local BH disabled */
145void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
146{
147 if (hlist_empty(&tb->owners)) {
148 __hlist_del(&tb->node);
149 kmem_cache_free(tcp_bucket_cachep, tb);
150 }
151}
152
153/* Caller must disable local BH processing. */
154static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
155{
156 struct tcp_bind_hashbucket *head =
157 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158 struct tcp_bind_bucket *tb;
159
160 spin_lock(&head->lock);
161 tb = tcp_sk(sk)->bind_hash;
162 sk_add_bind_node(child, &tb->owners);
163 tcp_sk(child)->bind_hash = tb;
164 spin_unlock(&head->lock);
165}
166
167inline void tcp_inherit_port(struct sock *sk, struct sock *child)
168{
169 local_bh_disable();
170 __tcp_inherit_port(sk, child);
171 local_bh_enable();
172}
173
174void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175 unsigned short snum)
176{
177 inet_sk(sk)->num = snum;
178 sk_add_bind_node(sk, &tb->owners);
179 tcp_sk(sk)->bind_hash = tb;
180}
181
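/* A bind conflict exists when another socket already bound to this bucket
 * shares the device (or either socket is unbound), and either one of the
 * two does not allow address reuse (or the other socket is listening)
 * while their receive addresses overlap (equal, or one is a wildcard).
 */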
182static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183{
184 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185 struct sock *sk2;
186 struct hlist_node *node;
187 int reuse = sk->sk_reuse;
188
189 sk_for_each_bound(sk2, node, &tb->owners) {
190 if (sk != sk2 &&
191 !tcp_v6_ipv6only(sk2) &&
192 (!sk->sk_bound_dev_if ||
193 !sk2->sk_bound_dev_if ||
194 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195 if (!reuse || !sk2->sk_reuse ||
196 sk2->sk_state == TCP_LISTEN) {
197 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199 sk2_rcv_saddr == sk_rcv_saddr)
200 break;
201 }
202 }
203 }
204 return node != NULL;
205}
206
207/* Obtain a reference to a local port for the given sock,
208 * if snum is zero it means select any available local port.
209 */
210static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
211{
212 struct tcp_bind_hashbucket *head;
213 struct hlist_node *node;
214 struct tcp_bind_bucket *tb;
215 int ret;
216
217 local_bh_disable();
218 if (!snum) {
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
222 int rover;
223
224 spin_lock(&tcp_portalloc_lock);
225		if (tcp_port_rover < low)
226 rover = low;
227 else
228 rover = tcp_port_rover;
229		do {
230 rover++;
231			if (rover > high)
232				rover = low;
233 head = &tcp_bhash[tcp_bhashfn(rover)];
234 spin_lock(&head->lock);
235 tb_for_each(tb, node, &head->chain)
236 if (tb->port == rover)
237 goto next;
238 break;
239 next:
240 spin_unlock(&head->lock);
241 } while (--remaining > 0);
242 tcp_port_rover = rover;
243 spin_unlock(&tcp_portalloc_lock);
244
245		/* Exhausted local port range during search? It is not
246 * possible for us to be holding one of the bind hash
247 * locks if this test triggers, because if 'remaining'
248 * drops to zero, we broke out of the do/while loop at
249 * the top level, not from the 'break;' statement.
250 */
251		ret = 1;
252		if (unlikely(remaining <= 0))
253			goto fail;
254
255 /* OK, here is the one we will use. HEAD is
256		 * non-NULL and we hold its mutex.
257 */
258 snum = rover;
259 } else {
260 head = &tcp_bhash[tcp_bhashfn(snum)];
261 spin_lock(&head->lock);
262 tb_for_each(tb, node, &head->chain)
263 if (tb->port == snum)
264 goto tb_found;
265 }
266 tb = NULL;
267 goto tb_not_found;
268tb_found:
269 if (!hlist_empty(&tb->owners)) {
270 if (sk->sk_reuse > 1)
271 goto success;
272 if (tb->fastreuse > 0 &&
273 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
274 goto success;
275 } else {
276 ret = 1;
277 if (tcp_bind_conflict(sk, tb))
278 goto fail_unlock;
279 }
280 }
281tb_not_found:
282 ret = 1;
283 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
284 goto fail_unlock;
285 if (hlist_empty(&tb->owners)) {
286 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
287 tb->fastreuse = 1;
288 else
289 tb->fastreuse = 0;
290 } else if (tb->fastreuse &&
291 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
292 tb->fastreuse = 0;
293success:
294 if (!tcp_sk(sk)->bind_hash)
295 tcp_bind_hash(sk, tb, snum);
296 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
297 ret = 0;
298
299fail_unlock:
300 spin_unlock(&head->lock);
301fail:
302 local_bh_enable();
303 return ret;
304}
305
306/* Get rid of any references to a local port held by the
307 * given sock.
308 */
309static void __tcp_put_port(struct sock *sk)
310{
311 struct inet_sock *inet = inet_sk(sk);
312 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
313 struct tcp_bind_bucket *tb;
314
315 spin_lock(&head->lock);
316 tb = tcp_sk(sk)->bind_hash;
317 __sk_del_bind_node(sk);
318 tcp_sk(sk)->bind_hash = NULL;
319 inet->num = 0;
320 tcp_bucket_destroy(tb);
321 spin_unlock(&head->lock);
322}
323
324void tcp_put_port(struct sock *sk)
325{
326 local_bh_disable();
327 __tcp_put_port(sk);
328 local_bh_enable();
329}
330
331/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
332 * Look, when several writers sleep and a reader wakes them up, all but one
333 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
334 * this, _but_ remember, it adds useless work on UP machines (wake up each
335 * exclusive lock release). It should be ifdefed really.
336 */
337
338void tcp_listen_wlock(void)
339{
340 write_lock(&tcp_lhash_lock);
341
342 if (atomic_read(&tcp_lhash_users)) {
343 DEFINE_WAIT(wait);
344
345 for (;;) {
346 prepare_to_wait_exclusive(&tcp_lhash_wait,
347 &wait, TASK_UNINTERRUPTIBLE);
348 if (!atomic_read(&tcp_lhash_users))
349 break;
350 write_unlock_bh(&tcp_lhash_lock);
351 schedule();
352 write_lock_bh(&tcp_lhash_lock);
353 }
354
355 finish_wait(&tcp_lhash_wait, &wait);
356 }
357}
358
359static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
360{
361 struct hlist_head *list;
362 rwlock_t *lock;
363
364 BUG_TRAP(sk_unhashed(sk));
365 if (listen_possible && sk->sk_state == TCP_LISTEN) {
366 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
367 lock = &tcp_lhash_lock;
368 tcp_listen_wlock();
369 } else {
370 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
371 lock = &tcp_ehash[sk->sk_hashent].lock;
372 write_lock(lock);
373 }
374 __sk_add_node(sk, list);
375 sock_prot_inc_use(sk->sk_prot);
376 write_unlock(lock);
377 if (listen_possible && sk->sk_state == TCP_LISTEN)
378 wake_up(&tcp_lhash_wait);
379}
380
381static void tcp_v4_hash(struct sock *sk)
382{
383 if (sk->sk_state != TCP_CLOSE) {
384 local_bh_disable();
385 __tcp_v4_hash(sk, 1);
386 local_bh_enable();
387 }
388}
389
390void tcp_unhash(struct sock *sk)
391{
392 rwlock_t *lock;
393
394 if (sk_unhashed(sk))
395 goto ende;
396
397 if (sk->sk_state == TCP_LISTEN) {
398 local_bh_disable();
399 tcp_listen_wlock();
400 lock = &tcp_lhash_lock;
401 } else {
402 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
403 lock = &head->lock;
404 write_lock_bh(&head->lock);
405 }
406
407 if (__sk_del_node_init(sk))
408 sock_prot_dec_use(sk->sk_prot);
409 write_unlock_bh(lock);
410
411 ende:
412 if (sk->sk_state == TCP_LISTEN)
413 wake_up(&tcp_lhash_wait);
414}
415
416/* Don't inline this cruft. There are some nice properties to
417 * exploit here. The BSD API does not allow a listening TCP
418 * to specify the remote port nor the remote address for the
419 * connection. So always assume those are both wildcarded
420 * during the search since they can never be otherwise.
421 */
422static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
423 unsigned short hnum, int dif)
424{
425 struct sock *result = NULL, *sk;
426 struct hlist_node *node;
427 int score, hiscore;
428
429 hiscore=-1;
430 sk_for_each(sk, node, head) {
431 struct inet_sock *inet = inet_sk(sk);
432
433 if (inet->num == hnum && !ipv6_only_sock(sk)) {
434 __u32 rcv_saddr = inet->rcv_saddr;
435
436 score = (sk->sk_family == PF_INET ? 1 : 0);
437 if (rcv_saddr) {
438 if (rcv_saddr != daddr)
439 continue;
440 score+=2;
441 }
442 if (sk->sk_bound_dev_if) {
443 if (sk->sk_bound_dev_if != dif)
444 continue;
445 score+=2;
446 }
447 if (score == 5)
448 return sk;
449 if (score > hiscore) {
450 hiscore = score;
451 result = sk;
452 }
453 }
454 }
455 return result;
456}
457
458/* Optimize the common listener case. */
459static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
460 unsigned short hnum, int dif)
461{
462 struct sock *sk = NULL;
463 struct hlist_head *head;
464
465 read_lock(&tcp_lhash_lock);
466 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
467 if (!hlist_empty(head)) {
468 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
469
470 if (inet->num == hnum && !sk->sk_node.next &&
471 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
472 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
473 !sk->sk_bound_dev_if)
474 goto sherry_cache;
475 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
476 }
477 if (sk) {
478sherry_cache:
479 sock_hold(sk);
480 }
481 read_unlock(&tcp_lhash_lock);
482 return sk;
483}
484
485/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
486 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
487 *
488 * Local BH must be disabled here.
489 */
490
491static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
492 u32 daddr, u16 hnum,
493 int dif)
494{
495 struct tcp_ehash_bucket *head;
496 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
497 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
498 struct sock *sk;
499 struct hlist_node *node;
500 /* Optimize here for direct hit, only listening connections can
501 * have wildcards anyways.
502 */
503 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
504 head = &tcp_ehash[hash];
505 read_lock(&head->lock);
506 sk_for_each(sk, node, &head->chain) {
507 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
508 goto hit; /* You sunk my battleship! */
509 }
510
511 /* Must check for a TIME_WAIT'er before going to listener hash. */
512 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
513 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
514 goto hit;
515 }
516 sk = NULL;
517out:
518 read_unlock(&head->lock);
519 return sk;
520hit:
521 sock_hold(sk);
522 goto out;
523}
524
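/* Look up the socket for an incoming segment: try the established (and
 * TIME-WAIT) hash first, and fall back to the listening hash only when
 * no exact four-tuple match exists.
 */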
525static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
526 u32 daddr, u16 hnum, int dif)
527{
528 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
529 daddr, hnum, dif);
530
531 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
532}
533
534inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
535 u16 dport, int dif)
536{
537 struct sock *sk;
538
539 local_bh_disable();
540 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
541 local_bh_enable();
542
543 return sk;
544}
545
546EXPORT_SYMBOL_GPL(tcp_v4_lookup);
547
548static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
549{
550 return secure_tcp_sequence_number(skb->nh.iph->daddr,
551 skb->nh.iph->saddr,
552 skb->h.th->dest,
553 skb->h.th->source);
554}
555
556/* called with local bh disabled */
557static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
558 struct tcp_tw_bucket **twp)
559{
560 struct inet_sock *inet = inet_sk(sk);
561 u32 daddr = inet->rcv_saddr;
562 u32 saddr = inet->daddr;
563 int dif = sk->sk_bound_dev_if;
564 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
565 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
566 int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
567 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
568 struct sock *sk2;
569 struct hlist_node *node;
570 struct tcp_tw_bucket *tw;
571
572 write_lock(&head->lock);
573
574 /* Check TIME-WAIT sockets first. */
575 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
576 tw = (struct tcp_tw_bucket *)sk2;
577
578 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
579 struct tcp_sock *tp = tcp_sk(sk);
580
581 /* With PAWS, it is safe from the viewpoint
582 of data integrity. Even without PAWS it
583 is safe provided sequence spaces do not
584 overlap i.e. at data rates <= 80Mbit/sec.
585
586 Actually, the idea is close to VJ's one,
587 only timestamp cache is held not per host,
588 but per port pair and TW bucket is used
589 as state holder.
590
591 If TW bucket has been already destroyed we
592 fall back to VJ's scheme and use initial
593 timestamp retrieved from peer table.
594 */
595 if (tw->tw_ts_recent_stamp &&
596 (!twp || (sysctl_tcp_tw_reuse &&
597 xtime.tv_sec -
598 tw->tw_ts_recent_stamp > 1))) {
599 if ((tp->write_seq =
600 tw->tw_snd_nxt + 65535 + 2) == 0)
601 tp->write_seq = 1;
602 tp->rx_opt.ts_recent = tw->tw_ts_recent;
603 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
604 sock_hold(sk2);
605 goto unique;
606 } else
607 goto not_unique;
608 }
609 }
610 tw = NULL;
611
612 /* And established part... */
613 sk_for_each(sk2, node, &head->chain) {
614 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
615 goto not_unique;
616 }
617
618unique:
619 /* Must record num and sport now. Otherwise we will see
620	 * in the hash table a socket with a funny identity. */
621 inet->num = lport;
622 inet->sport = htons(lport);
623 sk->sk_hashent = hash;
624 BUG_TRAP(sk_unhashed(sk));
625 __sk_add_node(sk, &head->chain);
626 sock_prot_inc_use(sk->sk_prot);
627 write_unlock(&head->lock);
628
629 if (twp) {
630 *twp = tw;
631 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
632 } else if (tw) {
633 /* Silly. Should hash-dance instead... */
634 tcp_tw_deschedule(tw);
635 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
636
637 tcp_tw_put(tw);
638 }
639
640 return 0;
641
642not_unique:
643 write_unlock(&head->lock);
644 return -EADDRNOTAVAIL;
645}
646
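/* Per-destination starting point for the ephemeral port search below,
 * derived from a secure hash of the connection's addresses and the
 * destination port, so different peers walk the port space differently.
 */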
647static inline u32 connect_port_offset(const struct sock *sk)
648{
649 const struct inet_sock *inet = inet_sk(sk);
650
651 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
652 inet->dport);
653}
654
655/*
656 * Bind a port for a connect operation and hash it.
657 */
658static inline int tcp_v4_hash_connect(struct sock *sk)
659{
660 unsigned short snum = inet_sk(sk)->num;
661 struct tcp_bind_hashbucket *head;
662 struct tcp_bind_bucket *tb;
663 int ret;
664
665 if (!snum) {
666 int low = sysctl_local_port_range[0];
667 int high = sysctl_local_port_range[1];
668 int range = high - low;
669 int i;
670 int port;
671 static u32 hint;
672 u32 offset = hint + connect_port_offset(sk);
673 struct hlist_node *node;
674 struct tcp_tw_bucket *tw = NULL;
675
676 local_bh_disable();
677 for (i = 1; i <= range; i++) {
678 port = low + (i + offset) % range;
679 head = &tcp_bhash[tcp_bhashfn(port)];
680 spin_lock(&head->lock);
681
682 /* Does not bother with rcv_saddr checks,
683 * because the established check is already
684 * unique enough.
685 */
686 tb_for_each(tb, node, &head->chain) {
687 if (tb->port == port) {
688 BUG_TRAP(!hlist_empty(&tb->owners));
689 if (tb->fastreuse >= 0)
690 goto next_port;
691 if (!__tcp_v4_check_established(sk,
692 port,
693 &tw))
694 goto ok;
695 goto next_port;
696 }
697 }
698
699 tb = tcp_bucket_create(head, port);
700 if (!tb) {
701 spin_unlock(&head->lock);
702 break;
703 }
704 tb->fastreuse = -1;
705 goto ok;
706
707 next_port:
708 spin_unlock(&head->lock);
709 }
710 local_bh_enable();
711
712 return -EADDRNOTAVAIL;
713
714ok:
715 hint += i;
716
717 /* Head lock still held and bh's disabled */
718 tcp_bind_hash(sk, tb, port);
719 if (sk_unhashed(sk)) {
720 inet_sk(sk)->sport = htons(port);
721 __tcp_v4_hash(sk, 0);
722 }
723 spin_unlock(&head->lock);
724
725 if (tw) {
726 tcp_tw_deschedule(tw);
727 tcp_tw_put(tw);
728 }
729
730 ret = 0;
731 goto out;
732 }
733
734 head = &tcp_bhash[tcp_bhashfn(snum)];
735 tb = tcp_sk(sk)->bind_hash;
736 spin_lock_bh(&head->lock);
737 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
738 __tcp_v4_hash(sk, 0);
739 spin_unlock_bh(&head->lock);
740 return 0;
741 } else {
742 spin_unlock(&head->lock);
743 /* No definite answer... Walk to established hash table */
744 ret = __tcp_v4_check_established(sk, snum, NULL);
745out:
746 local_bh_enable();
747 return ret;
748 }
749}
750
751/* This will initiate an outgoing connection. */
752int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
753{
754 struct inet_sock *inet = inet_sk(sk);
755 struct tcp_sock *tp = tcp_sk(sk);
756 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
757 struct rtable *rt;
758 u32 daddr, nexthop;
759 int tmp;
760 int err;
761
762 if (addr_len < sizeof(struct sockaddr_in))
763 return -EINVAL;
764
765 if (usin->sin_family != AF_INET)
766 return -EAFNOSUPPORT;
767
768 nexthop = daddr = usin->sin_addr.s_addr;
769 if (inet->opt && inet->opt->srr) {
770 if (!daddr)
771 return -EINVAL;
772 nexthop = inet->opt->faddr;
773 }
774
775 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
776 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
777 IPPROTO_TCP,
778 inet->sport, usin->sin_port, sk);
779 if (tmp < 0)
780 return tmp;
781
782 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
783 ip_rt_put(rt);
784 return -ENETUNREACH;
785 }
786
787 if (!inet->opt || !inet->opt->srr)
788 daddr = rt->rt_dst;
789
790 if (!inet->saddr)
791 inet->saddr = rt->rt_src;
792 inet->rcv_saddr = inet->saddr;
793
794 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
795 /* Reset inherited state */
796 tp->rx_opt.ts_recent = 0;
797 tp->rx_opt.ts_recent_stamp = 0;
798 tp->write_seq = 0;
799 }
800
801 if (sysctl_tcp_tw_recycle &&
802 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
803 struct inet_peer *peer = rt_get_peer(rt);
804
805 /* VJ's idea. We save last timestamp seen from
806 * the destination in peer table, when entering state TIME-WAIT
807 * and initialize rx_opt.ts_recent from it, when trying new connection.
808 */
809
810 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
811 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
812 tp->rx_opt.ts_recent = peer->tcp_ts;
813 }
814 }
815
816 inet->dport = usin->sin_port;
817 inet->daddr = daddr;
818
819 tp->ext_header_len = 0;
820 if (inet->opt)
821 tp->ext_header_len = inet->opt->optlen;
822
823 tp->rx_opt.mss_clamp = 536;
824
825 /* Socket identity is still unknown (sport may be zero).
826	 * However we set state to SYN-SENT and, without releasing the socket
827	 * lock, select a source port, enter ourselves into the hash tables and
828 * complete initialization after this.
829 */
830 tcp_set_state(sk, TCP_SYN_SENT);
831 err = tcp_v4_hash_connect(sk);
832 if (err)
833 goto failure;
834
835 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
836 if (err)
837 goto failure;
838
839 /* OK, now commit destination to socket. */
840	sk_setup_caps(sk, &rt->u.dst);
841
842 if (!tp->write_seq)
843 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
844 inet->daddr,
845 inet->sport,
846 usin->sin_port);
847
848 inet->id = tp->write_seq ^ jiffies;
849
850 err = tcp_connect(sk);
851 rt = NULL;
852 if (err)
853 goto failure;
854
855 return 0;
856
857failure:
858 /* This unhashes the socket and releases the local port, if necessary. */
859 tcp_set_state(sk, TCP_CLOSE);
860 ip_rt_put(rt);
861 sk->sk_route_caps = 0;
862 inet->dport = 0;
863 return err;
864}
865
866static __inline__ int tcp_v4_iif(struct sk_buff *skb)
867{
868 return ((struct rtable *)skb->dst)->rt_iif;
869}
870
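/* Hash a (remote address, remote port) pair into the per-listener SYN
 * queue; hash_rnd is a per-listener random value that makes the bucket
 * choice hard for a remote sender to predict.
 */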
871static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
872{
873 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
874}
875
876static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
877					      struct request_sock ***prevp,
878					      __u16 rport,
879 __u32 raddr, __u32 laddr)
880{
881	struct listen_sock *lopt = tp->accept_queue.listen_opt;
882	struct request_sock *req, **prev;
883
884 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
885 (req = *prev) != NULL;
886 prev = &req->dl_next) {
887		const struct inet_request_sock *ireq = inet_rsk(req);
888
889 if (ireq->rmt_port == rport &&
890 ireq->rmt_addr == raddr &&
891 ireq->loc_addr == laddr &&
892		    TCP_INET_FAMILY(req->rsk_ops->family)) {
893			BUG_TRAP(!req->sk);
894 *prevp = prev;
895 break;
896 }
897 }
898
899 return req;
900}
901
902static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
903{
904 struct tcp_sock *tp = tcp_sk(sk);
905	struct listen_sock *lopt = tp->accept_queue.listen_opt;
906	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
907
908	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
909	tcp_synq_added(sk);
910}
911
912
913/*
914 * This routine does path mtu discovery as defined in RFC1191.
915 */
916static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
917 u32 mtu)
918{
919 struct dst_entry *dst;
920 struct inet_sock *inet = inet_sk(sk);
921 struct tcp_sock *tp = tcp_sk(sk);
922
923 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
924	 * sent out by Linux are always < 576 bytes so they should go through
925 * unfragmented).
926 */
927 if (sk->sk_state == TCP_LISTEN)
928 return;
929
930	/* We don't check in the dst entry if pmtu discovery is forbidden
931	 * on this route. We just assume that no packet-too-big packets
932	 * are sent back when pmtu discovery is not active.
933 * There is a small race when the user changes this flag in the
934 * route, but I think that's acceptable.
935 */
936 if ((dst = __sk_dst_check(sk, 0)) == NULL)
937 return;
938
939 dst->ops->update_pmtu(dst, mtu);
940
941 /* Something is about to be wrong... Remember soft error
942	 * for the case that this connection will not be able to recover.
943 */
944 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
945 sk->sk_err_soft = EMSGSIZE;
946
947 mtu = dst_mtu(dst);
948
949 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
950 tp->pmtu_cookie > mtu) {
951 tcp_sync_mss(sk, mtu);
952
953 /* Resend the TCP packet because it's
954 * clear that the old packet has been
955 * dropped. This is the new "fast" path mtu
956 * discovery.
957 */
958 tcp_simple_retransmit(sk);
959 } /* else let the usual retransmit timer handle it */
960}
961
962/*
963 * This routine is called by the ICMP module when it gets some
964 * sort of error condition. If err < 0 then the socket should
965 * be closed and the error returned to the user. If err > 0
966 * it's just the icmp type << 8 | icmp code. After adjustment
967 * header points to the first 8 bytes of the tcp header. We need
968 * to find the appropriate port.
969 *
970 * The locking strategy used here is very "optimistic". When
971 * someone else accesses the socket the ICMP is just dropped
972 * and for some paths there is no check at all.
973 * A more general error queue to queue errors for later handling
974 * is probably better.
975 *
976 */
977
978void tcp_v4_err(struct sk_buff *skb, u32 info)
979{
980 struct iphdr *iph = (struct iphdr *)skb->data;
981 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
982 struct tcp_sock *tp;
983 struct inet_sock *inet;
984 int type = skb->h.icmph->type;
985 int code = skb->h.icmph->code;
986 struct sock *sk;
987 __u32 seq;
988 int err;
989
990 if (skb->len < (iph->ihl << 2) + 8) {
991 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
992 return;
993 }
994
995 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
996 th->source, tcp_v4_iif(skb));
997 if (!sk) {
998 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
999 return;
1000 }
1001 if (sk->sk_state == TCP_TIME_WAIT) {
1002 tcp_tw_put((struct tcp_tw_bucket *)sk);
1003 return;
1004 }
1005
1006 bh_lock_sock(sk);
1007 /* If too many ICMPs get dropped on busy
1008 * servers this needs to be solved differently.
1009 */
1010 if (sock_owned_by_user(sk))
1011 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1012
1013 if (sk->sk_state == TCP_CLOSE)
1014 goto out;
1015
1016 tp = tcp_sk(sk);
1017 seq = ntohl(th->seq);
1018 if (sk->sk_state != TCP_LISTEN &&
1019 !between(seq, tp->snd_una, tp->snd_nxt)) {
1020 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1021 goto out;
1022 }
1023
1024 switch (type) {
1025 case ICMP_SOURCE_QUENCH:
1026 /* Just silently ignore these. */
1027 goto out;
1028 case ICMP_PARAMETERPROB:
1029 err = EPROTO;
1030 break;
1031 case ICMP_DEST_UNREACH:
1032 if (code > NR_ICMP_UNREACH)
1033 goto out;
1034
1035 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1036 if (!sock_owned_by_user(sk))
1037 do_pmtu_discovery(sk, iph, info);
1038 goto out;
1039 }
1040
1041 err = icmp_err_convert[code].errno;
1042 break;
1043 case ICMP_TIME_EXCEEDED:
1044 err = EHOSTUNREACH;
1045 break;
1046 default:
1047 goto out;
1048 }
1049
1050 switch (sk->sk_state) {
1051		struct request_sock *req, **prev;
1052	case TCP_LISTEN:
1053 if (sock_owned_by_user(sk))
1054 goto out;
1055
1056 req = tcp_v4_search_req(tp, &prev, th->dest,
1057 iph->daddr, iph->saddr);
1058 if (!req)
1059 goto out;
1060
1061 /* ICMPs are not backlogged, hence we cannot get
1062 an established socket here.
1063 */
1064 BUG_TRAP(!req->sk);
1065
1066		if (seq != tcp_rsk(req)->snt_isn) {
1067			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1068 goto out;
1069 }
1070
1071 /*
1072 * Still in SYN_RECV, just remove it silently.
1073 * There is no good way to pass the error to the newly
1074 * created socket, and POSIX does not want network
1075 * errors returned from accept().
1076 */
1077 tcp_synq_drop(sk, req, prev);
1078 goto out;
1079
1080 case TCP_SYN_SENT:
1081 case TCP_SYN_RECV: /* Cannot happen.
1082				   It can happen, e.g., if SYNs crossed.
1083 */
1084 if (!sock_owned_by_user(sk)) {
1085 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1086 sk->sk_err = err;
1087
1088 sk->sk_error_report(sk);
1089
1090 tcp_done(sk);
1091 } else {
1092 sk->sk_err_soft = err;
1093 }
1094 goto out;
1095 }
1096
1097 /* If we've already connected we will keep trying
1098 * until we time out, or the user gives up.
1099 *
1100 * rfc1122 4.2.3.9 allows to consider as hard errors
1101 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1102 * but it is obsoleted by pmtu discovery).
1103 *
1104	 * Note that in the modern internet, where routing is unreliable
1105	 * and broken firewalls sit in every dark corner, sending random
1106	 * errors ordered by their masters, even these two messages finally lose
1107 * their original sense (even Linux sends invalid PORT_UNREACHs)
1108 *
1109 * Now we are in compliance with RFCs.
1110 * --ANK (980905)
1111 */
1112
1113 inet = inet_sk(sk);
1114 if (!sock_owned_by_user(sk) && inet->recverr) {
1115 sk->sk_err = err;
1116 sk->sk_error_report(sk);
1117 } else { /* Only an error on timeout */
1118 sk->sk_err_soft = err;
1119 }
1120
1121out:
1122 bh_unlock_sock(sk);
1123 sock_put(sk);
1124}
1125
1126/* This routine computes an IPv4 TCP checksum. */
1127void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1128 struct sk_buff *skb)
1129{
1130 struct inet_sock *inet = inet_sk(sk);
1131
1132 if (skb->ip_summed == CHECKSUM_HW) {
1133 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1134 skb->csum = offsetof(struct tcphdr, check);
1135 } else {
1136 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1137 csum_partial((char *)th,
1138 th->doff << 2,
1139 skb->csum));
1140 }
1141}
1142
1143/*
1144 * This routine will send an RST to the other tcp.
1145 *
1146 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1147 *	for the reset.
1148 *	Answer: if a packet caused the RST, it is not for a socket
1149 *	existing in our system; if it is matched to a socket,
1150 *	it is just a duplicate segment or a bug in the other side's TCP.
1151 *	So we build the reply based only on the parameters
1152 *	that arrived with the segment.
1153 * Exception: precedence violation. We do not implement it in any case.
1154 */
1155
1156static void tcp_v4_send_reset(struct sk_buff *skb)
1157{
1158 struct tcphdr *th = skb->h.th;
1159 struct tcphdr rth;
1160 struct ip_reply_arg arg;
1161
1162 /* Never send a reset in response to a reset. */
1163 if (th->rst)
1164 return;
1165
1166 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1167 return;
1168
1169 /* Swap the send and the receive. */
1170 memset(&rth, 0, sizeof(struct tcphdr));
1171 rth.dest = th->source;
1172 rth.source = th->dest;
1173 rth.doff = sizeof(struct tcphdr) / 4;
1174 rth.rst = 1;
1175
1176 if (th->ack) {
1177 rth.seq = th->ack_seq;
1178 } else {
1179 rth.ack = 1;
1180 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1181 skb->len - (th->doff << 2));
1182 }
1183
1184 memset(&arg, 0, sizeof arg);
1185 arg.iov[0].iov_base = (unsigned char *)&rth;
1186 arg.iov[0].iov_len = sizeof rth;
1187 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1188 skb->nh.iph->saddr, /*XXX*/
1189 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1190 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1191
1192 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1193
1194 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1195 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1196}
1197
1198/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1199   outside socket context, is certainly ugly. What can I do?
1200 */
1201
1202static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1203 u32 win, u32 ts)
1204{
1205 struct tcphdr *th = skb->h.th;
1206 struct {
1207 struct tcphdr th;
1208 u32 tsopt[3];
1209 } rep;
1210 struct ip_reply_arg arg;
1211
1212 memset(&rep.th, 0, sizeof(struct tcphdr));
1213 memset(&arg, 0, sizeof arg);
1214
1215 arg.iov[0].iov_base = (unsigned char *)&rep;
1216 arg.iov[0].iov_len = sizeof(rep.th);
1217 if (ts) {
1218 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1219 (TCPOPT_TIMESTAMP << 8) |
1220 TCPOLEN_TIMESTAMP);
1221 rep.tsopt[1] = htonl(tcp_time_stamp);
1222 rep.tsopt[2] = htonl(ts);
1223 arg.iov[0].iov_len = sizeof(rep);
1224 }
1225
1226 /* Swap the send and the receive. */
1227 rep.th.dest = th->source;
1228 rep.th.source = th->dest;
1229 rep.th.doff = arg.iov[0].iov_len / 4;
1230 rep.th.seq = htonl(seq);
1231 rep.th.ack_seq = htonl(ack);
1232 rep.th.ack = 1;
1233 rep.th.window = htons(win);
1234
1235 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1236 skb->nh.iph->saddr, /*XXX*/
1237 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1238 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1239
1240 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1241
1242 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1243}
1244
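/* Acknowledge a segment that matched a TIME-WAIT bucket, using the
 * sequence state saved in the bucket, then drop the reference taken by
 * the lookup.
 */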
1245static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1246{
1247 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1248
1249 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1250 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1251
1252 tcp_tw_put(tw);
1253}
1254
1255static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1256{
1257	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1258			req->ts_recent);
1259}
1260
1261static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1262					  struct request_sock *req)
1263{
1264 struct rtable *rt;
1265	const struct inet_request_sock *ireq = inet_rsk(req);
1266	struct ip_options *opt = inet_rsk(req)->opt;
1267	struct flowi fl = { .oif = sk->sk_bound_dev_if,
1268 .nl_u = { .ip4_u =
1269 { .daddr = ((opt && opt->srr) ?
1270 opt->faddr :
1271						ireq->rmt_addr),
1272				   .saddr = ireq->loc_addr,
1273				   .tos = RT_CONN_FLAGS(sk) } },
1274 .proto = IPPROTO_TCP,
1275 .uli_u = { .ports =
1276 { .sport = inet_sk(sk)->sport,
1277				 .dport = ireq->rmt_port } } };
1278
1279 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1280 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1281 return NULL;
1282 }
1283 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1284 ip_rt_put(rt);
1285 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1286 return NULL;
1287 }
1288 return &rt->u.dst;
1289}
1290
1291/*
1292 * Send a SYN-ACK after having received an ACK.
1293 *	This still operates on a request_sock only, not on a big
1294 *	socket.
1295 */
1296static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1297			      struct dst_entry *dst)
1298{
1299	const struct inet_request_sock *ireq = inet_rsk(req);
1300	int err = -1;
1301 struct sk_buff * skb;
1302
1303 /* First, grab a route. */
1304 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1305 goto out;
1306
1307 skb = tcp_make_synack(sk, dst, req);
1308
1309 if (skb) {
1310 struct tcphdr *th = skb->h.th;
1311
1312 th->check = tcp_v4_check(th, skb->len,
1313					 ireq->loc_addr,
1314					 ireq->rmt_addr,
1315					 csum_partial((char *)th, skb->len,
1316 skb->csum));
1317
1318		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1319					    ireq->rmt_addr,
1320					    ireq->opt);
1321		if (err == NET_XMIT_CN)
1322 err = 0;
1323 }
1324
1325out:
1326 dst_release(dst);
1327 return err;
1328}
1329
1330/*
1331 *	IPv4 request_sock destructor.
1332 */
1333static void tcp_v4_reqsk_destructor(struct request_sock *req)
1334{
1335	if (inet_rsk(req)->opt)
1336		kfree(inet_rsk(req)->opt);
1337}
1338
1339static inline void syn_flood_warning(struct sk_buff *skb)
1340{
1341 static unsigned long warntime;
1342
1343 if (time_after(jiffies, (warntime + HZ * 60))) {
1344 warntime = jiffies;
1345 printk(KERN_INFO
1346 "possible SYN flooding on port %d. Sending cookies.\n",
1347 ntohs(skb->h.th->dest));
1348 }
1349}
1350
1351/*
1352 * Save and compile IPv4 options into the request_sock if needed.
1353 */
1354static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1355 struct sk_buff *skb)
1356{
1357 struct ip_options *opt = &(IPCB(skb)->opt);
1358 struct ip_options *dopt = NULL;
1359
1360 if (opt && opt->optlen) {
1361 int opt_size = optlength(opt);
1362 dopt = kmalloc(opt_size, GFP_ATOMIC);
1363 if (dopt) {
1364 if (ip_options_echo(dopt, skb)) {
1365 kfree(dopt);
1366 dopt = NULL;
1367 }
1368 }
1369 }
1370 return dopt;
1371}
1372
1373struct request_sock_ops tcp_request_sock_ops = {
1374	.family		= PF_INET,
1375	.obj_size	= sizeof(struct tcp_request_sock),
1376	.rtx_syn_ack	= tcp_v4_send_synack,
1377	.send_ack	= tcp_v4_reqsk_send_ack,
1378	.destructor	= tcp_v4_reqsk_destructor,
1379	.send_reset	= tcp_v4_send_reset,
1380};
1381
1382int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1383{
1384	struct inet_request_sock *ireq;
1385	struct tcp_options_received tmp_opt;
1386	struct request_sock *req;
1387	__u32 saddr = skb->nh.iph->saddr;
1388 __u32 daddr = skb->nh.iph->daddr;
1389 __u32 isn = TCP_SKB_CB(skb)->when;
1390 struct dst_entry *dst = NULL;
1391#ifdef CONFIG_SYN_COOKIES
1392 int want_cookie = 0;
1393#else
1394#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1395#endif
1396
1397	/* Never answer SYNs sent to broadcast or multicast */
1398 if (((struct rtable *)skb->dst)->rt_flags &
1399 (RTCF_BROADCAST | RTCF_MULTICAST))
1400 goto drop;
1401
1402 /* TW buckets are converted to open requests without
1403	 * limitations; they conserve resources and the peer is
1404	 * evidently a real one.
1405 */
1406 if (tcp_synq_is_full(sk) && !isn) {
1407#ifdef CONFIG_SYN_COOKIES
1408 if (sysctl_tcp_syncookies) {
1409 want_cookie = 1;
1410 } else
1411#endif
1412 goto drop;
1413 }
1414
1415 /* Accept backlog is full. If we have already queued enough
1416 * of warm entries in syn queue, drop request. It is better than
1417 * clogging syn queue with openreqs with exponentially increasing
1418 * timeout.
1419 */
1420 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1421 goto drop;
1422
1423	req = reqsk_alloc(&tcp_request_sock_ops);
1424	if (!req)
1425 goto drop;
1426
1427 tcp_clear_options(&tmp_opt);
1428 tmp_opt.mss_clamp = 536;
1429 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1430
1431 tcp_parse_options(skb, &tmp_opt, 0);
1432
1433 if (want_cookie) {
1434 tcp_clear_options(&tmp_opt);
1435 tmp_opt.saw_tstamp = 0;
1436 }
1437
1438 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1439	/* Some OSes (unknown ones, but I see them on web servers, which
1440	 * contain information interesting only for Windows
1441	 * users) do not send their stamp in the SYN. It is the easy case.
1442 * We simply do not advertise TS support.
1443 */
1444 tmp_opt.saw_tstamp = 0;
1445 tmp_opt.tstamp_ok = 0;
1446 }
1447 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1448
1449 tcp_openreq_init(req, &tmp_opt, skb);
1450
1451	ireq = inet_rsk(req);
1452	ireq->loc_addr = daddr;
1453	ireq->rmt_addr = saddr;
1454	ireq->opt = tcp_v4_save_options(sk, skb);
1455	if (!want_cookie)
1456 TCP_ECN_create_request(req, skb->h.th);
1457
1458 if (want_cookie) {
1459#ifdef CONFIG_SYN_COOKIES
1460 syn_flood_warning(skb);
1461#endif
1462 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1463 } else if (!isn) {
1464 struct inet_peer *peer = NULL;
1465
1466 /* VJ's idea. We save last timestamp seen
1467 * from the destination in peer table, when entering
1468 * state TIME-WAIT, and check against it before
1469 * accepting new connection request.
1470 *
1471 * If "isn" is not zero, this request hit alive
1472 * timewait bucket, so that all the necessary checks
1473 * are made in the function processing timewait state.
1474 */
1475 if (tmp_opt.saw_tstamp &&
1476 sysctl_tcp_tw_recycle &&
1477 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1478 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1479 peer->v4daddr == saddr) {
1480 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1481 (s32)(peer->tcp_ts - req->ts_recent) >
1482 TCP_PAWS_WINDOW) {
1483 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1484 dst_release(dst);
1485 goto drop_and_free;
1486 }
1487 }
1488 /* Kill the following clause, if you dislike this way. */
1489 else if (!sysctl_tcp_syncookies &&
1490 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1491 (sysctl_max_syn_backlog >> 2)) &&
1492 (!peer || !peer->tcp_ts_stamp) &&
1493 (!dst || !dst_metric(dst, RTAX_RTT))) {
1494 /* Without syncookies last quarter of
1495 * backlog is filled with destinations,
1496 * proven to be alive.
1497 * It means that we continue to communicate
1498 * to destinations, already remembered
1499 * to the moment of synflood.
1500 */
1501			LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1502 "request from %u.%u."
1503 "%u.%u/%u\n",
1504 NIPQUAD(saddr),
1505 ntohs(skb->h.th->source)));
1506			dst_release(dst);
1507 goto drop_and_free;
1508 }
1509
1510 isn = tcp_v4_init_sequence(sk, skb);
1511 }
1512	tcp_rsk(req)->snt_isn = isn;
1513
1514 if (tcp_v4_send_synack(sk, req, dst))
1515 goto drop_and_free;
1516
1517 if (want_cookie) {
1518		reqsk_free(req);
1519	} else {
1520 tcp_v4_synq_add(sk, req);
1521 }
1522 return 0;
1523
1524drop_and_free:
1525	reqsk_free(req);
1526drop:
1527 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1528 return 0;
1529}
1530
1531
1532/*
1533 * The three way handshake has completed - we got a valid synack -
1534 * now create the new socket.
1535 */
1536struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1537				  struct request_sock *req,
1538				  struct dst_entry *dst)
1539{
1540	struct inet_request_sock *ireq;
1541	struct inet_sock *newinet;
1542 struct tcp_sock *newtp;
1543 struct sock *newsk;
1544
1545 if (sk_acceptq_is_full(sk))
1546 goto exit_overflow;
1547
1548 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1549 goto exit;
1550
1551 newsk = tcp_create_openreq_child(sk, req, skb);
1552 if (!newsk)
1553 goto exit;
1554
1555	sk_setup_caps(newsk, dst);
1556
1557 newtp = tcp_sk(newsk);
1558 newinet = inet_sk(newsk);
1559	ireq = inet_rsk(req);
1560 newinet->daddr = ireq->rmt_addr;
1561 newinet->rcv_saddr = ireq->loc_addr;
1562 newinet->saddr = ireq->loc_addr;
1563 newinet->opt = ireq->opt;
1564 ireq->opt = NULL;
1565	newinet->mc_index = tcp_v4_iif(skb);
1566 newinet->mc_ttl = skb->nh.iph->ttl;
1567 newtp->ext_header_len = 0;
1568 if (newinet->opt)
1569 newtp->ext_header_len = newinet->opt->optlen;
1570 newinet->id = newtp->write_seq ^ jiffies;
1571
1572 tcp_sync_mss(newsk, dst_mtu(dst));
1573 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1574 tcp_initialize_rcv_mss(newsk);
1575
1576 __tcp_v4_hash(newsk, 0);
1577 __tcp_inherit_port(sk, newsk);
1578
1579 return newsk;
1580
1581exit_overflow:
1582 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1583exit:
1584 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1585 dst_release(dst);
1586 return NULL;
1587}
1588
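/* For a segment arriving on a listening socket: look for a matching open
 * request (SYN_RECV) first, then for an established or TIME-WAIT socket
 * created in the meantime, and finally give syncookies a chance at a
 * bare ACK.
 */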
1589static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1590{
1591 struct tcphdr *th = skb->h.th;
1592 struct iphdr *iph = skb->nh.iph;
1593 struct tcp_sock *tp = tcp_sk(sk);
1594 struct sock *nsk;
1595	struct request_sock **prev;
1596	/* Find possible connection requests. */
1597	struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1598						     iph->saddr, iph->daddr);
1599 if (req)
1600 return tcp_check_req(sk, skb, req, prev);
1601
1602 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1603 th->source,
1604 skb->nh.iph->daddr,
1605 ntohs(th->dest),
1606 tcp_v4_iif(skb));
1607
1608 if (nsk) {
1609 if (nsk->sk_state != TCP_TIME_WAIT) {
1610 bh_lock_sock(nsk);
1611 return nsk;
1612 }
1613 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1614 return NULL;
1615 }
1616
1617#ifdef CONFIG_SYN_COOKIES
1618 if (!th->rst && !th->syn && th->ack)
1619 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1620#endif
1621 return sk;
1622}
1623
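/* Validate or defer the TCP checksum: verify a hardware checksum when one
 * is present, fully verify short packets in software, and otherwise seed
 * skb->csum with the pseudo-header sum so the check can be completed
 * later during the copy.
 */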
1624static int tcp_v4_checksum_init(struct sk_buff *skb)
1625{
1626 if (skb->ip_summed == CHECKSUM_HW) {
1627 skb->ip_summed = CHECKSUM_UNNECESSARY;
1628 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1629 skb->nh.iph->daddr, skb->csum))
1630 return 0;
1631
1632		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1633		skb->ip_summed = CHECKSUM_NONE;
1634 }
1635 if (skb->len <= 76) {
1636 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1637 skb->nh.iph->daddr,
1638 skb_checksum(skb, 0, skb->len, 0)))
1639 return -1;
1640 skb->ip_summed = CHECKSUM_UNNECESSARY;
1641 } else {
1642 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1643 skb->nh.iph->saddr,
1644 skb->nh.iph->daddr, 0);
1645 }
1646 return 0;
1647}
1648
1649
1650/* The socket must have its spinlock held when we get
1651 * here.
1652 *
1653 * We have a potential double-lock case here, so even when
1654 * doing backlog processing we use the BH locking scheme.
1655 * This is because we cannot sleep with the original spinlock
1656 * held.
1657 */
1658int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1659{
1660 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1661 TCP_CHECK_TIMER(sk);
1662 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1663 goto reset;
1664 TCP_CHECK_TIMER(sk);
1665 return 0;
1666 }
1667
1668 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1669 goto csum_err;
1670
1671 if (sk->sk_state == TCP_LISTEN) {
1672 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1673 if (!nsk)
1674 goto discard;
1675
1676 if (nsk != sk) {
1677 if (tcp_child_process(sk, nsk, skb))
1678 goto reset;
1679 return 0;
1680 }
1681 }
1682
1683 TCP_CHECK_TIMER(sk);
1684 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1685 goto reset;
1686 TCP_CHECK_TIMER(sk);
1687 return 0;
1688
1689reset:
1690 tcp_v4_send_reset(skb);
1691discard:
1692 kfree_skb(skb);
1693 /* Be careful here. If this function gets more complicated and
1694 * gcc suffers from register pressure on the x86, sk (in %ebx)
1695 * might be destroyed here. This current version compiles correctly,
1696 * but you have been warned.
1697 */
1698 return 0;
1699
1700csum_err:
1701 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1702 goto discard;
1703}
1704
1705/*
1706 * From tcp_input.c
1707 */
1708
1709int tcp_v4_rcv(struct sk_buff *skb)
1710{
1711 struct tcphdr *th;
1712 struct sock *sk;
1713 int ret;
1714
1715 if (skb->pkt_type != PACKET_HOST)
1716 goto discard_it;
1717
1718 /* Count it even if it's bad */
1719 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1720
1721 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1722 goto discard_it;
1723
1724 th = skb->h.th;
1725
1726 if (th->doff < sizeof(struct tcphdr) / 4)
1727 goto bad_packet;
1728 if (!pskb_may_pull(skb, th->doff * 4))
1729 goto discard_it;
1730
1731 /* An explanation is required here, I think.
1732 * Packet length and doff are validated by header prediction,
1733	 * provided the case of th->doff==0 is eliminated.
1734 * So, we defer the checks. */
1735 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1736 tcp_v4_checksum_init(skb) < 0))
1737 goto bad_packet;
1738
1739 th = skb->h.th;
1740 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1741 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1742 skb->len - th->doff * 4);
1743 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1744 TCP_SKB_CB(skb)->when = 0;
1745 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1746 TCP_SKB_CB(skb)->sacked = 0;
1747
1748 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1749 skb->nh.iph->daddr, ntohs(th->dest),
1750 tcp_v4_iif(skb));
1751
1752 if (!sk)
1753 goto no_tcp_socket;
1754
1755process:
1756 if (sk->sk_state == TCP_TIME_WAIT)
1757 goto do_time_wait;
1758
1759 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1760 goto discard_and_relse;
1761
1762 if (sk_filter(sk, skb, 0))
1763 goto discard_and_relse;
1764
1765 skb->dev = NULL;
1766
1767 bh_lock_sock(sk);
1768 ret = 0;
1769 if (!sock_owned_by_user(sk)) {
1770 if (!tcp_prequeue(sk, skb))
1771 ret = tcp_v4_do_rcv(sk, skb);
1772 } else
1773 sk_add_backlog(sk, skb);
1774 bh_unlock_sock(sk);
1775
1776 sock_put(sk);
1777
1778 return ret;
1779
1780no_tcp_socket:
1781 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1782 goto discard_it;
1783
1784 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1785bad_packet:
1786 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1787 } else {
1788 tcp_v4_send_reset(skb);
1789 }
1790
1791discard_it:
1792 /* Discard frame. */
1793 kfree_skb(skb);
1794 return 0;
1795
1796discard_and_relse:
1797 sock_put(sk);
1798 goto discard_it;
1799
1800do_time_wait:
1801 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1802 tcp_tw_put((struct tcp_tw_bucket *) sk);
1803 goto discard_it;
1804 }
1805
1806 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1807 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1808 tcp_tw_put((struct tcp_tw_bucket *) sk);
1809 goto discard_it;
1810 }
1811 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1812 skb, th, skb->len)) {
1813 case TCP_TW_SYN: {
1814 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1815 ntohs(th->dest),
1816 tcp_v4_iif(skb));
1817 if (sk2) {
1818 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1819 tcp_tw_put((struct tcp_tw_bucket *)sk);
1820 sk = sk2;
1821 goto process;
1822 }
1823 /* Fall through to ACK */
1824 }
1825 case TCP_TW_ACK:
1826 tcp_v4_timewait_ack(sk, skb);
1827 break;
1828 case TCP_TW_RST:
1829 goto no_tcp_socket;
1830 case TCP_TW_SUCCESS:;
1831 }
1832 goto discard_it;
1833}
1834
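/* Re-query the route and, if the preferred source address has changed,
 * rewrite the socket's source address and rehash it; the caller only
 * allows this while in SYN-SENT and with ip_dynaddr enabled.
 */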
1835static int tcp_v4_reselect_saddr(struct sock *sk)
1836{
1837 struct inet_sock *inet = inet_sk(sk);
1838 int err;
1839 struct rtable *rt;
1840 __u32 old_saddr = inet->saddr;
1841 __u32 new_saddr;
1842 __u32 daddr = inet->daddr;
1843
1844 if (inet->opt && inet->opt->srr)
1845 daddr = inet->opt->faddr;
1846
1847 /* Query new route. */
1848 err = ip_route_connect(&rt, daddr, 0,
1849 RT_CONN_FLAGS(sk),
1850 sk->sk_bound_dev_if,
1851 IPPROTO_TCP,
1852 inet->sport, inet->dport, sk);
1853 if (err)
1854 return err;
1855
1856	sk_setup_caps(sk, &rt->u.dst);
1857
1858 new_saddr = rt->rt_src;
1859
1860 if (new_saddr == old_saddr)
1861 return 0;
1862
1863 if (sysctl_ip_dynaddr > 1) {
1864 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1865 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1866 NIPQUAD(old_saddr),
1867 NIPQUAD(new_saddr));
1868 }
1869
1870 inet->saddr = new_saddr;
1871 inet->rcv_saddr = new_saddr;
1872
1873	/* XXX The only ugly spot where we really need to
1874	 * XXX change the socket's identity after it has
1875	 * XXX entered the hashes. -DaveM
1876	 *
1877	 * Besides that, it does not check for connection
1878	 * uniqueness. Expect trouble.
1879	 */
1880	__sk_prot_rehash(sk);
1881	return 0;
1882}
1883
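/* Revalidate the socket's cached route.  If it has been invalidated,
 * build a new one; on failure, clear the route capabilities and, when
 * ip_dynaddr allows it for an unbound SYN_SENT socket, fall back to
 * re-selecting the source address.
 */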
1884int tcp_v4_rebuild_header(struct sock *sk)
1885{
1886 struct inet_sock *inet = inet_sk(sk);
1887 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1888 u32 daddr;
1889 int err;
1890
1891 /* Route is OK, nothing to do. */
1892 if (rt)
1893 return 0;
1894
1895 /* Reroute. */
1896 daddr = inet->daddr;
1897 if (inet->opt && inet->opt->srr)
1898 daddr = inet->opt->faddr;
1899
1900 {
1901 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1902 .nl_u = { .ip4_u =
1903 { .daddr = daddr,
1904 .saddr = inet->saddr,
1905 .tos = RT_CONN_FLAGS(sk) } },
1906 .proto = IPPROTO_TCP,
1907 .uli_u = { .ports =
1908 { .sport = inet->sport,
1909 .dport = inet->dport } } };
1910
1911 err = ip_route_output_flow(&rt, &fl, sk, 0);
1912 }
1913 if (!err) {
1914		sk_setup_caps(sk, &rt->u.dst);
1915		return 0;
1916 }
1917
1918 /* Routing failed... */
1919 sk->sk_route_caps = 0;
1920
1921 if (!sysctl_ip_dynaddr ||
1922 sk->sk_state != TCP_SYN_SENT ||
1923 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1924 (err = tcp_v4_reselect_saddr(sk)) != 0)
1925 sk->sk_err_soft = -err;
1926
1927 return err;
1928}
1929
1930static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1931{
1932 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1933 struct inet_sock *inet = inet_sk(sk);
1934
1935 sin->sin_family = AF_INET;
1936 sin->sin_addr.s_addr = inet->daddr;
1937 sin->sin_port = inet->dport;
1938}
1939
1940/* VJ's idea. Save the last timestamp seen from this destination
1941 * and hold it for at least the normal timewait interval, so it can be used
1942 * for duplicate segment detection in subsequent connections before they
1943 * enter the synchronized state.
1944 */
1945
1946int tcp_v4_remember_stamp(struct sock *sk)
1947{
1948 struct inet_sock *inet = inet_sk(sk);
1949 struct tcp_sock *tp = tcp_sk(sk);
1950 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1951 struct inet_peer *peer = NULL;
1952 int release_it = 0;
1953
1954 if (!rt || rt->rt_dst != inet->daddr) {
1955 peer = inet_getpeer(inet->daddr, 1);
1956 release_it = 1;
1957 } else {
1958 if (!rt->peer)
1959 rt_bind_peer(rt, 1);
1960 peer = rt->peer;
1961 }
1962
1963 if (peer) {
1964 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1965 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1966 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1967 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1968 peer->tcp_ts = tp->rx_opt.ts_recent;
1969 }
1970 if (release_it)
1971 inet_putpeer(peer);
1972 return 1;
1973 }
1974
1975 return 0;
1976}
1977
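/* Same idea as tcp_v4_remember_stamp(), but for a TIME_WAIT bucket:
 * push the bucket's most recent timestamp into the inet_peer cache.
 */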
1978int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1979{
1980 struct inet_peer *peer = NULL;
1981
1982 peer = inet_getpeer(tw->tw_daddr, 1);
1983
1984 if (peer) {
1985 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1986 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1987 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1988 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1989 peer->tcp_ts = tw->tw_ts_recent;
1990 }
1991 inet_putpeer(peer);
1992 return 1;
1993 }
1994
1995 return 0;
1996}
1997
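/* Address-family specific operations used by the protocol-independent
 * TCP code when the socket is IPv4.
 */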
1998struct tcp_func ipv4_specific = {
1999 .queue_xmit = ip_queue_xmit,
2000 .send_check = tcp_v4_send_check,
2001 .rebuild_header = tcp_v4_rebuild_header,
2002 .conn_request = tcp_v4_conn_request,
2003 .syn_recv_sock = tcp_v4_syn_recv_sock,
2004 .remember_stamp = tcp_v4_remember_stamp,
2005 .net_header_len = sizeof(struct iphdr),
2006 .setsockopt = ip_setsockopt,
2007 .getsockopt = ip_getsockopt,
2008 .addr2sockaddr = v4_addr2sockaddr,
2009 .sockaddr_len = sizeof(struct sockaddr_in),
2010};
2011
2012/* NOTE: A lot of things are set to zero explicitly by the call to
2013 * sk_alloc(), so they need not be done here.
2014 */
2015static int tcp_v4_init_sock(struct sock *sk)
2016{
2017 struct tcp_sock *tp = tcp_sk(sk);
2018
2019 skb_queue_head_init(&tp->out_of_order_queue);
2020 tcp_init_xmit_timers(sk);
2021 tcp_prequeue_init(tp);
2022
2023 tp->rto = TCP_TIMEOUT_INIT;
2024 tp->mdev = TCP_TIMEOUT_INIT;
2025
2026 /* So many TCP implementations out there (incorrectly) count the
2027 * initial SYN frame in their delayed-ACK and congestion control
2028 * algorithms that we must have the following bandaid to talk
2029 * efficiently to them. -DaveM
2030 */
2031 tp->snd_cwnd = 2;
2032
2033 /* See draft-stevens-tcpca-spec-01 for discussion of the
2034 * initialization of these values.
2035 */
2036 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2037 tp->snd_cwnd_clamp = ~0;
2038	tp->mss_cache = 536;
2039
2040 tp->reordering = sysctl_tcp_reordering;
2041	tp->ca_ops = &tcp_init_congestion_ops;
2042
2043 sk->sk_state = TCP_CLOSE;
2044
2045 sk->sk_write_space = sk_stream_write_space;
2046 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2047
2048 tp->af_specific = &ipv4_specific;
2049
2050 sk->sk_sndbuf = sysctl_tcp_wmem[1];
2051 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2052
2053 atomic_inc(&tcp_sockets_allocated);
2054
2055 return 0;
2056}
2057
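/* Per-socket teardown: stop the TCP timers, release congestion control
 * state, purge the write, out-of-order and prequeue queues, drop the
 * bind bucket reference and free the cached sendmsg page.
 */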
2058int tcp_v4_destroy_sock(struct sock *sk)
2059{
2060 struct tcp_sock *tp = tcp_sk(sk);
2061
2062 tcp_clear_xmit_timers(sk);
2063
2064	tcp_cleanup_congestion_control(tp);
2065
2066	/* Clean up the write buffer. */
2067 sk_stream_writequeue_purge(sk);
2068
2069 /* Cleans up our, hopefully empty, out_of_order_queue. */
2070 __skb_queue_purge(&tp->out_of_order_queue);
2071
2072 /* Clean prequeue, it must be empty really */
2073	/* Clean up the prequeue; it really should be empty. */
2074
2075 /* Clean up a referenced TCP bind bucket. */
2076 if (tp->bind_hash)
2077 tcp_put_port(sk);
2078
2079 /*
2080 * If sendmsg cached page exists, toss it.
2081 */
2082 if (sk->sk_sndmsg_page) {
2083 __free_page(sk->sk_sndmsg_page);
2084 sk->sk_sndmsg_page = NULL;
2085 }
2086
2087 atomic_dec(&tcp_sockets_allocated);
2088
2089 return 0;
2090}
2091
2092EXPORT_SYMBOL(tcp_v4_destroy_sock);
2093
2094#ifdef CONFIG_PROC_FS
2095/* Proc filesystem TCP sock list dumping. */
2096
2097static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2098{
2099 return hlist_empty(head) ? NULL :
2100 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2101}
2102
2103static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2104{
2105 return tw->tw_node.next ?
2106 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2107}
2108
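/* Advance the /proc iterator over the listening hash.  For each
 * listening socket of the requested family we also walk its SYN table
 * (pending open requests) under syn_wait_lock before moving on to the
 * next socket or bucket.
 */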
2109static void *listening_get_next(struct seq_file *seq, void *cur)
2110{
2111 struct tcp_sock *tp;
2112 struct hlist_node *node;
2113 struct sock *sk = cur;
2114 struct tcp_iter_state* st = seq->private;
2115
2116 if (!sk) {
2117 st->bucket = 0;
2118 sk = sk_head(&tcp_listening_hash[0]);
2119 goto get_sk;
2120 }
2121
2122 ++st->num;
2123
2124 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2125		struct request_sock *req = cur;
2126
2127 tp = tcp_sk(st->syn_wait_sk);
2128 req = req->dl_next;
2129 while (1) {
2130 while (req) {
2131				if (req->rsk_ops->family == st->family) {
2132					cur = req;
2133 goto out;
2134 }
2135 req = req->dl_next;
2136 }
2137 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2138 break;
2139get_req:
2140			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2141		}
2142 sk = sk_next(st->syn_wait_sk);
2143 st->state = TCP_SEQ_STATE_LISTENING;
2144		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2145	} else {
2146 tp = tcp_sk(sk);
2147		read_lock_bh(&tp->accept_queue.syn_wait_lock);
2148 if (reqsk_queue_len(&tp->accept_queue))
2149			goto start_req;
2150		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2151		sk = sk_next(sk);
2152 }
2153get_sk:
2154 sk_for_each_from(sk, node) {
2155 if (sk->sk_family == st->family) {
2156 cur = sk;
2157 goto out;
2158 }
2159 tp = tcp_sk(sk);
2160		read_lock_bh(&tp->accept_queue.syn_wait_lock);
2161 if (reqsk_queue_len(&tp->accept_queue)) {
2162start_req:
2163 st->uid = sock_i_uid(sk);
2164 st->syn_wait_sk = sk;
2165 st->state = TCP_SEQ_STATE_OPENREQ;
2166 st->sbucket = 0;
2167 goto get_req;
2168 }
2169		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2170	}
2171 if (++st->bucket < TCP_LHTABLE_SIZE) {
2172 sk = sk_head(&tcp_listening_hash[st->bucket]);
2173 goto get_sk;
2174 }
2175 cur = NULL;
2176out:
2177 return cur;
2178}
2179
2180static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2181{
2182 void *rc = listening_get_next(seq, NULL);
2183
2184 while (rc && *pos) {
2185 rc = listening_get_next(seq, rc);
2186 --*pos;
2187 }
2188 return rc;
2189}
2190
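/* Find the first established or TIME_WAIT entry of the requested
 * family.  On success the bucket's read lock is left held; it is
 * released by established_get_next() or tcp_seq_stop().
 */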
2191static void *established_get_first(struct seq_file *seq)
2192{
2193 struct tcp_iter_state* st = seq->private;
2194 void *rc = NULL;
2195
2196 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2197 struct sock *sk;
2198 struct hlist_node *node;
2199 struct tcp_tw_bucket *tw;
2200
2201 /* We can reschedule _before_ having picked the target: */
2202 cond_resched_softirq();
2203
2204 read_lock(&tcp_ehash[st->bucket].lock);
2205 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2206 if (sk->sk_family != st->family) {
2207 continue;
2208 }
2209 rc = sk;
2210 goto out;
2211 }
2212 st->state = TCP_SEQ_STATE_TIME_WAIT;
2213 tw_for_each(tw, node,
2214 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2215 if (tw->tw_family != st->family) {
2216 continue;
2217 }
2218 rc = tw;
2219 goto out;
2220 }
2221 read_unlock(&tcp_ehash[st->bucket].lock);
2222 st->state = TCP_SEQ_STATE_ESTABLISHED;
2223 }
2224out:
2225 return rc;
2226}
2227
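/* Step to the next entry: continue along the current established
 * chain, fall through to the bucket's TIME_WAIT chain, and finally
 * move to the next hash bucket, dropping and retaking the bucket
 * locks as we go.
 */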
2228static void *established_get_next(struct seq_file *seq, void *cur)
2229{
2230 struct sock *sk = cur;
2231 struct tcp_tw_bucket *tw;
2232 struct hlist_node *node;
2233 struct tcp_iter_state* st = seq->private;
2234
2235 ++st->num;
2236
2237 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2238 tw = cur;
2239 tw = tw_next(tw);
2240get_tw:
2241 while (tw && tw->tw_family != st->family) {
2242 tw = tw_next(tw);
2243 }
2244 if (tw) {
2245 cur = tw;
2246 goto out;
2247 }
2248 read_unlock(&tcp_ehash[st->bucket].lock);
2249 st->state = TCP_SEQ_STATE_ESTABLISHED;
2250
2251 /* We can reschedule between buckets: */
2252 cond_resched_softirq();
2253
2254 if (++st->bucket < tcp_ehash_size) {
2255 read_lock(&tcp_ehash[st->bucket].lock);
2256 sk = sk_head(&tcp_ehash[st->bucket].chain);
2257 } else {
2258 cur = NULL;
2259 goto out;
2260 }
2261 } else
2262 sk = sk_next(sk);
2263
2264 sk_for_each_from(sk, node) {
2265 if (sk->sk_family == st->family)
2266 goto found;
2267 }
2268
2269 st->state = TCP_SEQ_STATE_TIME_WAIT;
2270 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2271 goto get_tw;
2272found:
2273 cur = sk;
2274out:
2275 return cur;
2276}
2277
2278static void *established_get_idx(struct seq_file *seq, loff_t pos)
2279{
2280 void *rc = established_get_first(seq);
2281
2282 while (rc && pos) {
2283 rc = established_get_next(seq, rc);
2284 --pos;
2285 }
2286 return rc;
2287}
2288
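/* Position the iterator at entry 'pos': walk the listening table
 * first and, if pos runs past it, continue into the established
 * table with bottom halves disabled.
 */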
2289static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2290{
2291 void *rc;
2292 struct tcp_iter_state* st = seq->private;
2293
2294 tcp_listen_lock();
2295 st->state = TCP_SEQ_STATE_LISTENING;
2296 rc = listening_get_idx(seq, &pos);
2297
2298 if (!rc) {
2299 tcp_listen_unlock();
2300 local_bh_disable();
2301 st->state = TCP_SEQ_STATE_ESTABLISHED;
2302 rc = established_get_idx(seq, pos);
2303 }
2304
2305 return rc;
2306}
2307
2308static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2309{
2310 struct tcp_iter_state* st = seq->private;
2311 st->state = TCP_SEQ_STATE_LISTENING;
2312 st->num = 0;
2313 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2314}
2315
2316static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2317{
2318 void *rc = NULL;
2319 struct tcp_iter_state* st;
2320
2321 if (v == SEQ_START_TOKEN) {
2322 rc = tcp_get_idx(seq, 0);
2323 goto out;
2324 }
2325 st = seq->private;
2326
2327 switch (st->state) {
2328 case TCP_SEQ_STATE_OPENREQ:
2329 case TCP_SEQ_STATE_LISTENING:
2330 rc = listening_get_next(seq, v);
2331 if (!rc) {
2332 tcp_listen_unlock();
2333 local_bh_disable();
2334 st->state = TCP_SEQ_STATE_ESTABLISHED;
2335 rc = established_get_first(seq);
2336 }
2337 break;
2338 case TCP_SEQ_STATE_ESTABLISHED:
2339 case TCP_SEQ_STATE_TIME_WAIT:
2340 rc = established_get_next(seq, v);
2341 break;
2342 }
2343out:
2344 ++*pos;
2345 return rc;
2346}
2347
2348static void tcp_seq_stop(struct seq_file *seq, void *v)
2349{
2350 struct tcp_iter_state* st = seq->private;
2351
2352 switch (st->state) {
2353 case TCP_SEQ_STATE_OPENREQ:
2354 if (v) {
2355 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2356			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2357		}
2358 case TCP_SEQ_STATE_LISTENING:
2359 if (v != SEQ_START_TOKEN)
2360 tcp_listen_unlock();
2361 break;
2362 case TCP_SEQ_STATE_TIME_WAIT:
2363 case TCP_SEQ_STATE_ESTABLISHED:
2364 if (v)
2365 read_unlock(&tcp_ehash[st->bucket].lock);
2366 local_bh_enable();
2367 break;
2368 }
2369}
2370
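/* seq_file ->open(): allocate the per-walk iterator state and wire up
 * the start/next/show/stop operations for this address family.
 */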
2371static int tcp_seq_open(struct inode *inode, struct file *file)
2372{
2373 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2374 struct seq_file *seq;
2375 struct tcp_iter_state *s;
2376 int rc;
2377
2378 if (unlikely(afinfo == NULL))
2379 return -EINVAL;
2380
2381 s = kmalloc(sizeof(*s), GFP_KERNEL);
2382 if (!s)
2383 return -ENOMEM;
2384 memset(s, 0, sizeof(*s));
2385 s->family = afinfo->family;
2386 s->seq_ops.start = tcp_seq_start;
2387 s->seq_ops.next = tcp_seq_next;
2388 s->seq_ops.show = afinfo->seq_show;
2389 s->seq_ops.stop = tcp_seq_stop;
2390
2391 rc = seq_open(file, &s->seq_ops);
2392 if (rc)
2393 goto out_kfree;
2394 seq = file->private_data;
2395 seq->private = s;
2396out:
2397 return rc;
2398out_kfree:
2399 kfree(s);
2400 goto out;
2401}
2402
2403int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2404{
2405 int rc = 0;
2406 struct proc_dir_entry *p;
2407
2408 if (!afinfo)
2409 return -EINVAL;
2410 afinfo->seq_fops->owner = afinfo->owner;
2411 afinfo->seq_fops->open = tcp_seq_open;
2412 afinfo->seq_fops->read = seq_read;
2413 afinfo->seq_fops->llseek = seq_lseek;
2414 afinfo->seq_fops->release = seq_release_private;
2415
2416 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2417 if (p)
2418 p->data = afinfo;
2419 else
2420 rc = -ENOMEM;
2421 return rc;
2422}
2423
2424void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2425{
2426 if (!afinfo)
2427 return;
2428 proc_net_remove(afinfo->name);
2429 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2430}
2431
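/* Format one pending connection request (SYN_RECV) as a line of
 * /proc/net/tcp output.
 */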
2432static void get_openreq4(struct sock *sk, struct request_sock *req,
2433			 char *tmpbuf, int i, int uid)
2434{
2435	const struct inet_request_sock *ireq = inet_rsk(req);
2436	int ttd = req->expires - jiffies;
2437
2438 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2439 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2440 i,
2441		ireq->loc_addr,
2442		ntohs(inet_sk(sk)->sport),
2443		ireq->rmt_addr,
2444 ntohs(ireq->rmt_port),
2445		TCP_SYN_RECV,
2446 0, 0, /* could print option size, but that is af dependent. */
2447 1, /* timers active (only the expire timer) */
2448 jiffies_to_clock_t(ttd),
2449 req->retrans,
2450 uid,
2451 0, /* non standard timer */
2452 0, /* open_requests have no inode */
2453 atomic_read(&sk->sk_refcnt),
2454 req);
2455}
2456
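/* Format one established or listening socket as a line of
 * /proc/net/tcp output, including which timer (if any) is pending.
 */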
2457static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2458{
2459 int timer_active;
2460 unsigned long timer_expires;
2461 struct tcp_sock *tp = tcp_sk(sp);
2462 struct inet_sock *inet = inet_sk(sp);
2463 unsigned int dest = inet->daddr;
2464 unsigned int src = inet->rcv_saddr;
2465 __u16 destp = ntohs(inet->dport);
2466 __u16 srcp = ntohs(inet->sport);
2467
2468 if (tp->pending == TCP_TIME_RETRANS) {
2469 timer_active = 1;
2470 timer_expires = tp->timeout;
2471 } else if (tp->pending == TCP_TIME_PROBE0) {
2472 timer_active = 4;
2473 timer_expires = tp->timeout;
2474 } else if (timer_pending(&sp->sk_timer)) {
2475 timer_active = 2;
2476 timer_expires = sp->sk_timer.expires;
2477 } else {
2478 timer_active = 0;
2479 timer_expires = jiffies;
2480 }
2481
2482 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2483 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2484 i, src, srcp, dest, destp, sp->sk_state,
2485 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2486 timer_active,
2487 jiffies_to_clock_t(timer_expires - jiffies),
2488 tp->retransmits,
2489 sock_i_uid(sp),
2490 tp->probes_out,
2491 sock_i_ino(sp),
2492 atomic_read(&sp->sk_refcnt), sp,
2493 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2494 tp->snd_cwnd,
2495 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2496}
2497
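/* Format one TIME_WAIT bucket as a line of /proc/net/tcp output. */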
2498static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2499{
2500 unsigned int dest, src;
2501 __u16 destp, srcp;
2502 int ttd = tw->tw_ttd - jiffies;
2503
2504 if (ttd < 0)
2505 ttd = 0;
2506
2507 dest = tw->tw_daddr;
2508 src = tw->tw_rcv_saddr;
2509 destp = ntohs(tw->tw_dport);
2510 srcp = ntohs(tw->tw_sport);
2511
2512 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2513 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2514 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2515 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2516 atomic_read(&tw->tw_refcnt), tw);
2517}
2518
2519#define TMPSZ 150
2520
2521static int tcp4_seq_show(struct seq_file *seq, void *v)
2522{
2523 struct tcp_iter_state* st;
2524 char tmpbuf[TMPSZ + 1];
2525
2526 if (v == SEQ_START_TOKEN) {
2527 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2528 " sl local_address rem_address st tx_queue "
2529 "rx_queue tr tm->when retrnsmt uid timeout "
2530 "inode");
2531 goto out;
2532 }
2533 st = seq->private;
2534
2535 switch (st->state) {
2536 case TCP_SEQ_STATE_LISTENING:
2537 case TCP_SEQ_STATE_ESTABLISHED:
2538 get_tcp4_sock(v, tmpbuf, st->num);
2539 break;
2540 case TCP_SEQ_STATE_OPENREQ:
2541 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2542 break;
2543 case TCP_SEQ_STATE_TIME_WAIT:
2544 get_timewait4_sock(v, tmpbuf, st->num);
2545 break;
2546 }
2547 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2548out:
2549 return 0;
2550}
2551
2552static struct file_operations tcp4_seq_fops;
2553static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2554 .owner = THIS_MODULE,
2555 .name = "tcp",
2556 .family = AF_INET,
2557 .seq_show = tcp4_seq_show,
2558 .seq_fops = &tcp4_seq_fops,
2559};
2560
2561int __init tcp4_proc_init(void)
2562{
2563 return tcp_proc_register(&tcp4_seq_afinfo);
2564}
2565
2566void tcp4_proc_exit(void)
2567{
2568 tcp_proc_unregister(&tcp4_seq_afinfo);
2569}
2570#endif /* CONFIG_PROC_FS */
2571
2572struct proto tcp_prot = {
2573 .name = "TCP",
2574 .owner = THIS_MODULE,
2575 .close = tcp_close,
2576 .connect = tcp_v4_connect,
2577 .disconnect = tcp_disconnect,
2578 .accept = tcp_accept,
2579 .ioctl = tcp_ioctl,
2580 .init = tcp_v4_init_sock,
2581 .destroy = tcp_v4_destroy_sock,
2582 .shutdown = tcp_shutdown,
2583 .setsockopt = tcp_setsockopt,
2584 .getsockopt = tcp_getsockopt,
2585 .sendmsg = tcp_sendmsg,
2586 .recvmsg = tcp_recvmsg,
2587 .backlog_rcv = tcp_v4_do_rcv,
2588 .hash = tcp_v4_hash,
2589 .unhash = tcp_unhash,
2590 .get_port = tcp_v4_get_port,
2591 .enter_memory_pressure = tcp_enter_memory_pressure,
2592 .sockets_allocated = &tcp_sockets_allocated,
2593 .memory_allocated = &tcp_memory_allocated,
2594 .memory_pressure = &tcp_memory_pressure,
2595 .sysctl_mem = sysctl_tcp_mem,
2596 .sysctl_wmem = sysctl_tcp_wmem,
2597 .sysctl_rmem = sysctl_tcp_rmem,
2598 .max_header = MAX_TCP_HEADER,
2599 .obj_size = sizeof(struct tcp_sock),
2600	.rsk_prot		= &tcp_request_sock_ops,
2601};
2602
2603
2604
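/* Boot-time initialisation: create the kernel-internal TCP control
 * socket and take it out of the hashes so it never matches incoming
 * packets.
 */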
2605void __init tcp_v4_init(struct net_proto_family *ops)
2606{
2607 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2608 if (err < 0)
2609 panic("Failed to create the TCP control socket.\n");
2610 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2611 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2612
2613	/* Unhash it so that IP input processing does not even
2614	 * see it; we do not want this socket to receive any
2615	 * incoming packets.
2616	 */
2617 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2618}
2619
2620EXPORT_SYMBOL(ipv4_specific);
2621EXPORT_SYMBOL(tcp_bind_hash);
2622EXPORT_SYMBOL(tcp_bucket_create);
2623EXPORT_SYMBOL(tcp_hashinfo);
2624EXPORT_SYMBOL(tcp_inherit_port);
2625EXPORT_SYMBOL(tcp_listen_wlock);
2626EXPORT_SYMBOL(tcp_port_rover);
2627EXPORT_SYMBOL(tcp_prot);
2628EXPORT_SYMBOL(tcp_put_port);
2629EXPORT_SYMBOL(tcp_unhash);
2630EXPORT_SYMBOL(tcp_v4_conn_request);
2631EXPORT_SYMBOL(tcp_v4_connect);
2632EXPORT_SYMBOL(tcp_v4_do_rcv);
2633EXPORT_SYMBOL(tcp_v4_rebuild_header);
2634EXPORT_SYMBOL(tcp_v4_remember_stamp);
2635EXPORT_SYMBOL(tcp_v4_send_check);
2636EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2637
2638#ifdef CONFIG_PROC_FS
2639EXPORT_SYMBOL(tcp_proc_register);
2640EXPORT_SYMBOL(tcp_proc_unregister);
2641#endif
2642EXPORT_SYMBOL(sysctl_local_port_range);
2643EXPORT_SYMBOL(sysctl_tcp_low_latency);
2644EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2645