1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 *					request_sock handling and moved
40 *					most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/tcp.h>
68#include <net/ipv6.h>
69#include <net/inet_common.h>
70#include <net/xfrm.h>
71
72#include <linux/inet.h>
73#include <linux/ipv6.h>
74#include <linux/stddef.h>
75#include <linux/proc_fs.h>
76#include <linux/seq_file.h>
77
78extern int sysctl_ip_dynaddr;
79int sysctl_tcp_tw_reuse;
80int sysctl_tcp_low_latency;
81
82/* Check TCP sequence numbers in ICMP packets. */
83#define ICMP_MIN_LENGTH 8
84
85/* Socket used for sending RSTs */
86static struct socket *tcp_socket;
87
88void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89 struct sk_buff *skb);
90
91struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
93 .__tcp_lhash_users = ATOMIC_INIT(0),
94 .__tcp_lhash_wait
95 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
97};
98
99/*
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
103 */
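/* For example, the range can be widened at run time through the usual
 * sysctl interface (a sketch of the userspace side, assuming the standard
 * procfs path for this tunable):
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 */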
104int sysctl_local_port_range[2] = { 1024, 4999 };
105int tcp_port_rover = 1024 - 1;
106
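/* Fold the connection 4-tuple down to an index into the established hash
 * table; tcp_ehash_size is a power of two, so the final mask picks the
 * bucket.
 */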
107static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108 __u32 faddr, __u16 fport)
109{
110 int h = (laddr ^ lport) ^ (faddr ^ fport);
111 h ^= h >> 16;
112 h ^= h >> 8;
113 return h & (tcp_ehash_size - 1);
114}
115
116static __inline__ int tcp_sk_hashfn(struct sock *sk)
117{
118 struct inet_sock *inet = inet_sk(sk);
119 __u32 laddr = inet->rcv_saddr;
120 __u16 lport = inet->num;
121 __u32 faddr = inet->daddr;
122 __u16 fport = inet->dport;
123
124 return tcp_hashfn(laddr, lport, faddr, fport);
125}
126
127/* Allocate and initialize a new TCP local port bind bucket.
128 * The bindhash mutex for snum's hash chain must be held here.
129 */
130struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131 unsigned short snum)
132{
133 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134 SLAB_ATOMIC);
135 if (tb) {
136 tb->port = snum;
137 tb->fastreuse = 0;
138 INIT_HLIST_HEAD(&tb->owners);
139 hlist_add_head(&tb->node, &head->chain);
140 }
141 return tb;
142}
143
144/* Caller must hold hashbucket lock for this tb with local BH disabled */
145void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
146{
147 if (hlist_empty(&tb->owners)) {
148 __hlist_del(&tb->node);
149 kmem_cache_free(tcp_bucket_cachep, tb);
150 }
151}
152
153/* Caller must disable local BH processing. */
154static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
155{
156 struct tcp_bind_hashbucket *head =
157 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158 struct tcp_bind_bucket *tb;
159
160 spin_lock(&head->lock);
161 tb = tcp_sk(sk)->bind_hash;
162 sk_add_bind_node(child, &tb->owners);
163 tcp_sk(child)->bind_hash = tb;
164 spin_unlock(&head->lock);
165}
166
167inline void tcp_inherit_port(struct sock *sk, struct sock *child)
168{
169 local_bh_disable();
170 __tcp_inherit_port(sk, child);
171 local_bh_enable();
172}
173
174void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175 unsigned short snum)
176{
177 inet_sk(sk)->num = snum;
178 sk_add_bind_node(sk, &tb->owners);
179 tcp_sk(sk)->bind_hash = tb;
180}
181
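/* A bind conflict exists when another socket already bound to this bucket
 * shares the device (or either socket is unbound), and either one of the
 * two does not allow address reuse (or the other socket is listening)
 * while their receive addresses overlap (equal, or one is a wildcard).
 */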
182static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183{
184 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185 struct sock *sk2;
186 struct hlist_node *node;
187 int reuse = sk->sk_reuse;
188
189 sk_for_each_bound(sk2, node, &tb->owners) {
190 if (sk != sk2 &&
191 !tcp_v6_ipv6only(sk2) &&
192 (!sk->sk_bound_dev_if ||
193 !sk2->sk_bound_dev_if ||
194 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195 if (!reuse || !sk2->sk_reuse ||
196 sk2->sk_state == TCP_LISTEN) {
197 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199 sk2_rcv_saddr == sk_rcv_saddr)
200 break;
201 }
202 }
203 }
204 return node != NULL;
205}
206
207/* Obtain a reference to a local port for the given sock,
208 * if snum is zero it means select any available local port.
209 */
210static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
211{
212 struct tcp_bind_hashbucket *head;
213 struct hlist_node *node;
214 struct tcp_bind_bucket *tb;
215 int ret;
216
217 local_bh_disable();
218 if (!snum) {
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
222 int rover;
223
224 spin_lock(&tcp_portalloc_lock);
225		if (tcp_port_rover < low)
226 rover = low;
227 else
228 rover = tcp_port_rover;
229		do {
230 rover++;
231			if (rover > high)
232				rover = low;
233 head = &tcp_bhash[tcp_bhashfn(rover)];
234 spin_lock(&head->lock);
235 tb_for_each(tb, node, &head->chain)
236 if (tb->port == rover)
237 goto next;
238 break;
239 next:
240 spin_unlock(&head->lock);
241 } while (--remaining > 0);
242 tcp_port_rover = rover;
243 spin_unlock(&tcp_portalloc_lock);
244
245		/* Exhausted local port range during search? It is not
246 * possible for us to be holding one of the bind hash
247 * locks if this test triggers, because if 'remaining'
248 * drops to zero, we broke out of the do/while loop at
249 * the top level, not from the 'break;' statement.
250 */
251		ret = 1;
252		if (unlikely(remaining <= 0))
253			goto fail;
254
255 /* OK, here is the one we will use. HEAD is
256		 * non-NULL and we hold its mutex.
257 */
258 snum = rover;
259 } else {
260 head = &tcp_bhash[tcp_bhashfn(snum)];
261 spin_lock(&head->lock);
262 tb_for_each(tb, node, &head->chain)
263 if (tb->port == snum)
264 goto tb_found;
265 }
266 tb = NULL;
267 goto tb_not_found;
268tb_found:
269 if (!hlist_empty(&tb->owners)) {
270 if (sk->sk_reuse > 1)
271 goto success;
272 if (tb->fastreuse > 0 &&
273 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
274 goto success;
275 } else {
276 ret = 1;
277 if (tcp_bind_conflict(sk, tb))
278 goto fail_unlock;
279 }
280 }
281tb_not_found:
282 ret = 1;
283 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
284 goto fail_unlock;
285 if (hlist_empty(&tb->owners)) {
286 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
287 tb->fastreuse = 1;
288 else
289 tb->fastreuse = 0;
290 } else if (tb->fastreuse &&
291 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
292 tb->fastreuse = 0;
293success:
294 if (!tcp_sk(sk)->bind_hash)
295 tcp_bind_hash(sk, tb, snum);
296 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
297 ret = 0;
298
299fail_unlock:
300 spin_unlock(&head->lock);
301fail:
302 local_bh_enable();
303 return ret;
304}
305
306/* Get rid of any references to a local port held by the
307 * given sock.
308 */
309static void __tcp_put_port(struct sock *sk)
310{
311 struct inet_sock *inet = inet_sk(sk);
312 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
313 struct tcp_bind_bucket *tb;
314
315 spin_lock(&head->lock);
316 tb = tcp_sk(sk)->bind_hash;
317 __sk_del_bind_node(sk);
318 tcp_sk(sk)->bind_hash = NULL;
319 inet->num = 0;
320 tcp_bucket_destroy(tb);
321 spin_unlock(&head->lock);
322}
323
324void tcp_put_port(struct sock *sk)
325{
326 local_bh_disable();
327 __tcp_put_port(sk);
328 local_bh_enable();
329}
330
331/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
332 * Look, when several writers sleep and a reader wakes them up, all but one
333 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
334 * this, _but_ remember, it adds useless work on UP machines (wake up each
335 * exclusive lock release). It should be ifdefed really.
336 */
337
338void tcp_listen_wlock(void)
339{
340 write_lock(&tcp_lhash_lock);
341
342 if (atomic_read(&tcp_lhash_users)) {
343 DEFINE_WAIT(wait);
344
345 for (;;) {
346 prepare_to_wait_exclusive(&tcp_lhash_wait,
347 &wait, TASK_UNINTERRUPTIBLE);
348 if (!atomic_read(&tcp_lhash_users))
349 break;
350 write_unlock_bh(&tcp_lhash_lock);
351 schedule();
352 write_lock_bh(&tcp_lhash_lock);
353 }
354
355 finish_wait(&tcp_lhash_wait, &wait);
356 }
357}
358
359static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
360{
361 struct hlist_head *list;
362 rwlock_t *lock;
363
364 BUG_TRAP(sk_unhashed(sk));
365 if (listen_possible && sk->sk_state == TCP_LISTEN) {
366 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
367 lock = &tcp_lhash_lock;
368 tcp_listen_wlock();
369 } else {
370 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
371 lock = &tcp_ehash[sk->sk_hashent].lock;
372 write_lock(lock);
373 }
374 __sk_add_node(sk, list);
375 sock_prot_inc_use(sk->sk_prot);
376 write_unlock(lock);
377 if (listen_possible && sk->sk_state == TCP_LISTEN)
378 wake_up(&tcp_lhash_wait);
379}
380
381static void tcp_v4_hash(struct sock *sk)
382{
383 if (sk->sk_state != TCP_CLOSE) {
384 local_bh_disable();
385 __tcp_v4_hash(sk, 1);
386 local_bh_enable();
387 }
388}
389
390void tcp_unhash(struct sock *sk)
391{
392 rwlock_t *lock;
393
394 if (sk_unhashed(sk))
395 goto ende;
396
397 if (sk->sk_state == TCP_LISTEN) {
398 local_bh_disable();
399 tcp_listen_wlock();
400 lock = &tcp_lhash_lock;
401 } else {
402 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
403 lock = &head->lock;
404 write_lock_bh(&head->lock);
405 }
406
407 if (__sk_del_node_init(sk))
408 sock_prot_dec_use(sk->sk_prot);
409 write_unlock_bh(lock);
410
411 ende:
412 if (sk->sk_state == TCP_LISTEN)
413 wake_up(&tcp_lhash_wait);
414}
415
416/* Don't inline this cruft. There are some nice properties to
417 * exploit here. The BSD API does not allow a listening TCP
418 * to specify the remote port nor the remote address for the
419 * connection. So always assume those are both wildcarded
420 * during the search since they can never be otherwise.
421 */
422static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
423 unsigned short hnum, int dif)
424{
425 struct sock *result = NULL, *sk;
426 struct hlist_node *node;
427 int score, hiscore;
428
429 hiscore=-1;
430 sk_for_each(sk, node, head) {
431 struct inet_sock *inet = inet_sk(sk);
432
433 if (inet->num == hnum && !ipv6_only_sock(sk)) {
434 __u32 rcv_saddr = inet->rcv_saddr;
435
436 score = (sk->sk_family == PF_INET ? 1 : 0);
437 if (rcv_saddr) {
438 if (rcv_saddr != daddr)
439 continue;
440 score+=2;
441 }
442 if (sk->sk_bound_dev_if) {
443 if (sk->sk_bound_dev_if != dif)
444 continue;
445 score+=2;
446 }
447 if (score == 5)
448 return sk;
449 if (score > hiscore) {
450 hiscore = score;
451 result = sk;
452 }
453 }
454 }
455 return result;
456}
457
458/* Optimize the common listener case. */
459static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
460 unsigned short hnum, int dif)
461{
462 struct sock *sk = NULL;
463 struct hlist_head *head;
464
465 read_lock(&tcp_lhash_lock);
466 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
467 if (!hlist_empty(head)) {
468 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
469
470 if (inet->num == hnum && !sk->sk_node.next &&
471 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
472 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
473 !sk->sk_bound_dev_if)
474 goto sherry_cache;
475 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
476 }
477 if (sk) {
478sherry_cache:
479 sock_hold(sk);
480 }
481 read_unlock(&tcp_lhash_lock);
482 return sk;
483}
484
485/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
486 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
487 *
488 * Local BH must be disabled here.
489 */
490
491static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
492 u32 daddr, u16 hnum,
493 int dif)
494{
495 struct tcp_ehash_bucket *head;
496 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
497 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
498 struct sock *sk;
499 struct hlist_node *node;
500 /* Optimize here for direct hit, only listening connections can
501 * have wildcards anyways.
502 */
503 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
504 head = &tcp_ehash[hash];
505 read_lock(&head->lock);
506 sk_for_each(sk, node, &head->chain) {
507 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
508 goto hit; /* You sunk my battleship! */
509 }
510
511 /* Must check for a TIME_WAIT'er before going to listener hash. */
512 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
513 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
514 goto hit;
515 }
516 sk = NULL;
517out:
518 read_unlock(&head->lock);
519 return sk;
520hit:
521 sock_hold(sk);
522 goto out;
523}
524
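/* Look up the socket for an incoming segment: try the established (and
 * TIME-WAIT) hash first, and fall back to the listening hash only when
 * no exact four-tuple match exists.
 */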
525static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
526 u32 daddr, u16 hnum, int dif)
527{
528 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
529 daddr, hnum, dif);
530
531 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
532}
533
534inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
535 u16 dport, int dif)
536{
537 struct sock *sk;
538
539 local_bh_disable();
540 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
541 local_bh_enable();
542
543 return sk;
544}
545
546EXPORT_SYMBOL_GPL(tcp_v4_lookup);
547
548static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
549{
550 return secure_tcp_sequence_number(skb->nh.iph->daddr,
551 skb->nh.iph->saddr,
552 skb->h.th->dest,
553 skb->h.th->source);
554}
555
556/* called with local bh disabled */
557static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
558 struct tcp_tw_bucket **twp)
559{
560 struct inet_sock *inet = inet_sk(sk);
561 u32 daddr = inet->rcv_saddr;
562 u32 saddr = inet->daddr;
563 int dif = sk->sk_bound_dev_if;
564 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
565 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
566 int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
567 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
568 struct sock *sk2;
569 struct hlist_node *node;
570 struct tcp_tw_bucket *tw;
571
572 write_lock(&head->lock);
573
574 /* Check TIME-WAIT sockets first. */
575 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
576 tw = (struct tcp_tw_bucket *)sk2;
577
578 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
579 struct tcp_sock *tp = tcp_sk(sk);
580
581 /* With PAWS, it is safe from the viewpoint
582 of data integrity. Even without PAWS it
583 is safe provided sequence spaces do not
584 overlap i.e. at data rates <= 80Mbit/sec.
585
586 Actually, the idea is close to VJ's one,
587 only timestamp cache is held not per host,
588 but per port pair and TW bucket is used
589 as state holder.
590
591 If TW bucket has been already destroyed we
592 fall back to VJ's scheme and use initial
593 timestamp retrieved from peer table.
594 */
595 if (tw->tw_ts_recent_stamp &&
596 (!twp || (sysctl_tcp_tw_reuse &&
597 xtime.tv_sec -
598 tw->tw_ts_recent_stamp > 1))) {
599 if ((tp->write_seq =
600 tw->tw_snd_nxt + 65535 + 2) == 0)
601 tp->write_seq = 1;
602 tp->rx_opt.ts_recent = tw->tw_ts_recent;
603 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
604 sock_hold(sk2);
605 goto unique;
606 } else
607 goto not_unique;
608 }
609 }
610 tw = NULL;
611
612 /* And established part... */
613 sk_for_each(sk2, node, &head->chain) {
614 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
615 goto not_unique;
616 }
617
618unique:
619 /* Must record num and sport now. Otherwise we will see
620	 * in the hash table a socket with a funny identity. */
621 inet->num = lport;
622 inet->sport = htons(lport);
623 sk->sk_hashent = hash;
624 BUG_TRAP(sk_unhashed(sk));
625 __sk_add_node(sk, &head->chain);
626 sock_prot_inc_use(sk->sk_prot);
627 write_unlock(&head->lock);
628
629 if (twp) {
630 *twp = tw;
631 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
632 } else if (tw) {
633 /* Silly. Should hash-dance instead... */
634 tcp_tw_deschedule(tw);
635 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
636
637 tcp_tw_put(tw);
638 }
639
640 return 0;
641
642not_unique:
643 write_unlock(&head->lock);
644 return -EADDRNOTAVAIL;
645}
646
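/* Per-destination starting point for the ephemeral port search below,
 * derived from a secure hash of the connection's addresses and the
 * destination port, so different peers walk the port space differently.
 */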
647static inline u32 connect_port_offset(const struct sock *sk)
648{
649 const struct inet_sock *inet = inet_sk(sk);
650
651 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
652 inet->dport);
653}
654
655/*
656 * Bind a port for a connect operation and hash it.
657 */
658static inline int tcp_v4_hash_connect(struct sock *sk)
659{
660 unsigned short snum = inet_sk(sk)->num;
661 struct tcp_bind_hashbucket *head;
662 struct tcp_bind_bucket *tb;
663 int ret;
664
665 if (!snum) {
666 int low = sysctl_local_port_range[0];
667 int high = sysctl_local_port_range[1];
668 int range = high - low;
669 int i;
670 int port;
671 static u32 hint;
672 u32 offset = hint + connect_port_offset(sk);
673 struct hlist_node *node;
674 struct tcp_tw_bucket *tw = NULL;
675
676 local_bh_disable();
677 for (i = 1; i <= range; i++) {
678 port = low + (i + offset) % range;
679 head = &tcp_bhash[tcp_bhashfn(port)];
680 spin_lock(&head->lock);
681
682 /* Does not bother with rcv_saddr checks,
683 * because the established check is already
684 * unique enough.
685 */
686 tb_for_each(tb, node, &head->chain) {
687 if (tb->port == port) {
688 BUG_TRAP(!hlist_empty(&tb->owners));
689 if (tb->fastreuse >= 0)
690 goto next_port;
691 if (!__tcp_v4_check_established(sk,
692 port,
693 &tw))
694 goto ok;
695 goto next_port;
696 }
697 }
698
699 tb = tcp_bucket_create(head, port);
700 if (!tb) {
701 spin_unlock(&head->lock);
702 break;
703 }
704 tb->fastreuse = -1;
705 goto ok;
706
707 next_port:
708 spin_unlock(&head->lock);
709 }
710 local_bh_enable();
711
712 return -EADDRNOTAVAIL;
713
714ok:
715 hint += i;
716
717 /* Head lock still held and bh's disabled */
718 tcp_bind_hash(sk, tb, port);
719 if (sk_unhashed(sk)) {
720 inet_sk(sk)->sport = htons(port);
721 __tcp_v4_hash(sk, 0);
722 }
723 spin_unlock(&head->lock);
724
725 if (tw) {
726 tcp_tw_deschedule(tw);
727 tcp_tw_put(tw);
728 }
729
730 ret = 0;
731 goto out;
732 }
733
734 head = &tcp_bhash[tcp_bhashfn(snum)];
735 tb = tcp_sk(sk)->bind_hash;
736 spin_lock_bh(&head->lock);
737 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
738 __tcp_v4_hash(sk, 0);
739 spin_unlock_bh(&head->lock);
740 return 0;
741 } else {
742 spin_unlock(&head->lock);
743 /* No definite answer... Walk to established hash table */
744 ret = __tcp_v4_check_established(sk, snum, NULL);
745out:
746 local_bh_enable();
747 return ret;
748 }
749}
750
751/* This will initiate an outgoing connection. */
752int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
753{
754 struct inet_sock *inet = inet_sk(sk);
755 struct tcp_sock *tp = tcp_sk(sk);
756 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
757 struct rtable *rt;
758 u32 daddr, nexthop;
759 int tmp;
760 int err;
761
762 if (addr_len < sizeof(struct sockaddr_in))
763 return -EINVAL;
764
765 if (usin->sin_family != AF_INET)
766 return -EAFNOSUPPORT;
767
768 nexthop = daddr = usin->sin_addr.s_addr;
769 if (inet->opt && inet->opt->srr) {
770 if (!daddr)
771 return -EINVAL;
772 nexthop = inet->opt->faddr;
773 }
774
775 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
776 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
777 IPPROTO_TCP,
778 inet->sport, usin->sin_port, sk);
779 if (tmp < 0)
780 return tmp;
781
782 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
783 ip_rt_put(rt);
784 return -ENETUNREACH;
785 }
786
787 if (!inet->opt || !inet->opt->srr)
788 daddr = rt->rt_dst;
789
790 if (!inet->saddr)
791 inet->saddr = rt->rt_src;
792 inet->rcv_saddr = inet->saddr;
793
794 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
795 /* Reset inherited state */
796 tp->rx_opt.ts_recent = 0;
797 tp->rx_opt.ts_recent_stamp = 0;
798 tp->write_seq = 0;
799 }
800
801 if (sysctl_tcp_tw_recycle &&
802 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
803 struct inet_peer *peer = rt_get_peer(rt);
804
805 /* VJ's idea. We save last timestamp seen from
806 * the destination in peer table, when entering state TIME-WAIT
807 * and initialize rx_opt.ts_recent from it, when trying new connection.
808 */
809
810 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
811 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
812 tp->rx_opt.ts_recent = peer->tcp_ts;
813 }
814 }
815
816 inet->dport = usin->sin_port;
817 inet->daddr = daddr;
818
819 tp->ext_header_len = 0;
820 if (inet->opt)
821 tp->ext_header_len = inet->opt->optlen;
822
823 tp->rx_opt.mss_clamp = 536;
824
825 /* Socket identity is still unknown (sport may be zero).
826	 * However we set state to SYN-SENT and, without releasing the socket
827	 * lock, select a source port, enter ourselves into the hash tables and
828 * complete initialization after this.
829 */
830 tcp_set_state(sk, TCP_SYN_SENT);
831 err = tcp_v4_hash_connect(sk);
832 if (err)
833 goto failure;
834
835 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
836 if (err)
837 goto failure;
838
839 /* OK, now commit destination to socket. */
840	sk_setup_caps(sk, &rt->u.dst);
841
842 if (!tp->write_seq)
843 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
844 inet->daddr,
845 inet->sport,
846 usin->sin_port);
847
848 inet->id = tp->write_seq ^ jiffies;
849
850 err = tcp_connect(sk);
851 rt = NULL;
852 if (err)
853 goto failure;
854
855 return 0;
856
857failure:
858 /* This unhashes the socket and releases the local port, if necessary. */
859 tcp_set_state(sk, TCP_CLOSE);
860 ip_rt_put(rt);
861 sk->sk_route_caps = 0;
862 inet->dport = 0;
863 return err;
864}
865
866static __inline__ int tcp_v4_iif(struct sk_buff *skb)
867{
868 return ((struct rtable *)skb->dst)->rt_iif;
869}
870
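/* Hash a (remote address, remote port) pair into the per-listener SYN
 * queue; hash_rnd is a per-listener random value that makes the bucket
 * choice hard for a remote sender to predict.
 */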
871static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
872{
873 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
874}
875
876static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
877					      struct request_sock ***prevp,
878					      __u16 rport,
879 __u32 raddr, __u32 laddr)
880{
881	struct listen_sock *lopt = tp->accept_queue.listen_opt;
882	struct request_sock *req, **prev;
883
884 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
885 (req = *prev) != NULL;
886 prev = &req->dl_next) {
887		const struct inet_request_sock *ireq = inet_rsk(req);
888
889 if (ireq->rmt_port == rport &&
890 ireq->rmt_addr == raddr &&
891 ireq->loc_addr == laddr &&
892		    TCP_INET_FAMILY(req->rsk_ops->family)) {
893			BUG_TRAP(!req->sk);
894 *prevp = prev;
895 break;
896 }
897 }
898
899 return req;
900}
901
902static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
903{
904 struct tcp_sock *tp = tcp_sk(sk);
905	struct listen_sock *lopt = tp->accept_queue.listen_opt;
906	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
907
908	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
909	tcp_synq_added(sk);
910}
911
912
913/*
914 * This routine does path mtu discovery as defined in RFC1191.
915 */
916static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
917 u32 mtu)
918{
919 struct dst_entry *dst;
920 struct inet_sock *inet = inet_sk(sk);
921 struct tcp_sock *tp = tcp_sk(sk);
922
923 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
924	 * sent out by Linux are always < 576 bytes so they should go through
925 * unfragmented).
926 */
927 if (sk->sk_state == TCP_LISTEN)
928 return;
929
930	/* We don't check in the dst entry if pmtu discovery is forbidden
931	 * on this route. We just assume that no packet-too-big packets
932	 * are sent back when pmtu discovery is not active.
933 * There is a small race when the user changes this flag in the
934 * route, but I think that's acceptable.
935 */
936 if ((dst = __sk_dst_check(sk, 0)) == NULL)
937 return;
938
939 dst->ops->update_pmtu(dst, mtu);
940
941 /* Something is about to be wrong... Remember soft error
942	 * for the case that this connection will not be able to recover.
943 */
944 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
945 sk->sk_err_soft = EMSGSIZE;
946
947 mtu = dst_mtu(dst);
948
949 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
950 tp->pmtu_cookie > mtu) {
951 tcp_sync_mss(sk, mtu);
952
953 /* Resend the TCP packet because it's
954 * clear that the old packet has been
955 * dropped. This is the new "fast" path mtu
956 * discovery.
957 */
958 tcp_simple_retransmit(sk);
959 } /* else let the usual retransmit timer handle it */
960}
961
962/*
963 * This routine is called by the ICMP module when it gets some
964 * sort of error condition. If err < 0 then the socket should
965 * be closed and the error returned to the user. If err > 0
966 * it's just the icmp type << 8 | icmp code. After adjustment
967 * header points to the first 8 bytes of the tcp header. We need
968 * to find the appropriate port.
969 *
970 * The locking strategy used here is very "optimistic". When
971 * someone else accesses the socket the ICMP is just dropped
972 * and for some paths there is no check at all.
973 * A more general error queue to queue errors for later handling
974 * is probably better.
975 *
976 */
977
978void tcp_v4_err(struct sk_buff *skb, u32 info)
979{
980 struct iphdr *iph = (struct iphdr *)skb->data;
981 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
982 struct tcp_sock *tp;
983 struct inet_sock *inet;
984 int type = skb->h.icmph->type;
985 int code = skb->h.icmph->code;
986 struct sock *sk;
987 __u32 seq;
988 int err;
989
990 if (skb->len < (iph->ihl << 2) + 8) {
991 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
992 return;
993 }
994
995 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
996 th->source, tcp_v4_iif(skb));
997 if (!sk) {
998 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
999 return;
1000 }
1001 if (sk->sk_state == TCP_TIME_WAIT) {
1002 tcp_tw_put((struct tcp_tw_bucket *)sk);
1003 return;
1004 }
1005
1006 bh_lock_sock(sk);
1007 /* If too many ICMPs get dropped on busy
1008 * servers this needs to be solved differently.
1009 */
1010 if (sock_owned_by_user(sk))
1011 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1012
1013 if (sk->sk_state == TCP_CLOSE)
1014 goto out;
1015
1016 tp = tcp_sk(sk);
1017 seq = ntohl(th->seq);
1018 if (sk->sk_state != TCP_LISTEN &&
1019 !between(seq, tp->snd_una, tp->snd_nxt)) {
1020 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1021 goto out;
1022 }
1023
1024 switch (type) {
1025 case ICMP_SOURCE_QUENCH:
1026 /* Just silently ignore these. */
1027 goto out;
1028 case ICMP_PARAMETERPROB:
1029 err = EPROTO;
1030 break;
1031 case ICMP_DEST_UNREACH:
1032 if (code > NR_ICMP_UNREACH)
1033 goto out;
1034
1035 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1036 if (!sock_owned_by_user(sk))
1037 do_pmtu_discovery(sk, iph, info);
1038 goto out;
1039 }
1040
1041 err = icmp_err_convert[code].errno;
1042 break;
1043 case ICMP_TIME_EXCEEDED:
1044 err = EHOSTUNREACH;
1045 break;
1046 default:
1047 goto out;
1048 }
1049
1050 switch (sk->sk_state) {
1051		struct request_sock *req, **prev;
1052	case TCP_LISTEN:
1053 if (sock_owned_by_user(sk))
1054 goto out;
1055
1056 req = tcp_v4_search_req(tp, &prev, th->dest,
1057 iph->daddr, iph->saddr);
1058 if (!req)
1059 goto out;
1060
1061 /* ICMPs are not backlogged, hence we cannot get
1062 an established socket here.
1063 */
1064 BUG_TRAP(!req->sk);
1065
1066		if (seq != tcp_rsk(req)->snt_isn) {
1067			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1068 goto out;
1069 }
1070
1071 /*
1072 * Still in SYN_RECV, just remove it silently.
1073 * There is no good way to pass the error to the newly
1074 * created socket, and POSIX does not want network
1075 * errors returned from accept().
1076 */
1077 tcp_synq_drop(sk, req, prev);
1078 goto out;
1079
1080 case TCP_SYN_SENT:
1081 case TCP_SYN_RECV: /* Cannot happen.
1082				   It can happen, e.g., if SYNs crossed.
1083 */
1084 if (!sock_owned_by_user(sk)) {
1085 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1086 sk->sk_err = err;
1087
1088 sk->sk_error_report(sk);
1089
1090 tcp_done(sk);
1091 } else {
1092 sk->sk_err_soft = err;
1093 }
1094 goto out;
1095 }
1096
1097 /* If we've already connected we will keep trying
1098 * until we time out, or the user gives up.
1099 *
1100 * rfc1122 4.2.3.9 allows to consider as hard errors
1101 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1102 * but it is obsoleted by pmtu discovery).
1103 *
1104	 * Note that in the modern internet, where routing is unreliable
1105	 * and broken firewalls sit in every dark corner, sending random
1106	 * errors ordered by their masters, even these two messages finally lose
1107 * their original sense (even Linux sends invalid PORT_UNREACHs)
1108 *
1109 * Now we are in compliance with RFCs.
1110 * --ANK (980905)
1111 */
1112
1113 inet = inet_sk(sk);
1114 if (!sock_owned_by_user(sk) && inet->recverr) {
1115 sk->sk_err = err;
1116 sk->sk_error_report(sk);
1117 } else { /* Only an error on timeout */
1118 sk->sk_err_soft = err;
1119 }
1120
1121out:
1122 bh_unlock_sock(sk);
1123 sock_put(sk);
1124}
1125
1126/* This routine computes an IPv4 TCP checksum. */
1127void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1128 struct sk_buff *skb)
1129{
1130 struct inet_sock *inet = inet_sk(sk);
1131
1132 if (skb->ip_summed == CHECKSUM_HW) {
1133 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1134 skb->csum = offsetof(struct tcphdr, check);
1135 } else {
1136 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1137 csum_partial((char *)th,
1138 th->doff << 2,
1139 skb->csum));
1140 }
1141}
1142
1143/*
1144 * This routine will send an RST to the other tcp.
1145 *
1146 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1147 *	for the reset.
1148 *	Answer: if a packet caused the RST, it is not for a socket
1149 *	existing in our system; if it is matched to a socket,
1150 *	it is just a duplicate segment or a bug in the other side's TCP.
1151 *	So we build the reply based only on the parameters
1152 *	that arrived with the segment.
1153 * Exception: precedence violation. We do not implement it in any case.
1154 */
1155
1156static void tcp_v4_send_reset(struct sk_buff *skb)
1157{
1158 struct tcphdr *th = skb->h.th;
1159 struct tcphdr rth;
1160 struct ip_reply_arg arg;
1161
1162 /* Never send a reset in response to a reset. */
1163 if (th->rst)
1164 return;
1165
1166 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1167 return;
1168
1169 /* Swap the send and the receive. */
1170 memset(&rth, 0, sizeof(struct tcphdr));
1171 rth.dest = th->source;
1172 rth.source = th->dest;
1173 rth.doff = sizeof(struct tcphdr) / 4;
1174 rth.rst = 1;
1175
1176 if (th->ack) {
1177 rth.seq = th->ack_seq;
1178 } else {
1179 rth.ack = 1;
1180 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1181 skb->len - (th->doff << 2));
1182 }
1183
1184 memset(&arg, 0, sizeof arg);
1185 arg.iov[0].iov_base = (unsigned char *)&rth;
1186 arg.iov[0].iov_len = sizeof rth;
1187 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1188 skb->nh.iph->saddr, /*XXX*/
1189 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1190 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1191
1192 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1193
1194 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1195 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1196}
1197
1198/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1199   outside socket context, is certainly ugly. What can I do?
1200 */
1201
1202static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1203 u32 win, u32 ts)
1204{
1205 struct tcphdr *th = skb->h.th;
1206 struct {
1207 struct tcphdr th;
1208 u32 tsopt[3];
1209 } rep;
1210 struct ip_reply_arg arg;
1211
1212 memset(&rep.th, 0, sizeof(struct tcphdr));
1213 memset(&arg, 0, sizeof arg);
1214
1215 arg.iov[0].iov_base = (unsigned char *)&rep;
1216 arg.iov[0].iov_len = sizeof(rep.th);
1217 if (ts) {
1218 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1219 (TCPOPT_TIMESTAMP << 8) |
1220 TCPOLEN_TIMESTAMP);
1221 rep.tsopt[1] = htonl(tcp_time_stamp);
1222 rep.tsopt[2] = htonl(ts);
1223 arg.iov[0].iov_len = sizeof(rep);
1224 }
1225
1226 /* Swap the send and the receive. */
1227 rep.th.dest = th->source;
1228 rep.th.source = th->dest;
1229 rep.th.doff = arg.iov[0].iov_len / 4;
1230 rep.th.seq = htonl(seq);
1231 rep.th.ack_seq = htonl(ack);
1232 rep.th.ack = 1;
1233 rep.th.window = htons(win);
1234
1235 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1236 skb->nh.iph->saddr, /*XXX*/
1237 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1238 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1239
1240 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1241
1242 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1243}
1244
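/* Acknowledge a segment that matched a TIME-WAIT bucket, using the
 * sequence state saved in the bucket, then drop the reference taken by
 * the lookup.
 */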
1245static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1246{
1247 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1248
1249 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1250 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1251
1252 tcp_tw_put(tw);
1253}
1254
1255static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1256{
1257	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1258			req->ts_recent);
1259}
1260
1261static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1262					  struct request_sock *req)
1263{
1264 struct rtable *rt;
1265	const struct inet_request_sock *ireq = inet_rsk(req);
1266	struct ip_options *opt = inet_rsk(req)->opt;
1267	struct flowi fl = { .oif = sk->sk_bound_dev_if,
1268 .nl_u = { .ip4_u =
1269 { .daddr = ((opt && opt->srr) ?
1270 opt->faddr :
1271						ireq->rmt_addr),
1272				   .saddr = ireq->loc_addr,
1273				   .tos = RT_CONN_FLAGS(sk) } },
1274 .proto = IPPROTO_TCP,
1275 .uli_u = { .ports =
1276 { .sport = inet_sk(sk)->sport,
1277				 .dport = ireq->rmt_port } } };
1278
1279 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1280 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1281 return NULL;
1282 }
1283 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1284 ip_rt_put(rt);
1285 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1286 return NULL;
1287 }
1288 return &rt->u.dst;
1289}
1290
1291/*
1292 * Send a SYN-ACK after having received an ACK.
1293 *	This still operates on a request_sock only, not on a big
1294 *	socket.
1295 */
1296static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1297			      struct dst_entry *dst)
1298{
1299	const struct inet_request_sock *ireq = inet_rsk(req);
1300	int err = -1;
1301 struct sk_buff * skb;
1302
1303 /* First, grab a route. */
1304 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1305 goto out;
1306
1307 skb = tcp_make_synack(sk, dst, req);
1308
1309 if (skb) {
1310 struct tcphdr *th = skb->h.th;
1311
1312 th->check = tcp_v4_check(th, skb->len,
1313					 ireq->loc_addr,
1314					 ireq->rmt_addr,
1315					 csum_partial((char *)th, skb->len,
1316 skb->csum));
1317
1318		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1319					    ireq->rmt_addr,
1320					    ireq->opt);
1321		if (err == NET_XMIT_CN)
1322 err = 0;
1323 }
1324
1325out:
1326 dst_release(dst);
1327 return err;
1328}
1329
1330/*
1331 *	IPv4 request_sock destructor.
1332 */
1333static void tcp_v4_reqsk_destructor(struct request_sock *req)
1334{
1335	if (inet_rsk(req)->opt)
1336		kfree(inet_rsk(req)->opt);
1337}
1338
1339static inline void syn_flood_warning(struct sk_buff *skb)
1340{
1341 static unsigned long warntime;
1342
1343 if (time_after(jiffies, (warntime + HZ * 60))) {
1344 warntime = jiffies;
1345 printk(KERN_INFO
1346 "possible SYN flooding on port %d. Sending cookies.\n",
1347 ntohs(skb->h.th->dest));
1348 }
1349}
1350
1351/*
1352 * Save and compile IPv4 options into the request_sock if needed.
1353 */
1354static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1355 struct sk_buff *skb)
1356{
1357 struct ip_options *opt = &(IPCB(skb)->opt);
1358 struct ip_options *dopt = NULL;
1359
1360 if (opt && opt->optlen) {
1361 int opt_size = optlength(opt);
1362 dopt = kmalloc(opt_size, GFP_ATOMIC);
1363 if (dopt) {
1364 if (ip_options_echo(dopt, skb)) {
1365 kfree(dopt);
1366 dopt = NULL;
1367 }
1368 }
1369 }
1370 return dopt;
1371}
1372
1373struct request_sock_ops tcp_request_sock_ops = {
1374	.family		= PF_INET,
1375	.obj_size	= sizeof(struct tcp_request_sock),
1376	.rtx_syn_ack	= tcp_v4_send_synack,
1377	.send_ack	= tcp_v4_reqsk_send_ack,
1378	.destructor	= tcp_v4_reqsk_destructor,
1379	.send_reset	= tcp_v4_send_reset,
1380};
1381
1382int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1383{
1384	struct inet_request_sock *ireq;
1385	struct tcp_options_received tmp_opt;
1386	struct request_sock *req;
1387	__u32 saddr = skb->nh.iph->saddr;
1388 __u32 daddr = skb->nh.iph->daddr;
1389 __u32 isn = TCP_SKB_CB(skb)->when;
1390 struct dst_entry *dst = NULL;
1391#ifdef CONFIG_SYN_COOKIES
1392 int want_cookie = 0;
1393#else
1394#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1395#endif
1396
1397	/* Never answer SYNs sent to broadcast or multicast */
1398 if (((struct rtable *)skb->dst)->rt_flags &
1399 (RTCF_BROADCAST | RTCF_MULTICAST))
1400 goto drop;
1401
1402 /* TW buckets are converted to open requests without
1403	 * limitations; they conserve resources and the peer is
1404	 * evidently a real one.
1405 */
1406 if (tcp_synq_is_full(sk) && !isn) {
1407#ifdef CONFIG_SYN_COOKIES
1408 if (sysctl_tcp_syncookies) {
1409 want_cookie = 1;
1410 } else
1411#endif
1412 goto drop;
1413 }
1414
1415 /* Accept backlog is full. If we have already queued enough
1416 * of warm entries in syn queue, drop request. It is better than
1417 * clogging syn queue with openreqs with exponentially increasing
1418 * timeout.
1419 */
1420 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1421 goto drop;
1422
1423	req = reqsk_alloc(&tcp_request_sock_ops);
1424	if (!req)
1425 goto drop;
1426
1427 tcp_clear_options(&tmp_opt);
1428 tmp_opt.mss_clamp = 536;
1429 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1430
1431 tcp_parse_options(skb, &tmp_opt, 0);
1432
1433 if (want_cookie) {
1434 tcp_clear_options(&tmp_opt);
1435 tmp_opt.saw_tstamp = 0;
1436 }
1437
1438 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1439	/* Some OSes (unknown ones, but I see them on web servers, which
1440	 * contain information interesting only for Windows
1441	 * users) do not send their stamp in the SYN. It is the easy case.
1442 * We simply do not advertise TS support.
1443 */
1444 tmp_opt.saw_tstamp = 0;
1445 tmp_opt.tstamp_ok = 0;
1446 }
1447 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1448
1449 tcp_openreq_init(req, &tmp_opt, skb);
1450
1451	ireq = inet_rsk(req);
1452	ireq->loc_addr = daddr;
1453	ireq->rmt_addr = saddr;
1454	ireq->opt = tcp_v4_save_options(sk, skb);
1455	if (!want_cookie)
1456 TCP_ECN_create_request(req, skb->h.th);
1457
1458 if (want_cookie) {
1459#ifdef CONFIG_SYN_COOKIES
1460 syn_flood_warning(skb);
1461#endif
1462 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1463 } else if (!isn) {
1464 struct inet_peer *peer = NULL;
1465
1466 /* VJ's idea. We save last timestamp seen
1467 * from the destination in peer table, when entering
1468 * state TIME-WAIT, and check against it before
1469 * accepting new connection request.
1470 *
1471 * If "isn" is not zero, this request hit alive
1472 * timewait bucket, so that all the necessary checks
1473 * are made in the function processing timewait state.
1474 */
1475 if (tmp_opt.saw_tstamp &&
1476 sysctl_tcp_tw_recycle &&
1477 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1478 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1479 peer->v4daddr == saddr) {
1480 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1481 (s32)(peer->tcp_ts - req->ts_recent) >
1482 TCP_PAWS_WINDOW) {
1483 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1484 dst_release(dst);
1485 goto drop_and_free;
1486 }
1487 }
1488 /* Kill the following clause, if you dislike this way. */
1489 else if (!sysctl_tcp_syncookies &&
1490 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1491 (sysctl_max_syn_backlog >> 2)) &&
1492 (!peer || !peer->tcp_ts_stamp) &&
1493 (!dst || !dst_metric(dst, RTAX_RTT))) {
1494 /* Without syncookies last quarter of
1495 * backlog is filled with destinations,
1496 * proven to be alive.
1497 * It means that we continue to communicate
1498 * to destinations, already remembered
1499 * to the moment of synflood.
1500 */
1501			LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1502 "request from %u.%u."
1503 "%u.%u/%u\n",
1504 NIPQUAD(saddr),
1505 ntohs(skb->h.th->source)));
1506			dst_release(dst);
1507 goto drop_and_free;
1508 }
1509
1510 isn = tcp_v4_init_sequence(sk, skb);
1511 }
1512	tcp_rsk(req)->snt_isn = isn;
1513
1514 if (tcp_v4_send_synack(sk, req, dst))
1515 goto drop_and_free;
1516
1517 if (want_cookie) {
1518		reqsk_free(req);
1519	} else {
1520 tcp_v4_synq_add(sk, req);
1521 }
1522 return 0;
1523
1524drop_and_free:
1525	reqsk_free(req);
1526drop:
1527 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1528 return 0;
1529}
1530
1531
1532/*
1533 * The three way handshake has completed - we got a valid synack -
1534 * now create the new socket.
1535 */
1536struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1537				  struct request_sock *req,
1538				  struct dst_entry *dst)
1539{
1540	struct inet_request_sock *ireq;
1541	struct inet_sock *newinet;
1542 struct tcp_sock *newtp;
1543 struct sock *newsk;
1544
1545 if (sk_acceptq_is_full(sk))
1546 goto exit_overflow;
1547
1548 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1549 goto exit;
1550
1551 newsk = tcp_create_openreq_child(sk, req, skb);
1552 if (!newsk)
1553 goto exit;
1554
1555	sk_setup_caps(newsk, dst);
1556
1557 newtp = tcp_sk(newsk);
1558 newinet = inet_sk(newsk);
1559	ireq = inet_rsk(req);
1560 newinet->daddr = ireq->rmt_addr;
1561 newinet->rcv_saddr = ireq->loc_addr;
1562 newinet->saddr = ireq->loc_addr;
1563 newinet->opt = ireq->opt;
1564 ireq->opt = NULL;
1565	newinet->mc_index = tcp_v4_iif(skb);
1566 newinet->mc_ttl = skb->nh.iph->ttl;
1567 newtp->ext_header_len = 0;
1568 if (newinet->opt)
1569 newtp->ext_header_len = newinet->opt->optlen;
1570 newinet->id = newtp->write_seq ^ jiffies;
1571
1572 tcp_sync_mss(newsk, dst_mtu(dst));
1573 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1574 tcp_initialize_rcv_mss(newsk);
1575
1576 __tcp_v4_hash(newsk, 0);
1577 __tcp_inherit_port(sk, newsk);
1578
1579 return newsk;
1580
1581exit_overflow:
1582 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1583exit:
1584 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1585 dst_release(dst);
1586 return NULL;
1587}
1588
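/* For a segment arriving on a listening socket: look for a matching open
 * request (SYN_RECV) first, then for an established or TIME-WAIT socket
 * created in the meantime, and finally give syncookies a chance at a
 * bare ACK.
 */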
1589static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1590{
1591 struct tcphdr *th = skb->h.th;
1592 struct iphdr *iph = skb->nh.iph;
1593 struct tcp_sock *tp = tcp_sk(sk);
1594 struct sock *nsk;
1595	struct request_sock **prev;
1596	/* Find possible connection requests. */
1597	struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1598						     iph->saddr, iph->daddr);
1599 if (req)
1600 return tcp_check_req(sk, skb, req, prev);
1601
1602 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1603 th->source,
1604 skb->nh.iph->daddr,
1605 ntohs(th->dest),
1606 tcp_v4_iif(skb));
1607
1608 if (nsk) {
1609 if (nsk->sk_state != TCP_TIME_WAIT) {
1610 bh_lock_sock(nsk);
1611 return nsk;
1612 }
1613 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1614 return NULL;
1615 }
1616
1617#ifdef CONFIG_SYN_COOKIES
1618 if (!th->rst && !th->syn && th->ack)
1619 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1620#endif
1621 return sk;
1622}
1623
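/* Validate or defer the TCP checksum: verify a hardware checksum when one
 * is present, fully verify short packets in software, and otherwise seed
 * skb->csum with the pseudo-header sum so the check can be completed
 * later during the copy.
 */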
1624static int tcp_v4_checksum_init(struct sk_buff *skb)
1625{
1626 if (skb->ip_summed == CHECKSUM_HW) {
1627 skb->ip_summed = CHECKSUM_UNNECESSARY;
1628 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1629 skb->nh.iph->daddr, skb->csum))
1630 return 0;
1631
1632		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1633		skb->ip_summed = CHECKSUM_NONE;
1634 }
1635 if (skb->len <= 76) {
1636 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1637 skb->nh.iph->daddr,
1638 skb_checksum(skb, 0, skb->len, 0)))
1639 return -1;
1640 skb->ip_summed = CHECKSUM_UNNECESSARY;
1641 } else {
1642 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1643 skb->nh.iph->saddr,
1644 skb->nh.iph->daddr, 0);
1645 }
1646 return 0;
1647}
1648
1649
1650/* The socket must have its spinlock held when we get
1651 * here.
1652 *
1653 * We have a potential double-lock case here, so even when
1654 * doing backlog processing we use the BH locking scheme.
1655 * This is because we cannot sleep with the original spinlock
1656 * held.
1657 */
1658int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1659{
1660 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1661 TCP_CHECK_TIMER(sk);
1662 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1663 goto reset;
1664 TCP_CHECK_TIMER(sk);
1665 return 0;
1666 }
1667
1668 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1669 goto csum_err;
1670
1671 if (sk->sk_state == TCP_LISTEN) {
1672 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1673 if (!nsk)
1674 goto discard;
1675
1676 if (nsk != sk) {
1677 if (tcp_child_process(sk, nsk, skb))
1678 goto reset;
1679 return 0;
1680 }
1681 }
1682
1683 TCP_CHECK_TIMER(sk);
1684 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1685 goto reset;
1686 TCP_CHECK_TIMER(sk);
1687 return 0;
1688
1689reset:
1690 tcp_v4_send_reset(skb);
1691discard:
1692 kfree_skb(skb);
1693 /* Be careful here. If this function gets more complicated and
1694 * gcc suffers from register pressure on the x86, sk (in %ebx)
1695 * might be destroyed here. This current version compiles correctly,
1696 * but you have been warned.
1697 */
1698 return 0;
1699
1700csum_err:
1701 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1702 goto discard;
1703}
1704
1705/*
1706 * From tcp_input.c
1707 */
1708
1709int tcp_v4_rcv(struct sk_buff *skb)
1710{
1711 struct tcphdr *th;
1712 struct sock *sk;
1713 int ret;
1714
1715 if (skb->pkt_type != PACKET_HOST)
1716 goto discard_it;
1717
1718 /* Count it even if it's bad */
1719 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1720
1721 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1722 goto discard_it;
1723
1724 th = skb->h.th;
1725
1726 if (th->doff < sizeof(struct tcphdr) / 4)
1727 goto bad_packet;
1728 if (!pskb_may_pull(skb, th->doff * 4))
1729 goto discard_it;
1730
1731 /* An explanation is required here, I think.
1732 * Packet length and doff are validated by header prediction,
1733	 * provided the case of th->doff==0 is eliminated.
1734 * So, we defer the checks. */
1735 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1736 tcp_v4_checksum_init(skb) < 0))
1737 goto bad_packet;
1738
1739 th = skb->h.th;
1740 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1741 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1742 skb->len - th->doff * 4);
1743 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1744 TCP_SKB_CB(skb)->when = 0;
1745 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1746 TCP_SKB_CB(skb)->sacked = 0;
1747
1748 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1749 skb->nh.iph->daddr, ntohs(th->dest),
1750 tcp_v4_iif(skb));
1751
1752 if (!sk)
1753 goto no_tcp_socket;
1754
1755process:
1756 if (sk->sk_state == TCP_TIME_WAIT)
1757 goto do_time_wait;
1758
1759 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1760 goto discard_and_relse;
1761
1762 if (sk_filter(sk, skb, 0))
1763 goto discard_and_relse;
1764
1765 skb->dev = NULL;
1766
1767 bh_lock_sock(sk);
1768 ret = 0;
1769 if (!sock_owned_by_user(sk)) {
1770 if (!tcp_prequeue(sk, skb))
1771 ret = tcp_v4_do_rcv(sk, skb);
1772 } else
1773 sk_add_backlog(sk, skb);
1774 bh_unlock_sock(sk);
1775
1776 sock_put(sk);
1777
1778 return ret;
1779
1780no_tcp_socket:
1781 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1782 goto discard_it;
1783
1784 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1785bad_packet:
1786 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1787 } else {
1788 tcp_v4_send_reset(skb);
1789 }
1790
1791discard_it:
1792 /* Discard frame. */
1793 kfree_skb(skb);
1794 return 0;
1795
1796discard_and_relse:
1797 sock_put(sk);
1798 goto discard_it;
1799
1800do_time_wait:
1801 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1802 tcp_tw_put((struct tcp_tw_bucket *) sk);
1803 goto discard_it;
1804 }
1805
1806 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1807 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1808 tcp_tw_put((struct tcp_tw_bucket *) sk);
1809 goto discard_it;
1810 }
1811 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1812 skb, th, skb->len)) {
1813 case TCP_TW_SYN: {
1814 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1815 ntohs(th->dest),
1816 tcp_v4_iif(skb));
1817 if (sk2) {
1818 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1819 tcp_tw_put((struct tcp_tw_bucket *)sk);
1820 sk = sk2;
1821 goto process;
1822 }
1823 /* Fall through to ACK */
1824 }
1825 case TCP_TW_ACK:
1826 tcp_v4_timewait_ack(sk, skb);
1827 break;
1828 case TCP_TW_RST:
1829 goto no_tcp_socket;
1830 case TCP_TW_SUCCESS:;
1831 }
1832 goto discard_it;
1833}
1834
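/* Re-query the route and, if the preferred source address has changed,
 * rewrite the socket's source address and rehash it; the caller only
 * allows this while in SYN-SENT and with ip_dynaddr enabled.
 */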
1835static int tcp_v4_reselect_saddr(struct sock *sk)
1836{
1837 struct inet_sock *inet = inet_sk(sk);
1838 int err;
1839 struct rtable *rt;
1840 __u32 old_saddr = inet->saddr;
1841 __u32 new_saddr;
1842 __u32 daddr = inet->daddr;
1843
1844 if (inet->opt && inet->opt->srr)
1845 daddr = inet->opt->faddr;
1846
1847 /* Query new route. */
1848 err = ip_route_connect(&rt, daddr, 0,
1849 RT_CONN_FLAGS(sk),
1850 sk->sk_bound_dev_if,
1851 IPPROTO_TCP,
1852 inet->sport, inet->dport, sk);
1853 if (err)
1854 return err;
1855
1856	sk_setup_caps(sk, &rt->u.dst);
1857
1858 new_saddr = rt->rt_src;
1859
1860 if (new_saddr == old_saddr)
1861 return 0;
1862
1863 if (sysctl_ip_dynaddr > 1) {
1864 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1865 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1866 NIPQUAD(old_saddr),
1867 NIPQUAD(new_saddr));
1868 }
1869
1870 inet->saddr = new_saddr;
1871 inet->rcv_saddr = new_saddr;
1872
1873	/* XXX The only ugly spot where we really need to
1874	 * XXX change the socket's identity after it has
1875	 * XXX entered the hashes. -DaveM
1876	 *
1877	 * Besides that, it does not check for connection
1878	 * uniqueness. Expect trouble.
1879	 */
1880	__sk_prot_rehash(sk);
1881	return 0;
1882}
1883
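/* Revalidate the socket's cached route.  If it has been invalidated,
 * build a new one; on failure, clear the route capabilities and, when
 * ip_dynaddr allows it for an unbound SYN_SENT socket, fall back to
 * re-selecting the source address.
 */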
1884int tcp_v4_rebuild_header(struct sock *sk)
1885{
1886 struct inet_sock *inet = inet_sk(sk);
1887 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1888 u32 daddr;
1889 int err;
1890
1891 /* Route is OK, nothing to do. */
1892 if (rt)
1893 return 0;
1894
1895 /* Reroute. */
1896 daddr = inet->daddr;
1897 if (inet->opt && inet->opt->srr)
1898 daddr = inet->opt->faddr;
1899
1900 {
1901 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1902 .nl_u = { .ip4_u =
1903 { .daddr = daddr,
1904 .saddr = inet->saddr,
1905 .tos = RT_CONN_FLAGS(sk) } },
1906 .proto = IPPROTO_TCP,
1907 .uli_u = { .ports =
1908 { .sport = inet->sport,
1909 .dport = inet->dport } } };
1910
1911 err = ip_route_output_flow(&rt, &fl, sk, 0);
1912 }
1913 if (!err) {
1914		sk_setup_caps(sk, &rt->u.dst);
1915		return 0;
1916 }
1917
1918 /* Routing failed... */
1919 sk->sk_route_caps = 0;
1920
1921 if (!sysctl_ip_dynaddr ||
1922 sk->sk_state != TCP_SYN_SENT ||
1923 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1924 (err = tcp_v4_reselect_saddr(sk)) != 0)
1925 sk->sk_err_soft = -err;
1926
1927 return err;
1928}
1929
1930static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1931{
1932 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1933 struct inet_sock *inet = inet_sk(sk);
1934
1935 sin->sin_family = AF_INET;
1936 sin->sin_addr.s_addr = inet->daddr;
1937 sin->sin_port = inet->dport;
1938}
1939
1940/* VJ's idea. Save the last timestamp seen from this destination
1941 * and hold it for at least the normal timewait interval, so it can be used
1942 * for duplicate segment detection in subsequent connections before they
1943 * enter the synchronized state.
1944 */
1945
1946int tcp_v4_remember_stamp(struct sock *sk)
1947{
1948 struct inet_sock *inet = inet_sk(sk);
1949 struct tcp_sock *tp = tcp_sk(sk);
1950 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1951 struct inet_peer *peer = NULL;
1952 int release_it = 0;
1953
1954 if (!rt || rt->rt_dst != inet->daddr) {
1955 peer = inet_getpeer(inet->daddr, 1);
1956 release_it = 1;
1957 } else {
1958 if (!rt->peer)
1959 rt_bind_peer(rt, 1);
1960 peer = rt->peer;
1961 }
1962
1963 if (peer) {
1964 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1965 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1966 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1967 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1968 peer->tcp_ts = tp->rx_opt.ts_recent;
1969 }
1970 if (release_it)
1971 inet_putpeer(peer);
1972 return 1;
1973 }
1974
1975 return 0;
1976}
1977
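/* Same idea as tcp_v4_remember_stamp(), but for a TIME_WAIT bucket:
 * push the bucket's most recent timestamp into the inet_peer cache.
 */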
1978int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1979{
1980 struct inet_peer *peer = NULL;
1981
1982 peer = inet_getpeer(tw->tw_daddr, 1);
1983
1984 if (peer) {
1985 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1986 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1987 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1988 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1989 peer->tcp_ts = tw->tw_ts_recent;
1990 }
1991 inet_putpeer(peer);
1992 return 1;
1993 }
1994
1995 return 0;
1996}
1997
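/* Address-family specific operations used by the protocol-independent
 * TCP code when the socket is IPv4.
 */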
1998struct tcp_func ipv4_specific = {
1999 .queue_xmit = ip_queue_xmit,
2000 .send_check = tcp_v4_send_check,
2001 .rebuild_header = tcp_v4_rebuild_header,
2002 .conn_request = tcp_v4_conn_request,
2003 .syn_recv_sock = tcp_v4_syn_recv_sock,
2004 .remember_stamp = tcp_v4_remember_stamp,
2005 .net_header_len = sizeof(struct iphdr),
2006 .setsockopt = ip_setsockopt,
2007 .getsockopt = ip_getsockopt,
2008 .addr2sockaddr = v4_addr2sockaddr,
2009 .sockaddr_len = sizeof(struct sockaddr_in),
2010};
2011
2012/* NOTE: A lot of things are set to zero explicitly by the call to
2013 * sk_alloc(), so they need not be done here.
2014 */
2015static int tcp_v4_init_sock(struct sock *sk)
2016{
2017 struct tcp_sock *tp = tcp_sk(sk);
2018
2019 skb_queue_head_init(&tp->out_of_order_queue);
2020 tcp_init_xmit_timers(sk);
2021 tcp_prequeue_init(tp);
2022
2023 tp->rto = TCP_TIMEOUT_INIT;
2024 tp->mdev = TCP_TIMEOUT_INIT;
2025
2026 /* So many TCP implementations out there (incorrectly) count the
2027 * initial SYN frame in their delayed-ACK and congestion control
2028 * algorithms that we must have the following bandaid to talk
2029 * efficiently to them. -DaveM
2030 */
2031 tp->snd_cwnd = 2;
2032
2033 /* See draft-stevens-tcpca-spec-01 for discussion of the
2034 * initialization of these values.
2035 */
2036 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2037 tp->snd_cwnd_clamp = ~0;
2038	tp->mss_cache = 536;
2039
2040 tp->reordering = sysctl_tcp_reordering;
2041	tp->ca_ops = &tcp_init_congestion_ops;
2042
2043 sk->sk_state = TCP_CLOSE;
2044
2045 sk->sk_write_space = sk_stream_write_space;
2046 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2047
2048 tp->af_specific = &ipv4_specific;
2049
2050 sk->sk_sndbuf = sysctl_tcp_wmem[1];
2051 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2052
2053 atomic_inc(&tcp_sockets_allocated);
2054
2055 return 0;
2056}
2057
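/* Per-socket teardown: stop the TCP timers, release congestion control
 * state, purge the write, out-of-order and prequeue queues, drop the
 * bind bucket reference and free the cached sendmsg page.
 */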
2058int tcp_v4_destroy_sock(struct sock *sk)
2059{
2060 struct tcp_sock *tp = tcp_sk(sk);
2061
2062 tcp_clear_xmit_timers(sk);
2063
2064	tcp_cleanup_congestion_control(tp);
2065
2066	/* Clean up the write buffer. */
2067 sk_stream_writequeue_purge(sk);
2068
2069 /* Cleans up our, hopefully empty, out_of_order_queue. */
2070 __skb_queue_purge(&tp->out_of_order_queue);
2071
2072 /* Clean prequeue, it must be empty really */
2073	/* Clean up the prequeue; it really should be empty. */
2074
2075 /* Clean up a referenced TCP bind bucket. */
2076 if (tp->bind_hash)
2077 tcp_put_port(sk);
2078
2079 /*
2080 * If sendmsg cached page exists, toss it.
2081 */
2082 if (sk->sk_sndmsg_page) {
2083 __free_page(sk->sk_sndmsg_page);
2084 sk->sk_sndmsg_page = NULL;
2085 }
2086
2087 atomic_dec(&tcp_sockets_allocated);
2088
2089 return 0;
2090}
2091
2092EXPORT_SYMBOL(tcp_v4_destroy_sock);
2093
2094#ifdef CONFIG_PROC_FS
2095/* Proc filesystem TCP sock list dumping. */
2096
2097static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2098{
2099 return hlist_empty(head) ? NULL :
2100 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2101}
2102
2103static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2104{
2105 return tw->tw_node.next ?
2106 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2107}
2108
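/* Advance the /proc iterator over the listening hash.  For each
 * listening socket of the requested family we also walk its SYN table
 * (pending open requests) under syn_wait_lock before moving on to the
 * next socket or bucket.
 */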
2109static void *listening_get_next(struct seq_file *seq, void *cur)
2110{
2111 struct tcp_sock *tp;
2112 struct hlist_node *node;
2113 struct sock *sk = cur;
2114 struct tcp_iter_state* st = seq->private;
2115
2116 if (!sk) {
2117 st->bucket = 0;
2118 sk = sk_head(&tcp_listening_hash[0]);
2119 goto get_sk;
2120 }
2121
2122 ++st->num;
2123
2124 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2125		struct request_sock *req = cur;
2126
2127 tp = tcp_sk(st->syn_wait_sk);
2128 req = req->dl_next;
2129 while (1) {
2130 while (req) {
2131				if (req->rsk_ops->family == st->family) {
2132					cur = req;
2133 goto out;
2134 }
2135 req = req->dl_next;
2136 }
2137 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2138 break;
2139get_req:
2140			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2141		}
2142 sk = sk_next(st->syn_wait_sk);
2143 st->state = TCP_SEQ_STATE_LISTENING;
2144		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2145	} else {
2146 tp = tcp_sk(sk);
2147		read_lock_bh(&tp->accept_queue.syn_wait_lock);
2148 if (reqsk_queue_len(&tp->accept_queue))
2149			goto start_req;
2150		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2151		sk = sk_next(sk);
2152 }
2153get_sk:
2154 sk_for_each_from(sk, node) {
2155 if (sk->sk_family == st->family) {
2156 cur = sk;
2157 goto out;
2158 }
2159 tp = tcp_sk(sk);
2160		read_lock_bh(&tp->accept_queue.syn_wait_lock);
2161 if (reqsk_queue_len(&tp->accept_queue)) {
2162start_req:
2163 st->uid = sock_i_uid(sk);
2164 st->syn_wait_sk = sk;
2165 st->state = TCP_SEQ_STATE_OPENREQ;
2166 st->sbucket = 0;
2167 goto get_req;
2168 }
2169		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2170	}
2171 if (++st->bucket < TCP_LHTABLE_SIZE) {
2172 sk = sk_head(&tcp_listening_hash[st->bucket]);
2173 goto get_sk;
2174 }
2175 cur = NULL;
2176out:
2177 return cur;
2178}
2179
2180static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2181{
2182 void *rc = listening_get_next(seq, NULL);
2183
2184 while (rc && *pos) {
2185 rc = listening_get_next(seq, rc);
2186 --*pos;
2187 }
2188 return rc;
2189}
2190
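/* Find the first established or TIME_WAIT entry of the requested
 * family.  On success the bucket's read lock is left held; it is
 * released by established_get_next() or tcp_seq_stop().
 */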
2191static void *established_get_first(struct seq_file *seq)
2192{
2193 struct tcp_iter_state* st = seq->private;
2194 void *rc = NULL;
2195
2196 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2197 struct sock *sk;
2198 struct hlist_node *node;
2199 struct tcp_tw_bucket *tw;
2200
2201 /* We can reschedule _before_ having picked the target: */
2202 cond_resched_softirq();
2203
2204 read_lock(&tcp_ehash[st->bucket].lock);
2205 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2206 if (sk->sk_family != st->family) {
2207 continue;
2208 }
2209 rc = sk;
2210 goto out;
2211 }
2212 st->state = TCP_SEQ_STATE_TIME_WAIT;
2213 tw_for_each(tw, node,
2214 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2215 if (tw->tw_family != st->family) {
2216 continue;
2217 }
2218 rc = tw;
2219 goto out;
2220 }
2221 read_unlock(&tcp_ehash[st->bucket].lock);
2222 st->state = TCP_SEQ_STATE_ESTABLISHED;
2223 }
2224out:
2225 return rc;
2226}
2227
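/* Step to the next entry: continue along the current established
 * chain, fall through to the bucket's TIME_WAIT chain, and finally
 * move to the next hash bucket, dropping and retaking the bucket
 * locks as we go.
 */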
2228static void *established_get_next(struct seq_file *seq, void *cur)
2229{
2230 struct sock *sk = cur;
2231 struct tcp_tw_bucket *tw;
2232 struct hlist_node *node;
2233 struct tcp_iter_state* st = seq->private;
2234
2235 ++st->num;
2236
2237 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2238 tw = cur;
2239 tw = tw_next(tw);
2240get_tw:
2241 while (tw && tw->tw_family != st->family) {
2242 tw = tw_next(tw);
2243 }
2244 if (tw) {
2245 cur = tw;
2246 goto out;
2247 }
2248 read_unlock(&tcp_ehash[st->bucket].lock);
2249 st->state = TCP_SEQ_STATE_ESTABLISHED;
2250
2251 /* We can reschedule between buckets: */
2252 cond_resched_softirq();
2253
2254 if (++st->bucket < tcp_ehash_size) {
2255 read_lock(&tcp_ehash[st->bucket].lock);
2256 sk = sk_head(&tcp_ehash[st->bucket].chain);
2257 } else {
2258 cur = NULL;
2259 goto out;
2260 }
2261 } else
2262 sk = sk_next(sk);
2263
2264 sk_for_each_from(sk, node) {
2265 if (sk->sk_family == st->family)
2266 goto found;
2267 }
2268
2269 st->state = TCP_SEQ_STATE_TIME_WAIT;
2270 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2271 goto get_tw;
2272found:
2273 cur = sk;
2274out:
2275 return cur;
2276}
2277
2278static void *established_get_idx(struct seq_file *seq, loff_t pos)
2279{
2280 void *rc = established_get_first(seq);
2281
2282 while (rc && pos) {
2283 rc = established_get_next(seq, rc);
2284 --pos;
2285 }
2286 return rc;
2287}
2288
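/* Position the iterator at entry 'pos': walk the listening table
 * first and, if pos runs past it, continue into the established
 * table with bottom halves disabled.
 */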
2289static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2290{
2291 void *rc;
2292 struct tcp_iter_state* st = seq->private;
2293
2294 tcp_listen_lock();
2295 st->state = TCP_SEQ_STATE_LISTENING;
2296 rc = listening_get_idx(seq, &pos);
2297
2298 if (!rc) {
2299 tcp_listen_unlock();
2300 local_bh_disable();
2301 st->state = TCP_SEQ_STATE_ESTABLISHED;
2302 rc = established_get_idx(seq, pos);
2303 }
2304
2305 return rc;
2306}
2307
2308static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2309{
2310 struct tcp_iter_state* st = seq->private;
2311 st->state = TCP_SEQ_STATE_LISTENING;
2312 st->num = 0;
2313 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2314}
2315
2316static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2317{
2318 void *rc = NULL;
2319 struct tcp_iter_state* st;
2320
2321 if (v == SEQ_START_TOKEN) {
2322 rc = tcp_get_idx(seq, 0);
2323 goto out;
2324 }
2325 st = seq->private;
2326
2327 switch (st->state) {
2328 case TCP_SEQ_STATE_OPENREQ:
2329 case TCP_SEQ_STATE_LISTENING:
2330 rc = listening_get_next(seq, v);
2331 if (!rc) {
2332 tcp_listen_unlock();
2333 local_bh_disable();
2334 st->state = TCP_SEQ_STATE_ESTABLISHED;
2335 rc = established_get_first(seq);
2336 }
2337 break;
2338 case TCP_SEQ_STATE_ESTABLISHED:
2339 case TCP_SEQ_STATE_TIME_WAIT:
2340 rc = established_get_next(seq, v);
2341 break;
2342 }
2343out:
2344 ++*pos;
2345 return rc;
2346}
2347
2348static void tcp_seq_stop(struct seq_file *seq, void *v)
2349{
2350 struct tcp_iter_state* st = seq->private;
2351
2352 switch (st->state) {
2353 case TCP_SEQ_STATE_OPENREQ:
2354 if (v) {
2355 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2356			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2357		}
2358 case TCP_SEQ_STATE_LISTENING:
2359 if (v != SEQ_START_TOKEN)
2360 tcp_listen_unlock();
2361 break;
2362 case TCP_SEQ_STATE_TIME_WAIT:
2363 case TCP_SEQ_STATE_ESTABLISHED:
2364 if (v)
2365 read_unlock(&tcp_ehash[st->bucket].lock);
2366 local_bh_enable();
2367 break;
2368 }
2369}
2370
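/* seq_file ->open(): allocate the per-walk iterator state and wire up
 * the start/next/show/stop operations for this address family.
 */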
2371static int tcp_seq_open(struct inode *inode, struct file *file)
2372{
2373 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2374 struct seq_file *seq;
2375 struct tcp_iter_state *s;
2376 int rc;
2377
2378 if (unlikely(afinfo == NULL))
2379 return -EINVAL;
2380
2381 s = kmalloc(sizeof(*s), GFP_KERNEL);
2382 if (!s)
2383 return -ENOMEM;
2384 memset(s, 0, sizeof(*s));
2385 s->family = afinfo->family;
2386 s->seq_ops.start = tcp_seq_start;
2387 s->seq_ops.next = tcp_seq_next;
2388 s->seq_ops.show = afinfo->seq_show;
2389 s->seq_ops.stop = tcp_seq_stop;
2390
2391 rc = seq_open(file, &s->seq_ops);
2392 if (rc)
2393 goto out_kfree;
2394 seq = file->private_data;
2395 seq->private = s;
2396out:
2397 return rc;
2398out_kfree:
2399 kfree(s);
2400 goto out;
2401}
2402
2403int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2404{
2405 int rc = 0;
2406 struct proc_dir_entry *p;
2407
2408 if (!afinfo)
2409 return -EINVAL;
2410 afinfo->seq_fops->owner = afinfo->owner;
2411 afinfo->seq_fops->open = tcp_seq_open;
2412 afinfo->seq_fops->read = seq_read;
2413 afinfo->seq_fops->llseek = seq_lseek;
2414 afinfo->seq_fops->release = seq_release_private;
2415
2416 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2417 if (p)
2418 p->data = afinfo;
2419 else
2420 rc = -ENOMEM;
2421 return rc;
2422}
2423
2424void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2425{
2426 if (!afinfo)
2427 return;
2428 proc_net_remove(afinfo->name);
2429 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2430}
2431
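/* Format one pending connection request (SYN_RECV) as a line of
 * /proc/net/tcp output.
 */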
2432static void get_openreq4(struct sock *sk, struct request_sock *req,
2433			 char *tmpbuf, int i, int uid)
2434{
2435	const struct inet_request_sock *ireq = inet_rsk(req);
2436	int ttd = req->expires - jiffies;
2437
2438 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2439 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2440 i,
2441		ireq->loc_addr,
2442		ntohs(inet_sk(sk)->sport),
2443		ireq->rmt_addr,
2444 ntohs(ireq->rmt_port),
2445		TCP_SYN_RECV,
2446 0, 0, /* could print option size, but that is af dependent. */
2447 1, /* timers active (only the expire timer) */
2448 jiffies_to_clock_t(ttd),
2449 req->retrans,
2450 uid,
2451 0, /* non standard timer */
2452 0, /* open_requests have no inode */
2453 atomic_read(&sk->sk_refcnt),
2454 req);
2455}
2456
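/* Format one established or listening socket as a line of
 * /proc/net/tcp output, including which timer (if any) is pending.
 */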
2457static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2458{
2459 int timer_active;
2460 unsigned long timer_expires;
2461 struct tcp_sock *tp = tcp_sk(sp);
2462 struct inet_sock *inet = inet_sk(sp);
2463 unsigned int dest = inet->daddr;
2464 unsigned int src = inet->rcv_saddr;
2465 __u16 destp = ntohs(inet->dport);
2466 __u16 srcp = ntohs(inet->sport);
2467
2468 if (tp->pending == TCP_TIME_RETRANS) {
2469 timer_active = 1;
2470 timer_expires = tp->timeout;
2471 } else if (tp->pending == TCP_TIME_PROBE0) {
2472 timer_active = 4;
2473 timer_expires = tp->timeout;
2474 } else if (timer_pending(&sp->sk_timer)) {
2475 timer_active = 2;
2476 timer_expires = sp->sk_timer.expires;
2477 } else {
2478 timer_active = 0;
2479 timer_expires = jiffies;
2480 }
2481
2482 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2483 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2484 i, src, srcp, dest, destp, sp->sk_state,
2485 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2486 timer_active,
2487 jiffies_to_clock_t(timer_expires - jiffies),
2488 tp->retransmits,
2489 sock_i_uid(sp),
2490 tp->probes_out,
2491 sock_i_ino(sp),
2492 atomic_read(&sp->sk_refcnt), sp,
2493 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2494 tp->snd_cwnd,
2495 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2496}
2497
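/* Format one TIME_WAIT bucket as a line of /proc/net/tcp output. */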
2498static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2499{
2500 unsigned int dest, src;
2501 __u16 destp, srcp;
2502 int ttd = tw->tw_ttd - jiffies;
2503
2504 if (ttd < 0)
2505 ttd = 0;
2506
2507 dest = tw->tw_daddr;
2508 src = tw->tw_rcv_saddr;
2509 destp = ntohs(tw->tw_dport);
2510 srcp = ntohs(tw->tw_sport);
2511
2512 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2513 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2514 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2515 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2516 atomic_read(&tw->tw_refcnt), tw);
2517}
2518
2519#define TMPSZ 150
2520
2521static int tcp4_seq_show(struct seq_file *seq, void *v)
2522{
2523 struct tcp_iter_state* st;
2524 char tmpbuf[TMPSZ + 1];
2525
2526 if (v == SEQ_START_TOKEN) {
2527 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2528 " sl local_address rem_address st tx_queue "
2529 "rx_queue tr tm->when retrnsmt uid timeout "
2530 "inode");
2531 goto out;
2532 }
2533 st = seq->private;
2534
2535 switch (st->state) {
2536 case TCP_SEQ_STATE_LISTENING:
2537 case TCP_SEQ_STATE_ESTABLISHED:
2538 get_tcp4_sock(v, tmpbuf, st->num);
2539 break;
2540 case TCP_SEQ_STATE_OPENREQ:
2541 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2542 break;
2543 case TCP_SEQ_STATE_TIME_WAIT:
2544 get_timewait4_sock(v, tmpbuf, st->num);
2545 break;
2546 }
2547 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2548out:
2549 return 0;
2550}
2551
2552static struct file_operations tcp4_seq_fops;
2553static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2554 .owner = THIS_MODULE,
2555 .name = "tcp",
2556 .family = AF_INET,
2557 .seq_show = tcp4_seq_show,
2558 .seq_fops = &tcp4_seq_fops,
2559};
2560
2561int __init tcp4_proc_init(void)
2562{
2563 return tcp_proc_register(&tcp4_seq_afinfo);
2564}
2565
2566void tcp4_proc_exit(void)
2567{
2568 tcp_proc_unregister(&tcp4_seq_afinfo);
2569}
2570#endif /* CONFIG_PROC_FS */
2571
2572struct proto tcp_prot = {
2573 .name = "TCP",
2574 .owner = THIS_MODULE,
2575 .close = tcp_close,
2576 .connect = tcp_v4_connect,
2577 .disconnect = tcp_disconnect,
2578 .accept = tcp_accept,
2579 .ioctl = tcp_ioctl,
2580 .init = tcp_v4_init_sock,
2581 .destroy = tcp_v4_destroy_sock,
2582 .shutdown = tcp_shutdown,
2583 .setsockopt = tcp_setsockopt,
2584 .getsockopt = tcp_getsockopt,
2585 .sendmsg = tcp_sendmsg,
2586 .recvmsg = tcp_recvmsg,
2587 .backlog_rcv = tcp_v4_do_rcv,
2588 .hash = tcp_v4_hash,
2589 .unhash = tcp_unhash,
2590 .get_port = tcp_v4_get_port,
2591 .enter_memory_pressure = tcp_enter_memory_pressure,
2592 .sockets_allocated = &tcp_sockets_allocated,
2593 .memory_allocated = &tcp_memory_allocated,
2594 .memory_pressure = &tcp_memory_pressure,
2595 .sysctl_mem = sysctl_tcp_mem,
2596 .sysctl_wmem = sysctl_tcp_wmem,
2597 .sysctl_rmem = sysctl_tcp_rmem,
2598 .max_header = MAX_TCP_HEADER,
2599 .obj_size = sizeof(struct tcp_sock),
2600	.rsk_prot		= &tcp_request_sock_ops,
2601};
2602
2603
2604
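/* Boot-time initialisation: create the kernel-internal TCP control
 * socket and take it out of the hashes so it never matches incoming
 * packets.
 */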
2605void __init tcp_v4_init(struct net_proto_family *ops)
2606{
2607 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2608 if (err < 0)
2609 panic("Failed to create the TCP control socket.\n");
2610 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2611 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2612
2613	/* Unhash it so that IP input processing does not even
2614	 * see it; we do not want this socket to receive any
2615	 * incoming packets.
2616	 */
2617 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2618}
2619
2620EXPORT_SYMBOL(ipv4_specific);
2621EXPORT_SYMBOL(tcp_bind_hash);
2622EXPORT_SYMBOL(tcp_bucket_create);
2623EXPORT_SYMBOL(tcp_hashinfo);
2624EXPORT_SYMBOL(tcp_inherit_port);
2625EXPORT_SYMBOL(tcp_listen_wlock);
2626EXPORT_SYMBOL(tcp_port_rover);
2627EXPORT_SYMBOL(tcp_prot);
2628EXPORT_SYMBOL(tcp_put_port);
2629EXPORT_SYMBOL(tcp_unhash);
2630EXPORT_SYMBOL(tcp_v4_conn_request);
2631EXPORT_SYMBOL(tcp_v4_connect);
2632EXPORT_SYMBOL(tcp_v4_do_rcv);
2633EXPORT_SYMBOL(tcp_v4_rebuild_header);
2634EXPORT_SYMBOL(tcp_v4_remember_stamp);
2635EXPORT_SYMBOL(tcp_v4_send_check);
2636EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2637
2638#ifdef CONFIG_PROC_FS
2639EXPORT_SYMBOL(tcp_proc_register);
2640EXPORT_SYMBOL(tcp_proc_unregister);
2641#endif
2642EXPORT_SYMBOL(sysctl_local_port_range);
2643EXPORT_SYMBOL(sysctl_tcp_low_latency);
2644EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2645