From 8d987e5c75107ca7515fa19e857cfa24aab6ec8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Nov 2010 23:24:26 +0000 Subject: net: avoid limits overflow Robin Holt tried to boot a 16TB machine and found some limits were reached: sysctl_tcp_mem[2], sysctl_udp_mem[2]. We can switch infrastructure to use "long" instead of "int", now that atomic_long_t primitives are available for free. Signed-off-by: Eric Dumazet Reported-by: Robin Holt Reviewed-by: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3357f69e353d..6d8ab1c4efc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk) int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); - if (sk->sk_sndbuf < 3 * sndmem) - sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); + if (sk->sk_sndbuf < 3 * sndmem) { + sk->sk_sndbuf = 3 * sndmem; + if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) + sk->sk_sndbuf = sysctl_tcp_wmem[2]; + } } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); } @@ -4861,7 +4864,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk) return 0; /* If we are under soft global TCP memory pressure, do not expand. */ - if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) return 0; /* If we filled the congestion window, do not expand. 
*/ -- cgit v1.2.3 From defb3519a64141608725e2dac5a5aa9a3c644bae Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 8 Dec 2010 21:16:57 -0800 Subject: net: Abstract away all dst_entry metrics accesses. Use helper functions to hide all direct accesses, especially writes, to dst_entry metrics values. This will allow us to: 1) More easily change how the metrics are stored. 2) Implement COW for metrics. In particular this will help us put metrics into the inetpeer cache if that is what we end up doing. We can make the _metrics member a pointer instead of an array, initially have it point at the read-only metrics in the FIB, and then on the first set grab an inetpeer entry and point the _metrics member there. Signed-off-by: David S. Miller Acked-by: Eric Dumazet --- net/ipv4/tcp_input.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6d8ab1c4efc3..824e8c8a17ad 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -734,7 +734,7 @@ void tcp_update_metrics(struct sock *sk) * Reset our results. */ if (!(dst_metric_locked(dst, RTAX_RTT))) - dst->metrics[RTAX_RTT - 1] = 0; + dst_metric_set(dst, RTAX_RTT, 0); return; } @@ -776,34 +776,38 @@ void tcp_update_metrics(struct sock *sk) if (dst_metric(dst, RTAX_SSTHRESH) && !dst_metric_locked(dst, RTAX_SSTHRESH) && (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) - dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; + dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); if (!dst_metric_locked(dst, RTAX_CWND) && tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) - dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; + dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); } else if (tp->snd_cwnd > tp->snd_ssthresh && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. 
*/ if (!dst_metric_locked(dst, RTAX_SSTHRESH)) - dst->metrics[RTAX_SSTHRESH-1] = - max(tp->snd_cwnd >> 1, tp->snd_ssthresh); + dst_metric_set(dst, RTAX_SSTHRESH, + max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); if (!dst_metric_locked(dst, RTAX_CWND)) - dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; + dst_metric_set(dst, RTAX_CWND, + (dst_metric(dst, RTAX_CWND) + + tp->snd_cwnd) >> 1); } else { /* Else slow start did not finish, cwnd is non-sense, ssthresh may be also invalid. */ if (!dst_metric_locked(dst, RTAX_CWND)) - dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; + dst_metric_set(dst, RTAX_CWND, + (dst_metric(dst, RTAX_CWND) + + tp->snd_ssthresh) >> 1); if (dst_metric(dst, RTAX_SSTHRESH) && !dst_metric_locked(dst, RTAX_SSTHRESH) && tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) - dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; + dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); } if (!dst_metric_locked(dst, RTAX_REORDERING)) { if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && tp->reordering != sysctl_tcp_reordering) - dst->metrics[RTAX_REORDERING-1] = tp->reordering; + dst_metric_set(dst, RTAX_REORDERING, tp->reordering); } } } -- cgit v1.2.3 From d9f4fbaf7053af43e6c72909c2aff18654717aed Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 22 Dec 2010 23:23:38 +0000 Subject: tcp: cleanup of cwnd initialization in tcp_init_metrics() Commit 86bcebafc5e7f5 ("tcp: fix >2 iw selection") fixed a case where congestion window initialization was mistakenly omitted, by introducing a cwnd label and a backwards goto from the end of the function. This makes the code unnecessarily tricky to read and understand at first sight. Shuffle the code around a little bit to make it more obvious. Signed-off-by: Jiri Kosina Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 824e8c8a17ad..2549b29b062d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -916,25 +916,20 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); } tcp_set_rto(sk); - if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) - goto reset; - -cwnd: - tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_time_stamp; - return; - + if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) { reset: - /* Play conservative. If timestamps are not - * supported, TCP will fail to recalculate correct - * rtt, if initial rto is too small. FORGET ALL AND RESET! - */ - if (!tp->rx_opt.saw_tstamp && tp->srtt) { - tp->srtt = 0; - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + /* Play conservative. If timestamps are not + * supported, TCP will fail to recalculate correct + * rtt, if initial rto is too small. FORGET ALL AND RESET! + */ + if (!tp->rx_opt.saw_tstamp && tp->srtt) { + tp->srtt = 0; + tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + } } - goto cwnd; + tp->snd_cwnd = tcp_init_cwnd(tp, dst); + tp->snd_cwnd_stamp = tcp_time_stamp; } static void tcp_update_reordering(struct sock *sk, const int metric, -- cgit v1.2.3 From 44f5324b5d13ef2187729d949eca442689627f39 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Tue, 25 Jan 2011 13:46:30 -0800 Subject: TCP: fix a bug that triggers large number of TCP RST by mistake This patch fixes a bug that causes TCP RST packets to be generated for otherwise well-behaved applications, e.g., ones with no unread data on close, etc. To trigger the bug, at least two conditions must be met: 1. 
The FIN flag is set on the last data packet, i.e., it's not on a separate, FIN-only packet. 2. The size of the last data chunk on the receive side exactly matches the size of the buffer posted by the receiver, and the receiver closes the socket without any further read attempt. This bug was first noticed on our netperf-based testbed for our IW10 proposal to the IETF, where a large number of RST packets were observed. netperf's read side code meets condition 2 above 100% of the time. Before the fix, tcp_data_queue() would queue the last skb that meets condition 1 to sk_receive_queue even though it has fully copied out the data (via skb_copy_datagram_iovec()). Then if condition 2 is also met, tcp_recvmsg() often returns all the copied-out data successfully without actually consuming the skb, due to a check "if ((chunk = len - tp->ucopy.len) != 0) {" and "len -= chunk;" after tcp_prequeue_process() that causes "len" to become 0 and an early exit from the big while loop. I don't see any reason not to free the skb whose data have been fully consumed in tcp_data_queue(), regardless of the FIN flag. We won't get there if MSG_PEEK is on. Am I missing some arcane cases related to urgent data? Signed-off-by: H.K. Jerry Chu Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2549b29b062d..eb7f82ebf4a3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4399,7 +4399,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { tp->ucopy.len -= chunk; tp->copied_seq += chunk; - eaten = (chunk == skb->len && !th->fin); + eaten = (chunk == skb->len); tcp_rcv_space_adjust(sk); } local_bh_disable(); -- cgit v1.2.3