From 51b77cae0d5aa8e1546fca855dcfe48ddfadfa9c Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:36:01 -0700 Subject: route: Mark unused route cache flags as such. Also removes an obsolete check for the unused flag RTCF_MASQ. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index df41026b60db..96be336064fb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1792,7 +1792,7 @@ static int __mkroute_input(struct sk_buff *skb, if (err) flags |= RTCF_DIRECTSRC; - if (out_dev == in_dev && err && !(flags & RTCF_MASQ) && + if (out_dev == in_dev && err && (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) flags |= RTCF_DOREDIRECT; -- cgit v1.2.3 From 1f9d11c7c99da706e33646c3a9080dd5a8ef9a0b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:36:27 -0700 Subject: route: Mark unused routing attributes as such Also removes an unused policy entry for an attribute which is only used in kernel->user direction. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0f1557a4ac7a..0b2ac6a3d903 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -506,7 +506,6 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, - [RTA_PROTOINFO] = { .type = NLA_U32 }, [RTA_FLOW] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From ab32cd793dca21eec846a8204390d9594ed994d5 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:37:33 -0700 Subject: route: Remove unused ifa_anycast field The field was supposed to allow the creation of an anycast route by assigning an anycast address to an address prefix. It was never implemented so this field is unused and serves no purpose. Remove it. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6848e4760f34..79a7ef6209ff 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -90,7 +90,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, - [IFA_ANYCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; @@ -536,9 +535,6 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]); - if (tb[IFA_ANYCAST]) - ifa->ifa_anycast = nla_get_be32(tb[IFA_ANYCAST]); - if (tb[IFA_LABEL]) nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else @@ -745,7 +741,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; - ifa->ifa_anycast = 0; ifa->ifa_scope = 0; } @@ -1113,7 +1108,6 @@ static inline size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ - + nla_total_size(4) /* IFA_ANYCAST */ + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ } @@ -1143,9 +1137,6 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, if (ifa->ifa_broadcast) NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast); - if (ifa->ifa_anycast) - NLA_PUT_BE32(skb, IFA_ANYCAST, ifa->ifa_anycast); - if (ifa->ifa_label[0]) NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label); -- cgit v1.2.3 From 8aca6cb1179ed9bef9351028c8d8af852903eae2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 4 Jun 2008 11:34:22 -0700 Subject: tcp: Fix inconsistency source (CA_Open only when !tcp_left_out(tp)) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible that this skip path causes TCP to end up into an invalid state where ca_state was left to CA_Open while some segments already came into sacked_out. If next valid ACK doesn't contain new SACK information TCP fails to enter into tcp_fastretrans_alert(). Thus at least high_seq is set incorrectly to a too high seqno because some new data segments could be sent in between (and also, limited transmit is not being correctly invoked there). Reordering in both directions can easily cause this situation to occur. I guess we would want to use tcp_moderate_cwnd(tp) there as well as it may be possible to use this to trigger oversized burst to network by sending an old ACK with huge amount of SACK info, but I'm a bit unsure about its effects (mainly to FlightSize), so to be on the safe side I just currently fixed it minimally to keep TCP's state consistent (obviously, such nasty ACKs have been possible this far). Though it seems that FlightSize is already underestimated by some amount, so probably on the long term we might want to trigger recovery there too, if appropriate, to make FlightSize calculation to resemble reality at the time when the losses where discovered (but such change scares me too much now and requires some more thinking anyway how to do that as it likely involves some code shuffling). This bug was found by Brian Vowell while running my TCP debug patch to find cause of another TCP issue (fackets_out miscount). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b54d9d37b636..54a0b7412782 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2483,6 +2483,20 @@ static inline void tcp_complete_cwr(struct sock *sk) tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } +static void tcp_try_keep_open(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int state = TCP_CA_Open; + + if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) + state = TCP_CA_Disorder; + + if (inet_csk(sk)->icsk_ca_state != state) { + tcp_set_ca_state(sk, state); + tp->high_seq = tp->snd_nxt; + } +} + static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2496,15 +2510,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { - int state = TCP_CA_Open; - - if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) - state = TCP_CA_Disorder; - - if (inet_csk(sk)->icsk_ca_state != state) { - tcp_set_ca_state(sk, state); - tp->high_seq = tp->snd_nxt; - } + tcp_try_keep_open(sk); tcp_moderate_cwnd(tp); } else { tcp_cwnd_down(sk, flag); @@ -3310,8 +3316,11 @@ no_queue: return 1; old_ack: - if (TCP_SKB_CB(skb)->sacked) + if (TCP_SKB_CB(skb)->sacked) { tcp_sacktag_write_queue(sk, skb, prior_snd_una); + if (icsk->icsk_ca_state == TCP_CA_Open) + tcp_try_keep_open(sk); + } uninteresting_ack: SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt); -- cgit v1.2.3 From baa2bfb8aef24bb7fe1875b256918724b3884662 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Fri, 30 May 2008 11:35:03 +0900 Subject: [IPV4] TUNNEL4: Fix incoming packet length check for inter-protocol tunnel. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/tunnel4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index d3b709a6f264..cb1f0e83830b 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -97,7 +97,7 @@ static int tunnel64_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for (handler = tunnel64_handlers; handler; handler = handler->next) -- cgit v1.2.3 From 36d926b94a9908937593e5669162305a071b9cc3 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 4 Jun 2008 15:49:07 +0400 Subject: [IPV6]: inet_sk(sk)->cork.opt leak IPv6 UDP sockets wth IPv4 mapped address use udp_sendmsg to send the data actually. In this case ip_flush_pending_frames should be called instead of ip6_flush_pending_frames. Signed-off-by: Denis V. Lunev Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/udp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db1cb7c96d63..56fcda3694ba 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -420,7 +420,7 @@ void udp_err(struct sk_buff *skb, u32 info) /* * Throw away all pending data and cancel the corking. Socket is locked. */ -static void udp_flush_pending_frames(struct sock *sk) +void udp_flush_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); @@ -430,6 +430,7 @@ static void udp_flush_pending_frames(struct sock *sk) ip_flush_pending_frames(sk); } } +EXPORT_SYMBOL(udp_flush_pending_frames); /** * udp4_hwcsum_outgoing - handle outgoing HW checksumming -- cgit v1.2.3 From a6604471db5e7a33474a7f16c64d6b118fae3e74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 4 Jun 2008 12:07:44 -0700 Subject: tcp: fix skb vs fack_count out-of-sync condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This bug is able to corrupt fackets_out in very rare cases. In order for this to cause corruption: 1) DSACK in the middle of previous SACK block must be generated. 2) In order to take that particular branch, part or all of the DSACKed segment must already be SACKed so that we have that in cache in the first place. 3) The new info must be top enough so that fackets_out will be updated on this iteration. ...then fack_count is updated while skb wasn't, then we walk again that particular segment thus updating fack_count twice for a single skb and finally that value is assigned to fackets_out by tcp_sacktag_one. It is safe to call tcp_sacktag_one just once for a segment (at DSACK), no need to call again for plain SACK. Potential problem of the miscount are limited to premature entry to recovery and to inflated reordering metric (which could even cancel each other out in the most the luckiest scenarios :-)). Both are quite insignificant in worst case too and there exists also code to reset them (fackets_out once sacked_out becomes zero and reordering metric on RTO). This has been reported by a number of people, because it occurred quite rarely, it has been very evasive. Andy Furniss was able to get it to occur couple of times so that a bit more info was collected about the problem using a debug patch, though it still required lot of checking around. Thanks also to others who have tried to help here. This is listed as Bugzilla #10346. The bug was introduced by me in commit 68f8353b48 ([TCP]: Rewrite SACK block processing & sack_recv_cache use), I probably thought back then that there's need to scan that entry twice or didn't dare to make it go through it just once there. Going through twice would have required restoring fack_count after the walk but as noted above, I chose to drop the additional walk step altogether here. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 54a0b7412782..eba873e9b560 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1392,9 +1392,9 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, if (before(next_dup->start_seq, skip_to_seq)) { skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count); - tcp_sacktag_walk(skb, sk, NULL, - next_dup->start_seq, next_dup->end_seq, - 1, fack_count, reord, flag); + skb = tcp_sacktag_walk(skb, sk, NULL, + next_dup->start_seq, next_dup->end_seq, + 1, fack_count, reord, flag); } return skb; -- cgit v1.2.3 From 22dd485022f3d0b162ceb5e67d85de7c3806aa20 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 4 Jun 2008 15:16:12 -0700 Subject: raw: Raw socket leak. The program below just leaks the raw kernel socket int main() { int fd = socket(PF_INET, SOCK_RAW, IPPROTO_UDP); struct sockaddr_in addr; memset(&addr, 0, sizeof(addr)); inet_aton("127.0.0.1", &addr.sin_addr); addr.sin_family = AF_INET; addr.sin_port = htons(2048); sendto(fd, "a", 1, MSG_MORE, &addr, sizeof(addr)); return 0; } Corked packet is allocated via sock_wmalloc which holds the owner socket, so one should uncork it and flush all pending data on close. Do this in the same way as in UDP. Signed-off-by: Denis V. Lunev Acked-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/ipv4/raw.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fead049daf43..e7e091d365ff 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -608,6 +608,14 @@ static void raw_close(struct sock *sk, long timeout) sk_common_release(sk); } +static int raw_destroy(struct sock *sk) +{ + lock_sock(sk); + ip_flush_pending_frames(sk); + release_sock(sk); + return 0; +} + /* This gets rid of all the nasties in af_inet. -DaveM */ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -820,6 +828,7 @@ struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, .close = raw_close, + .destroy = raw_destroy, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = raw_ioctl, -- cgit v1.2.3 From 26af65cbeb2467a486ae4fc7242c94e470c67c50 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Wed, 4 Jun 2008 15:19:35 -0700 Subject: tcp: Increment OUTRSTS in tcp_send_active_reset() TCP "resets sent" counter is not incremented when a TCP Reset is sent via tcp_send_active_reset(). Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e399bde7813a..ad993ecb4810 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2131,6 +2131,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + + TCP_INC_STATS(TCP_MIB_OUTRSTS); } /* WARNING: This routine must only be called when we have already sent -- cgit v1.2.3 From 293ad60401da621b8b329abbe8c388edb25f658a Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 4 Jun 2008 15:45:58 -0700 Subject: tcp: Fix for race due to temporary drop of the socket lock in skb_splice_bits. skb_splice_bits temporary drops the socket lock while iterating over the socket queue in order to break a reverse locking condition which happens with sendfile. This, however, opens a window of opportunity for tcp_collapse() to aggregate skbs and thus potentially free the current skb used in skb_splice_bits and tcp_read_sock. This patch fixes the problem by (re-)getting the same "logical skb" after the lock has been temporary dropped. Based on idea and initial patch from Evgeniy Polyakov. Signed-off-by: Octavian Purdila Acked-by: Evgeniy Polyakov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f88653138621..ab66683b8043 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1227,7 +1227,14 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, copied += used; offset += used; } - if (offset != skb->len) + /* + * If recv_actor drops the lock (e.g. TCP splice + * receive) the skb pointer might be invalid when + * getting here: tcp_collapse might have deleted it + * while aggregating skbs from the socket queue. + */ + skb = tcp_recv_skb(sk, seq-1, &offset); + if (!skb || (offset+1 != skb->len)) break; } if (tcp_hdr(skb)->fin) { -- cgit v1.2.3 From ddb2c43594f22843e9f3153da151deaba1a834c5 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Wed, 4 Jun 2008 09:16:33 -0700 Subject: asn1: additional sanity checking during BER decoding - Don't trust a length which is greater than the working buffer. An invalid length could cause overflow when calculating buffer size for decoding oid. - An oid length of zero is invalid and allows for an off-by-one error when decoding oid because the first subid actually encodes first 2 subids. - A primitive encoding may not have an indefinite length. Thanks to Wei Wang from McAfee for report. Cc: Steven French Cc: stable@kernel.org Acked-by: Patrick McHardy Signed-off-by: Chris Wright Signed-off-by: Linus Torvalds --- net/ipv4/netfilter/nf_nat_snmp_basic.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 5daefad3d193..7750c97fde7b 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -232,6 +232,11 @@ static unsigned char asn1_length_decode(struct asn1_ctx *ctx, } } } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + return 1; } @@ -250,6 +255,10 @@ static unsigned char asn1_header_decode(struct asn1_ctx *ctx, if (!asn1_length_decode(ctx, &def, &len)) return 0; + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + if (def) *eoc = ctx->pointer + len; else @@ -434,6 +443,11 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, unsigned long *optr; size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) + return 0; + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); if (*oid == NULL) { if (net_ratelimit()) -- cgit v1.2.3 From ce4a7d0d48bbaed78ccbb0bafb9229651a40303a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Jun 2008 12:39:35 -0700 Subject: inet{6}_request_sock: Init ->opt and ->pktopts in the constructor Wei Yongjun noticed that we may call reqsk_free on request sock objects where the opt fields may not be initialized, fix it by introducing inet_reqsk_alloc where we initialize ->opt to NULL and set ->pktopts to NULL in inet6_reqsk_alloc. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 3 +-- net/ipv4/tcp_ipv4.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 73ba98921d64..d182a2a26291 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -285,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, cookie_check_timestamp(&tcp_opt); ret = NULL; - req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ if (!req) goto out; @@ -301,7 +301,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->rmt_port = th->source; ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; - ireq->opt = NULL; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->rcv_wscale = tcp_opt.rcv_wscale; ireq->sack_ok = tcp_opt.sack_ok; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cd601a866c2f..4f8485c67d1a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1285,7 +1285,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = reqsk_alloc(&tcp_request_sock_ops); + req = inet_reqsk_alloc(&tcp_request_sock_ops); if (!req) goto drop; -- cgit v1.2.3 From 709772e6e06564ed94ba740de70185ac3d792773 Mon Sep 17 00:00:00 2001 From: Krzysztof Piotr Oledzki Date: Tue, 10 Jun 2008 15:44:49 -0700 Subject: net: Fix routing tables with id > 255 for legacy software Most legacy software do not like tables > 255 as rtm_table is u8 so tb_id is sent &0xff and it is possible to mismatch for example table 510 with table 254 (main). This patch introduces RT_TABLE_COMPAT=252 so the code uses it if tb_id > 255. It makes such old applications happy, new ones are still able to use RTA_TABLE to get a proper table id. Signed-off-by: Krzysztof Piotr Oledzki Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3b83c34019fc..0d4d72827e4b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -960,7 +960,10 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, rtm->rtm_dst_len = dst_len; rtm->rtm_src_len = 0; rtm->rtm_tos = tos; - rtm->rtm_table = tb_id; + if (tb_id < 256) + rtm->rtm_table = tb_id; + else + rtm->rtm_table = RT_TABLE_COMPAT; NLA_PUT_U32(skb, RTA_TABLE, tb_id); rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; -- cgit v1.2.3 From ec0a196626bd12e0ba108d7daa6d95a4fb25c2c5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 12 Jun 2008 16:31:35 -0700 Subject: tcp: Revert 'process defer accept as established' changes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts two changesets, ec3c0982a2dd1e671bad8e9d26c28dcba0039d87 ("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and the follow-on bug fix 9ae27e0adbf471c7a6b80102e38e1d5a346b3b38 ("tcp: Fix slab corruption with ipv6 and tcp6fuzz"). This change causes several problems, first reported by Ingo Molnar as a distcc-over-loopback regression where connections were getting stuck. Ilpo Järvinen first spotted the locking problems. The new function added by this code, tcp_defer_accept_check(), only has the child socket locked, yet it is modifying state of the parent listening socket. Fixing that is non-trivial at best, because we can't simply just grab the parent listening socket lock at this point, because it would create an ABBA deadlock. The normal ordering is parent listening socket --> child socket, but this code path would require the reverse lock ordering. Next is a problem noticed by Vitaliy Gusev, he noted: ---------------------------------------- >--- a/net/ipv4/tcp_timer.c >+++ b/net/ipv4/tcp_timer.c >@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data) > goto death; > } > >+ if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) { >+ tcp_send_active_reset(sk, GFP_ATOMIC); >+ goto death; Here socket sk is not attached to listening socket's request queue. tcp_done() will not call inet_csk_destroy_sock() (and tcp_v4_destroy_sock() which should release this sk) as socket is not DEAD. Therefore socket sk will be lost for freeing. ---------------------------------------- Finally, Alexey Kuznetsov argues that there might not even be any real value or advantage to these new semantics even if we fix all of the bugs: ---------------------------------------- Hiding from accept() sockets with only out-of-order data only is the only thing which is impossible with old approach. Is this really so valuable? My opinion: no, this is nothing but a new loophole to consume memory without control. ---------------------------------------- So revert this thing for now. Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 11 +++++++--- net/ipv4/tcp.c | 18 ++++++++++------- net/ipv4/tcp_input.c | 45 ----------------------------------------- net/ipv4/tcp_ipv4.c | 8 -------- net/ipv4/tcp_minisocks.c | 32 +++++++++++------------------ net/ipv4/tcp_timer.c | 5 ----- 6 files changed, 31 insertions(+), 88 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 828ea211ff21..045e799d3e1d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -419,7 +419,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, struct inet_connection_sock *icsk = inet_csk(parent); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct listen_sock *lopt = queue->listen_opt; - int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; unsigned long now = jiffies; struct request_sock **reqp, *req; int i, budget; @@ -455,6 +456,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, } } + if (queue->rskq_defer_accept) + max_retries = queue->rskq_defer_accept; + budget = 2 * (lopt->nr_table_entries / (timeout / interval)); i = lopt->clock_hand; @@ -462,8 +466,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if (req->retrans < thresh && - !req->rsk_ops->rtx_syn_ack(parent, req)) { + if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) && + (inet_rsk(req)->acked || + !req->rsk_ops->rtx_syn_ack(parent, req))) { unsigned long timeo; if (req->retrans++ == 0) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ab66683b8043..fc54a48fde1e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2112,12 +2112,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_DEFER_ACCEPT: - if (val < 0) { - err = -EINVAL; - } else { - if (val > MAX_TCP_ACCEPT_DEFERRED) - val = MAX_TCP_ACCEPT_DEFERRED; - icsk->icsk_accept_queue.rskq_defer_accept = val; + icsk->icsk_accept_queue.rskq_defer_accept = 0; + if (val > 0) { + /* Translate value in seconds to number of + * retransmits */ + while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && + val > ((TCP_TIMEOUT_INIT / HZ) << + icsk->icsk_accept_queue.rskq_defer_accept)) + icsk->icsk_accept_queue.rskq_defer_accept++; + icsk->icsk_accept_queue.rskq_defer_accept++; } break; @@ -2299,7 +2302,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = icsk->icsk_accept_queue.rskq_defer_accept; + val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : + ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eba873e9b560..cad73b7dfef0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4541,49 +4541,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) } } -static int tcp_defer_accept_check(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (tp->defer_tcp_accept.request) { - int queued_data = tp->rcv_nxt - tp->copied_seq; - int hasfin = !skb_queue_empty(&sk->sk_receive_queue) ? - tcp_hdr((struct sk_buff *) - sk->sk_receive_queue.prev)->fin : 0; - - if (queued_data && hasfin) - queued_data--; - - if (queued_data && - tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) { - if (sock_flag(sk, SOCK_KEEPOPEN)) { - inet_csk_reset_keepalive_timer(sk, - keepalive_time_when(tp)); - } else { - inet_csk_delete_keepalive_timer(sk); - } - - inet_csk_reqsk_queue_add( - tp->defer_tcp_accept.listen_sk, - tp->defer_tcp_accept.request, - sk); - - tp->defer_tcp_accept.listen_sk->sk_data_ready( - tp->defer_tcp_accept.listen_sk, 0); - - sock_put(tp->defer_tcp_accept.listen_sk); - sock_put(sk); - tp->defer_tcp_accept.listen_sk = NULL; - tp->defer_tcp_accept.request = NULL; - } else if (hasfin || - tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) { - tcp_reset(sk); - return -1; - } - } - return 0; -} - static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) { struct tcp_sock *tp = tcp_sk(sk); @@ -4944,8 +4901,6 @@ step5: tcp_data_snd_check(sk); tcp_ack_snd_check(sk); - - tcp_defer_accept_check(sk); return 0; csum_error: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4f8485c67d1a..97a230026e13 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1918,14 +1918,6 @@ int tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } - if (tp->defer_tcp_accept.request) { - reqsk_free(tp->defer_tcp_accept.request); - sock_put(tp->defer_tcp_accept.listen_sk); - sock_put(sk); - tp->defer_tcp_accept.listen_sk = NULL; - tp->defer_tcp_accept.request = NULL; - } - atomic_dec(&tcp_sockets_allocated); return 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 019c8c16e5cc..8245247a6ceb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -571,8 +571,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, does sequence test, SYN is truncated, and thus we consider it a bare ACK. - Both ends (listening sockets) accept the new incoming - connection and try to talk to each other. 8-) + If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this + bare ACK. Otherwise, we create an established connection. Both + ends (listening sockets) accept the new incoming connection and try + to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. @@ -640,6 +642,13 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + inet_rsk(req)->acked = 1; + return NULL; + } + /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO @@ -678,24 +687,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); - if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && - TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { - - /* the accept queue handling is done is est recv slow - * path so lets make sure to start there - */ - tcp_sk(child)->pred_flags = 0; - sock_hold(sk); - sock_hold(child); - tcp_sk(child)->defer_tcp_accept.listen_sk = sk; - tcp_sk(child)->defer_tcp_accept.request = req; - - inet_csk_reset_keepalive_timer(child, - inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ); - } else { - inet_csk_reqsk_queue_add(sk, req, child); - } - + inet_csk_reqsk_queue_add(sk, req, child); return child; listen_overflow: diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4de68cf5f2aa..63ed9d6830e7 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -489,11 +489,6 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } - if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) { - tcp_send_active_reset(sk, GFP_ATOMIC); - goto death; - } - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) goto out; -- cgit v1.2.3 From 93653e0448196344d7699ccad395eaebd30359d1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 16 Jun 2008 16:57:40 -0700 Subject: tcp: Revert reset of deferred accept changes in 2.6.26 Ingo's system is still seeing strange behavior, and he reports that is goes away if the rest of the deferred accept changes are reverted too. Therefore this reverts e4c78840284f3f51b1896cf3936d60a6033c4d2c ("[TCP]: TCP_DEFER_ACCEPT updates - dont retxmt synack") and 539fae89bebd16ebeafd57a87169bc56eb530d76 ("[TCP]: TCP_DEFER_ACCEPT updates - defer timeout conflicts with max_thresh"). Just like the other revert, these ideas can be revisited for 2.6.27 Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 045e799d3e1d..ec834480abe7 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -466,9 +466,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) && - (inet_rsk(req)->acked || - !req->rsk_ops->rtx_syn_ack(parent, req))) { + if ((req->retrans < thresh || + (inet_rsk(req)->acked && req->retrans < max_retries)) + && !req->rsk_ops->rtx_syn_ack(parent, req)) { unsigned long timeo; if (req->retrans++ == 0) -- cgit v1.2.3 From 68be802cd5ad040fe8cfa33ce3031405df2d9117 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Jun 2008 17:03:32 -0700 Subject: raw: Restore /proc/net/raw correct behavior I just noticed "cat /proc/net/raw" was buggy, missing '\n' separators. I believe this was introduced by commit 8cd850efa4948d57a2ed836911cfd1ab299e89c6 ([RAW]: Cleanup IPv4 raw_seq_show.) This trivial patch restores correct behavior, and applies to current Linus tree (should also be applied to stable tree as well.) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/raw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e7e091d365ff..37a1ecd9d600 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -934,7 +934,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) srcp = inet->num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", i, src, srcp, dest, destp, sp->sk_state, atomic_read(&sp->sk_wmem_alloc), atomic_read(&sp->sk_rmem_alloc), -- cgit v1.2.3 From a9d246dbb07cf0bd32bbfc5d184ed738bf2af4f8 Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Mon, 16 Jun 2008 17:07:16 -0700 Subject: ipv4: Remove unused definitions in net/ipv4/tcp_ipv4.c. 1) Remove ICMP_MIN_LENGTH, as it is unused. 2) Remove unneeded tcp_v4_send_check() declaration. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 97a230026e13..12695be2c255 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,10 +85,6 @@ int sysctl_tcp_tw_reuse __read_mostly; int sysctl_tcp_low_latency __read_mostly; -/* Check TCP sequence numbers in ICMP packets. */ -#define ICMP_MIN_LENGTH 8 - -void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, -- cgit v1.2.3 From 68b80f11380889996aa7eadba29dbbb5c29a5864 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 17 Jun 2008 15:51:47 -0700 Subject: netfilter: nf_nat: fix RCU races Fix three ct_extend/NAT extension related races: - When cleaning up the extension area and removing it from the bysource hash, the nat->ct pointer must not be set to NULL since it may still be used in a RCU read side - When replacing a NAT extension area in the bysource hash, the nat->ct pointer must be assigned before performing the replacement - When reallocating extension storage in ct_extend, the old memory must not be freed immediately since it may still be used by a RCU read side Possibly fixes https://bugzilla.redhat.com/show_bug.cgi?id=449315 and/or http://bugzilla.kernel.org/show_bug.cgi?id=10875 Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_nat_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 04578593e100..d2a887fc8d9b 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -556,7 +556,6 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) spin_lock_bh(&nf_nat_lock); hlist_del_rcu(&nat->bysource); - nat->ct = NULL; spin_unlock_bh(&nf_nat_lock); } @@ -570,8 +569,8 @@ static void nf_nat_move_storage(void *new, void *old) return; spin_lock_bh(&nf_nat_lock); - hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); new_nat->ct = ct; + hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); spin_unlock_bh(&nf_nat_lock); } -- cgit v1.2.3 From fe833fca2eac6b3d3ad5e35f44ad4638362f1da8 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 17 Jun 2008 16:37:13 -0700 Subject: xfrm: fix fragmentation for ipv4 xfrm tunnel When generating the ip header for the transformed packet we just copy the frag_off field of the ip header from the original packet to the ip header of the new generated packet. If we receive a packet as a chain of fragments, all but the last of the new generated packets have the IP_MF flag set. We have to mask the frag_off field to only keep the IP_DF flag from the original packet. This got lost with git commit 36cf9acf93e8561d9faec24849e57688a81eb9c5 ("[IPSEC]: Separate inner/outer mode processing on output") Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/xfrm4_mode_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 584e6d74e3a9..7135279f3f84 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -52,7 +52,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) IP_ECN_clear(top_iph); top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? - 0 : XFRM_MODE_SKB_CB(skb)->frag_off; + 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); ip_select_ident(top_iph, dst->child, NULL); top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); -- cgit v1.2.3