From 9cfaa8def1c795a512bc04f2aec333b03724ca2e Mon Sep 17 00:00:00 2001
From: Xufeng Zhang <xufeng.zhang@windriver.com>
Date: Tue, 21 Jun 2011 10:43:40 +0000
Subject: udp/recvmsg: Clear MSG_TRUNC flag when starting over for a new packet

Consider this scenario: When the size of the first received udp packet
is bigger than the receive buffer, MSG_TRUNC bit is set in msg->msg_flags.
However, if checksum error happens and this is a blocking socket, it will
goto try_again loop to receive the next packet.  But if the size of the
next udp packet is smaller than receive buffer, MSG_TRUNC flag should not
be set, but because MSG_TRUNC bit is not cleared in msg->msg_flags before
receive the next packet, MSG_TRUNC is still set, which is wrong.

Fix this problem by clearing MSG_TRUNC flag when starting over for a
new packet.

Signed-off-by: Xufeng Zhang <xufeng.zhang@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index abca870d8ff6..48cd88e62553 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1249,6 +1249,9 @@ csum_copy_err:
 
 	if (noblock)
 		return -EAGAIN;
+
+	/* starting over for a new packet */
+	msg->msg_flags &= ~MSG_TRUNC;
 	goto try_again;
 }
 
-- 
cgit v1.2.3


From 33f99dc7fd948bbc808a24a0989c167f8973b643 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 22 Jun 2011 01:04:37 +0000
Subject: ipv4: Fix packet size calculation in __ip_append_data

Git commit 59104f06 (ip: take care of last fragment in ip_append_data)
added a check to see if we exceed the mtu when we add trailer_len.
However, the mtu is already subtracted by the trailer length when the
xfrm transfomation bundles are set up. So IPsec packets with mtu
size get fragmented, or if the DF bit is set the packets will not
be send even though they match the mtu perfectly fine. This patch
actually reverts commit 59104f06.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a8024eaa0e87..6b894d430e61 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -888,12 +888,9 @@ alloc_new_skb:
 			 * because we have no idea what fragment will be
 			 * the last.
 			 */
-			if (datalen == length + fraggap) {
+			if (datalen == length + fraggap)
 				alloclen += rt->dst.trailer_len;
-				/* make sure mtu is not reached */
-				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
-					datalen -= ALIGN(rt->dst.trailer_len, 8);
-			}
+
 			if (transhdrlen) {
 				skb = sock_alloc_send_skb(sk,
 						alloclen + hh_len + 15,
-- 
cgit v1.2.3


From 353e5c9abd900de3d1a40925386ffe4abf76111e Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 22 Jun 2011 01:05:37 +0000
Subject: ipv4: Fix IPsec slowpath fragmentation problem

ip_append_data() builds packets based on the mtu from dst_mtu(rt->dst.path).
On IPsec the effective mtu is lower because we need to add the protocol
headers and trailers later when we do the IPsec transformations. So after
the IPsec transformations the packet might be too big, which leads to a
slowpath fragmentation then. This patch fixes this by building the packets
based on the lower IPsec mtu from dst_mtu(&rt->dst) and adapts the exthdr
handling to this.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6b894d430e61..4a7e16b5d3f3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -802,8 +802,6 @@ static int __ip_append_data(struct sock *sk,
 	skb = skb_peek_tail(queue);
 
 	exthdrlen = !skb ? rt->dst.header_len : 0;
-	length += exthdrlen;
-	transhdrlen += exthdrlen;
 	mtu = cork->fragsize;
 
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -883,6 +881,8 @@ alloc_new_skb:
 			else
 				alloclen = fraglen;
 
+			alloclen += exthdrlen;
+
 			/* The last fragment gets additional space at tail.
 			 * Note, with MSG_MORE we overallocate on fragments,
 			 * because we have no idea what fragment will be
@@ -923,11 +923,11 @@ alloc_new_skb:
 			/*
 			 *	Find where to start putting bytes.
 			 */
-			data = skb_put(skb, fraglen);
+			data = skb_put(skb, fraglen + exthdrlen);
 			skb_set_network_header(skb, exthdrlen);
 			skb->transport_header = (skb->network_header +
 						 fragheaderlen);
-			data += fragheaderlen;
+			data += fragheaderlen + exthdrlen;
 
 			if (fraggap) {
 				skb->csum = skb_copy_and_csum_bits(
@@ -1061,7 +1061,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	 */
 	*rtp = NULL;
 	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
-			 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->tx_flags = ipc->tx_flags;
-- 
cgit v1.2.3


From ed6e4ef836d425bc35e33bf20fcec95e68203afa Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sat, 18 Jun 2011 07:53:59 +0000
Subject: netfilter: Fix ip_route_me_harder triggering ip_rt_bug

	Avoid creating input routes with ip_route_me_harder.
It does not work for locally generated packets. Instead,
restrict sockets to provide valid saddr for output route (or
unicast saddr for transparent proxy). For other traffic
allow saddr to be unicast or local but if callers forget
to check saddr type use 0 for the output route.

	The resulting handling should be:

- REJECT TCP:
	- in INPUT we can provide addr_type = RTN_LOCAL but
	better allow rejecting traffic delivered with
	local route (no IP address => use RTN_UNSPEC to
	allow also RTN_UNICAST).
	- FORWARD: RTN_UNSPEC => allow RTN_LOCAL/RTN_UNICAST
	saddr, add fix to ignore RTN_BROADCAST and RTN_MULTICAST
	- OUTPUT: RTN_UNSPEC

- NAT, mangle, ip_queue, nf_ip_reroute: RTN_UNSPEC in LOCAL_OUT

- IPVS:
	- use RTN_LOCAL in LOCAL_OUT and FORWARD after SNAT
	to restrict saddr to be local

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter.c            | 60 +++++++++++++++--------------------------
 net/ipv4/netfilter/ipt_REJECT.c | 14 +++-------
 2 files changed, 26 insertions(+), 48 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 4614babdc45f..2e97e3ec1eb7 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -17,51 +17,35 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 	struct flowi4 fl4 = {};
-	unsigned long orefdst;
+	__be32 saddr = iph->saddr;
+	__u8 flags = 0;
 	unsigned int hh_len;
-	unsigned int type;
 
-	type = inet_addr_type(net, iph->saddr);
-	if (skb->sk && inet_sk(skb->sk)->transparent)
-		type = RTN_LOCAL;
-	if (addr_type == RTN_UNSPEC)
-		addr_type = type;
+	if (!skb->sk && addr_type != RTN_LOCAL) {
+		if (addr_type == RTN_UNSPEC)
+			addr_type = inet_addr_type(net, saddr);
+		if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
+			flags |= FLOWI_FLAG_ANYSRC;
+		else
+			saddr = 0;
+	}
 
 	/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
 	 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
 	 */
-	if (addr_type == RTN_LOCAL) {
-		fl4.daddr = iph->daddr;
-		if (type == RTN_LOCAL)
-			fl4.saddr = iph->saddr;
-		fl4.flowi4_tos = RT_TOS(iph->tos);
-		fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
-		fl4.flowi4_mark = skb->mark;
-		fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
-		rt = ip_route_output_key(net, &fl4);
-		if (IS_ERR(rt))
-			return -1;
-
-		/* Drop old route. */
-		skb_dst_drop(skb);
-		skb_dst_set(skb, &rt->dst);
-	} else {
-		/* non-local src, find valid iif to satisfy
-		 * rp-filter when calling ip_route_input. */
-		fl4.daddr = iph->saddr;
-		rt = ip_route_output_key(net, &fl4);
-		if (IS_ERR(rt))
-			return -1;
+	fl4.daddr = iph->daddr;
+	fl4.saddr = saddr;
+	fl4.flowi4_tos = RT_TOS(iph->tos);
+	fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+	fl4.flowi4_mark = skb->mark;
+	fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : flags;
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt))
+		return -1;
 
-		orefdst = skb->_skb_refdst;
-		if (ip_route_input(skb, iph->daddr, iph->saddr,
-				   RT_TOS(iph->tos), rt->dst.dev) != 0) {
-			dst_release(&rt->dst);
-			return -1;
-		}
-		dst_release(&rt->dst);
-		refdst_drop(orefdst);
-	}
+	/* Drop old route. */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
 
 	if (skb_dst(skb)->error)
 		return -1;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 1ff79e557f96..51f13f8ec724 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -40,7 +40,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	struct iphdr *niph;
 	const struct tcphdr *oth;
 	struct tcphdr _otcph, *tcph;
-	unsigned int addr_type;
 
 	/* IP header checks: fragment. */
 	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
@@ -55,6 +54,9 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	if (oth->rst)
 		return;
 
+	if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		return;
+
 	/* Check checksum */
 	if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
 		return;
@@ -101,19 +103,11 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	nskb->csum_start = (unsigned char *)tcph - nskb->head;
 	nskb->csum_offset = offsetof(struct tcphdr, check);
 
-	addr_type = RTN_UNSPEC;
-	if (hook != NF_INET_FORWARD
-#ifdef CONFIG_BRIDGE_NETFILTER
-	    || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED)
-#endif
-	   )
-		addr_type = RTN_LOCAL;
-
 	/* ip_route_me_harder expects skb->dst to be set */
 	skb_dst_set_noref(nskb, skb_dst(oldskb));
 
 	nskb->protocol = htons(ETH_P_IP);
-	if (ip_route_me_harder(nskb, addr_type))
+	if (ip_route_me_harder(nskb, RTN_UNSPEC))
 		goto free_nskb;
 
 	niph->ttl	= ip4_dst_hoplimit(skb_dst(nskb));
-- 
cgit v1.2.3


From c146066ab80267c3305de5dda6a4083f06df9265 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 29 Jun 2011 23:19:32 +0000
Subject: ipv4: Don't use ufo handling on later transformed packets

We might call ip_ufo_append_data() for packets that will be IPsec
transformed later. This function should be used just for real
udp packets. So we check for rt->dst.header_len which is only
nonzero on IPsec handling and call ip_ufo_append_data() just
if rt->dst.header_len is zero.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4a7e16b5d3f3..84f26e8e6c60 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -828,7 +828,7 @@ static int __ip_append_data(struct sock *sk,
 	cork->length += length;
 	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
 	    (sk->sk_protocol == IPPROTO_UDP) &&
-	    (rt->dst.dev->features & NETIF_F_UFO)) {
+	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
 		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
 					 hh_len, fragheaderlen, transhdrlen,
 					 mtu, flags);
-- 
cgit v1.2.3


From b00897b881f775040653955fda99dcf7c167b382 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 29 Jun 2011 23:20:41 +0000
Subject: xfrm4: Don't call icmp_send on local error

Calling icmp_send() on a local message size error leads to
an incorrect update of the path mtu. So use ip_local_error()
instead to notify the socket about the error.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/xfrm4_output.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 2d51840e53a1..327a617d594c 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -32,7 +32,12 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 	dst = skb_dst(skb);
 	mtu = dst_mtu(dst);
 	if (skb->len > mtu) {
-		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+		if (skb->sk)
+			ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr,
+				       inet_sk(skb->sk)->inet_dport, mtu);
+		else
+			icmp_send(skb, ICMP_DEST_UNREACH,
+				  ICMP_FRAG_NEEDED, htonl(mtu));
 		ret = -EMSGSIZE;
 	}
 out:
-- 
cgit v1.2.3


From c349a528cd47e2272ded0ea358363855e86180da Mon Sep 17 00:00:00 2001
From: Marcus Meissner <meissner@novell.com>
Date: Mon, 4 Jul 2011 01:30:29 +0000
Subject: net: bind() fix error return on wrong address family

Hi,

Reinhard Max also pointed out that the error should EAFNOSUPPORT according
to POSIX.

The Linux manpages have it as EINVAL, some other OSes (Minix, HPUX, perhaps BSD) use
EAFNOSUPPORT. Windows uses WSAEFAULT according to MSDN.

Other protocols error values in their af bind() methods in current mainline git as far
as a brief look shows:
	EAFNOSUPPORT: atm, appletalk, l2tp, llc, phonet, rxrpc
	EINVAL: ax25, bluetooth, decnet, econet, ieee802154, iucv, netlink, netrom, packet, rds, rose, unix, x25,
	No check?: can/raw, ipv6/raw, irda, l2tp/l2tp_ip

Ciao, Marcus

Signed-off-by: Marcus Meissner <meissner@suse.de>
Cc: Reinhard Max <max@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eae1f676f870..ef1528af7abf 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -465,8 +465,10 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (addr_len < sizeof(struct sockaddr_in))
 		goto out;
 
-	if (addr->sin_family != AF_INET)
+	if (addr->sin_family != AF_INET) {
+		err = -EAFNOSUPPORT;
 		goto out;
+	}
 
 	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
 
-- 
cgit v1.2.3