From dcc997738e538919101d8756f19ca23110b25d8d Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Wed, 14 May 2008 22:33:38 -0700
Subject: net: handle errors from device_rename

device_rename can fail with -EEXIST or -ENOMEM, so handle any
problems.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index a1607bc0cd4c..ce88c0d3e354 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -903,7 +903,11 @@ int dev_change_name(struct net_device *dev, char *newname)
 		strlcpy(dev->name, newname, IFNAMSIZ);
 
 rollback:
-	device_rename(&dev->dev, dev->name);
+	err = device_rename(&dev->dev, dev->name);
+	if (err) {
+		memcpy(dev->name, oldname, IFNAMSIZ);
+		return err;
+	}
 
 	write_lock_bh(&dev_base_lock);
 	hlist_del(&dev->name_hlist);
-- 
cgit v1.2.3


From 34a961f7db36f10abd6b153411fe8c810f21f6b3 Mon Sep 17 00:00:00 2001
From: Abhijeet Kolekar <abhijeet.kolekar@intel.com>
Date: Fri, 9 May 2008 09:35:41 -0700
Subject: mac80211 : Association with 11n hidden ssid ap.

This patch fixes the association problem with 11n hidden ssid ap.
Patch fixes the problem of associating with hidden ssid when
all three parameters ap,essid and channel are given to iwconfig.
This patch removes the condition of checking three parameters
and always checks for bss in bss list while associating.

Signed-off-by: Abhijeet Kolekar <abhijeet.kolekar@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 4adba09e80ca..e470bf12b765 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3446,21 +3446,17 @@ static int ieee80211_sta_config_auth(struct net_device *dev,
 	struct ieee80211_sta_bss *bss, *selected = NULL;
 	int top_rssi = 0, freq;
 
-	if (!(ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL |
-	    IEEE80211_STA_AUTO_BSSID_SEL | IEEE80211_STA_AUTO_CHANNEL_SEL))) {
-		ifsta->state = IEEE80211_AUTHENTICATE;
-		ieee80211_sta_reset_auth(dev, ifsta);
-		return 0;
-	}
-
 	spin_lock_bh(&local->sta_bss_lock);
 	freq = local->oper_channel->center_freq;
 	list_for_each_entry(bss, &local->sta_bss_list, list) {
 		if (!(bss->capability & WLAN_CAPABILITY_ESS))
 			continue;
 
-		if (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^
-		    !!sdata->default_key)
+		if ((ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL |
+			IEEE80211_STA_AUTO_BSSID_SEL |
+			IEEE80211_STA_AUTO_CHANNEL_SEL)) &&
+		    (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^
+		     !!sdata->default_key))
 			continue;
 
 		if (!(ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) &&
-- 
cgit v1.2.3


From 2f561feb386d6adefbad63c59a1fcd298ac6a79c Mon Sep 17 00:00:00 2001
From: Ivo van Doorn <ivdoorn@gmail.com>
Date: Sat, 10 May 2008 13:40:49 +0200
Subject: mac80211: Add RTNL version of ieee80211_iterate_active_interfaces

Since commit e38bad4766a110b61fa6038f10be16ced8c6cc38
	mac80211: make ieee80211_iterate_active_interfaces not need rtnl
rt2500usb and rt73usb broke down due to attempting register access
in atomic context (which is not possible for USB hardware).

This patch restores ieee80211_iterate_active_interfaces() to use RTNL lock,
and provides the non-RTNL version under a new name:
	ieee80211_iterate_active_interfaces_atomic()

So far only rt2x00 uses ieee80211_iterate_active_interfaces(), and those
drivers require the RTNL version of ieee80211_iterate_active_interfaces().
Since they already call that function directly, this patch will automatically
fix the USB rt2x00 drivers.

v2: Rename ieee80211_iterate_active_interfaces_rtnl

Signed-off-by: Ivo van Doorn <IvDoorn@gmail.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/util.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 24a465c4df09..131e9e6c8a32 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -389,6 +389,41 @@ void ieee80211_iterate_active_interfaces(
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_sub_if_data *sdata;
 
+	rtnl_lock();
+
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		switch (sdata->vif.type) {
+		case IEEE80211_IF_TYPE_INVALID:
+		case IEEE80211_IF_TYPE_MNTR:
+		case IEEE80211_IF_TYPE_VLAN:
+			continue;
+		case IEEE80211_IF_TYPE_AP:
+		case IEEE80211_IF_TYPE_STA:
+		case IEEE80211_IF_TYPE_IBSS:
+		case IEEE80211_IF_TYPE_WDS:
+		case IEEE80211_IF_TYPE_MESH_POINT:
+			break;
+		}
+		if (sdata->dev == local->mdev)
+			continue;
+		if (netif_running(sdata->dev))
+			iterator(data, sdata->dev->dev_addr,
+				 &sdata->vif);
+	}
+
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces);
+
+void ieee80211_iterate_active_interfaces_atomic(
+	struct ieee80211_hw *hw,
+	void (*iterator)(void *data, u8 *mac,
+			 struct ieee80211_vif *vif),
+	void *data)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_sub_if_data *sdata;
+
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
@@ -413,4 +448,4 @@ void ieee80211_iterate_active_interfaces(
 
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces);
+EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic);
-- 
cgit v1.2.3


From b4528762ca92261c6ed3f03e76adeb1dc587aacb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 11 May 2008 12:18:51 -0700
Subject: SUNRPC: AUTH_SYS "machine creds" shouldn't use negative valued
 uid/gid

Apparently this causes Solaris 10 servers to refuse our NFSv4 SETCLIENTID
calls. Fall back to root creds for now, since most servers that care are
very likely to have root squashing enabled.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/auth_generic.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index d927d9f57412..744b79fdcb19 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -17,8 +17,8 @@
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
-#define RPC_ANONYMOUS_USERID	((uid_t)-2)
-#define RPC_ANONYMOUS_GROUPID	((gid_t)-2)
+#define RPC_MACHINE_CRED_USERID		((uid_t)0)
+#define RPC_MACHINE_CRED_GROUPID	((gid_t)0)
 
 struct generic_cred {
 	struct rpc_cred gc_base;
@@ -44,8 +44,8 @@ EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 struct rpc_cred *rpc_lookup_machine_cred(void)
 {
 	struct auth_cred acred = {
-		.uid = RPC_ANONYMOUS_USERID,
-		.gid = RPC_ANONYMOUS_GROUPID,
+		.uid = RPC_MACHINE_CRED_USERID,
+		.gid = RPC_MACHINE_CRED_GROUPID,
 		.machine_cred = 1,
 	};
 
-- 
cgit v1.2.3


From d71a4dd72e67210ae0767ccae69c79f1c933ff64 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 9 May 2008 12:01:19 -0700
Subject: svcrpc: fix proc/net/rpc/auth.unix.ip/content display

Commit f15364bd4cf8799a7677b6daeed7b67d9139d974 ("IPv6 support for NFS
server export caches") dropped a couple spaces, rendering the output
here difficult to read.

(However note that we expect the output to be parsed only by humans, not
machines, so this shouldn't have broken any userland software.)

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svcauth_unix.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 3f30ee6006ae..f24800f2c098 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -278,7 +278,7 @@ static int ip_map_show(struct seq_file *m,
 		dom = im->m_client->h.name;
 
 	if (ipv6_addr_v4mapped(&addr)) {
-		seq_printf(m, "%s" NIPQUAD_FMT "%s\n",
+		seq_printf(m, "%s " NIPQUAD_FMT " %s\n",
 			im->m_class,
 			ntohl(addr.s6_addr32[3]) >> 24 & 0xff,
 			ntohl(addr.s6_addr32[3]) >> 16 & 0xff,
@@ -286,7 +286,7 @@ static int ip_map_show(struct seq_file *m,
 			ntohl(addr.s6_addr32[3]) >>  0 & 0xff,
 			dom);
 	} else {
-		seq_printf(m, "%s" NIP6_FMT "%s\n",
+		seq_printf(m, "%s " NIP6_FMT " %s\n",
 			im->m_class, NIP6(addr), dom);
 	}
 	return 0;
-- 
cgit v1.2.3


From fc63a050861a53ba99a6222229cda555796d669e Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Fri, 25 Apr 2008 11:07:10 -0500
Subject: svc: Remove extra check for XPT_DEAD bit in svc_xprt_enqueue

Remove a redundant check for the XPT_DEAD bit in the svc_xprt_enqueue
function. This same bit is checked below while holding the pool lock
and prints a debug message if found to be dead.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/svc_xprt.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index d8e8d79a8451..cac3f82fca61 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -296,8 +296,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 	if (!(xprt->xpt_flags &
 	      ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
 		return;
-	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
-		return;
 
 	cpu = get_cpu();
 	pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
-- 
cgit v1.2.3


From aa3314c8d6da673b3454549eed45547a79f7cbe1 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Thu, 24 Apr 2008 21:30:47 -0500
Subject: svc: Remove unused header files from svc_xprt.c

This cosmetic patch removes unused header files that svc_xprt.c
inherited from svcsock.c

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/svc_xprt.c | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index cac3f82fca61..e46c825f4954 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -6,30 +6,9 @@
 
 #include <linux/sched.h>
 #include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/net.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/udp.h>
-#include <linux/tcp.h>
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/file.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <net/sock.h>
-#include <net/checksum.h>
-#include <net/ip.h>
-#include <net/ipv6.h>
-#include <net/tcp_states.h>
-#include <linux/uaccess.h>
-#include <asm/ioctls.h>
-
-#include <linux/sunrpc/types.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/svc_xprt.h>
 
-- 
cgit v1.2.3


From 0e7f011a19696cc25d68a8d6631fc6c5aa60a54c Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 23 Apr 2008 16:49:54 -0500
Subject: svcrdma: Simplify receive buffer posting

The svcrdma transport provider currently allocates receive buffers
to the RQ through the xpo_release_rqst method. This approach is overly
complicated since it means that the rqstp rq_xprt_ctxt has to be
selectively set based on whether the RPC is going to be processed
immediately or deferred. Instead, just post the receive buffer when
we are certain that we are replying in the send_reply function.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 17 +----------------
 net/sunrpc/xprtrdma/svc_rdma_sendto.c    | 10 ++++++++++
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 19 -------------------
 3 files changed, 11 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index c22d6b6f2db4..f3a108a864ad 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -457,8 +457,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
 		ret, rqstp->rq_arg.len,	rqstp->rq_arg.head[0].iov_base,
 		rqstp->rq_arg.head[0].iov_len);
 
-	/* Indicate that we've consumed an RQ credit */
-	rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
 	svc_xprt_received(rqstp->rq_xprt);
 	return ret;
 }
@@ -480,13 +478,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
 	dprintk("svcrdma: rqstp=%p\n", rqstp);
 
-	/*
-	 * The rq_xprt_ctxt indicates if we've consumed an RQ credit
-	 * or not. It is used in the rdma xpo_release_rqst function to
-	 * determine whether or not to return an RQ WQE to the RQ.
-	 */
-	rqstp->rq_xprt_ctxt = NULL;
-
 	spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
 	if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
 		ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
@@ -550,9 +541,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 		return 0;
 	}
 
-	/* Indicate we've consumed an RQ credit */
-	rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
-
 	ret = rqstp->rq_arg.head[0].iov_len
 		+ rqstp->rq_arg.page_len
 		+ rqstp->rq_arg.tail[0].iov_len;
@@ -569,11 +557,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	return ret;
 
  close_out:
-	if (ctxt) {
+	if (ctxt)
 		svc_rdma_put_context(ctxt, 1);
-		/* Indicate we've consumed an RQ credit */
-		rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
-	}
 	dprintk("svcrdma: transport %p is closing\n", xprt);
 	/*
 	 * Set the close bit and enqueue it. svc_recv will see the
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 981f190c1b39..f61d7bd105fb 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -389,6 +389,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
 	int page_no;
 	int ret;
 
+	/* Post a recv buffer to handle another request. */
+	ret = svc_rdma_post_recv(rdma);
+	if (ret) {
+		printk(KERN_INFO
+		       "svcrdma: could not post a receive buffer, err=%d."
+		       "Closing transport %p.\n", ret, rdma);
+		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+		return 0;
+	}
+
 	/* Prepare the context */
 	ctxt->pages[0] = page;
 	ctxt->count = 1;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index af408fc12634..1e0af2f205e9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -910,27 +910,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	return NULL;
 }
 
-/*
- * Post an RQ WQE to the RQ when the rqst is being released. This
- * effectively returns an RQ credit to the client. The rq_xprt_ctxt
- * will be null if the request is deferred due to an RDMA_READ or the
- * transport had no data ready (EAGAIN). Note that an RPC deferred in
- * svc_process will still return the credit, this is because the data
- * is copied and no longer consume a WQE/WC.
- */
 static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
 {
-	int err;
-	struct svcxprt_rdma *rdma =
-		container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
-	if (rqstp->rq_xprt_ctxt) {
-		BUG_ON(rqstp->rq_xprt_ctxt != rdma);
-		err = svc_rdma_post_recv(rdma);
-		if (err)
-			dprintk("svcrdma: failed to post an RQ WQE error=%d\n",
-				err);
-	}
-	rqstp->rq_xprt_ctxt = NULL;
 }
 
 /*
-- 
cgit v1.2.3


From dbcd00eba99945acfc433508a58eadc5dcd18cad Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Tue, 6 May 2008 11:33:11 -0500
Subject: svcrdma: Fix race with dto_tasklet in svc_rdma_send

The svc_rdma_send function will attempt to reap SQ WR to make room for
a new request if it finds the SQ full. This function races with the
dto_tasklet that also reaps SQ WR. To avoid polling and arming the CQ
unnecessarily move the test_and_clear_bit of the RDMAXPRT_SQ_PENDING
flag and arming of the CQ to the sq_cq_reap function.

Refactor the rq_cq_reap function to match sq_cq_reap so that the
code is easier to follow.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 40 ++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 1e0af2f205e9..73734173f994 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -228,23 +228,8 @@ static void dto_tasklet_func(unsigned long data)
 		list_del_init(&xprt->sc_dto_q);
 		spin_unlock_irqrestore(&dto_lock, flags);
 
-		if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) {
-			ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
-			rq_cq_reap(xprt);
-			set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
-			/*
-			 * If data arrived before established event,
-			 * don't enqueue. This defers RPC I/O until the
-			 * RDMA connection is complete.
-			 */
-			if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
-				svc_xprt_enqueue(&xprt->sc_xprt);
-		}
-
-		if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) {
-			ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
-			sq_cq_reap(xprt);
-		}
+		rq_cq_reap(xprt);
+		sq_cq_reap(xprt);
 
 		svc_xprt_put(&xprt->sc_xprt);
 		spin_lock_irqsave(&dto_lock, flags);
@@ -297,6 +282,10 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
 	struct ib_wc wc;
 	struct svc_rdma_op_ctxt *ctxt = NULL;
 
+	if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags))
+		return;
+
+	ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
 	atomic_inc(&rdma_stat_rq_poll);
 
 	spin_lock_bh(&xprt->sc_rq_dto_lock);
@@ -316,6 +305,15 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
 
 	if (ctxt)
 		atomic_inc(&rdma_stat_rq_prod);
+
+	set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+	/*
+	 * If data arrived before established event,
+	 * don't enqueue. This defers RPC I/O until the
+	 * RDMA connection is complete.
+	 */
+	if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
+		svc_xprt_enqueue(&xprt->sc_xprt);
 }
 
 /*
@@ -328,6 +326,11 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
 	struct ib_cq *cq = xprt->sc_sq_cq;
 	int ret;
 
+
+	if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
+		return;
+
+	ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
 	atomic_inc(&rdma_stat_sq_poll);
 	while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
 		ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
@@ -1010,7 +1013,8 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 		if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
 			spin_unlock_bh(&xprt->sc_lock);
 			atomic_inc(&rdma_stat_sq_starve);
-			/* See if we can reap some SQ WR */
+
+			/* See if we can opportunistically reap SQ WR to make room */
 			sq_cq_reap(xprt);
 
 			/* Wait until SQ WR available if SQ still full */
-- 
cgit v1.2.3


From 9d6347acd2134373c3a4c65a4d43e4f1d59aa012 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Fri, 25 Apr 2008 15:51:27 -0500
Subject: svcrdma: Fix return value in svc_rdma_send

Fix the return value on close to -ENOTCONN so caller knows to free context.
Also if a thread is waiting for free SQ space, check for close when waking
to avoid posting WR to a closing transport.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 73734173f994..17f036b23a96 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1002,7 +1002,7 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 	int ret;
 
 	if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
-		return 0;
+		return -ENOTCONN;
 
 	BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
 	BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
-- 
cgit v1.2.3


From 120693d12cde0cc735d784c951b53381efec918f Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Thu, 24 Apr 2008 14:17:21 -0500
Subject: svcrdma: Add put of connection ESTABLISHED reference in
 rdma_cma_handler

The svcrdma transport takes a reference when it gets the ESTABLISHED
event from the provider. This reference is supposed to be removed when
the DISCONNECT event is received, however, the call to svc_xprt_put
was missing in the switch statement. This results in the memory
associated with the transport never being freed.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 17f036b23a96..4bf8b5ad1675 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -630,6 +630,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id,
 		if (xprt) {
 			set_bit(XPT_CLOSE, &xprt->xpt_flags);
 			svc_xprt_enqueue(xprt);
+			svc_xprt_put(xprt);
 		}
 		break;
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
-- 
cgit v1.2.3


From 05a0826a6e6d95ab6e9c3e4a10b58e10f233cc2b Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Fri, 25 Apr 2008 14:11:31 -0500
Subject: svcrdma: Free context on ib_post_recv error

If there is an error posting the recv WR to the RQ, free the
context associated with the WR. This would leak a context when
asynchronous errors occurred on the transport while conccurent threads
were processing their RPC.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4bf8b5ad1675..e85ac77f4954 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -524,6 +524,8 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 	recv_wr.wr_id = (u64)(unsigned long)ctxt;
 
 	ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
+	if (ret)
+		svc_rdma_put_context(ctxt, 1);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 5ac461a6f05499fa233ea43b1de80b679d1eec21 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Fri, 25 Apr 2008 18:08:59 -0500
Subject: svcrdma: Free context on post_recv error in send_reply

If an error is encountered trying to post a recv buffer in send_reply,
free the passed in context. Return an error to the caller so it is
aware that the request was not posted.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_sendto.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index f61d7bd105fb..fb82b1b683f8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -396,7 +396,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		       "svcrdma: could not post a receive buffer, err=%d."
 		       "Closing transport %p.\n", ret, rdma);
 		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
-		return 0;
+		svc_rdma_put_context(ctxt, 0);
+		return -ENOTCONN;
 	}
 
 	/* Prepare the context */
-- 
cgit v1.2.3


From 58e8f62137f1c55fe3d31234167660f2ce509297 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Tue, 6 May 2008 09:45:54 -0500
Subject: svcrdma: Fix error handling during listening endpoint creation

A listening endpoint isn't known to the generic transport switch until
the svc_create_xprt function returns without error. Calling
svc_xprt_put within the xpo_create function causes the module reference
count to be erroneously decremented.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index e85ac77f4954..d9ed5f24c362 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -667,31 +667,27 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 
 	cma_xprt = rdma_create_xprt(serv, 1);
 	if (!cma_xprt)
-		return ERR_PTR(ENOMEM);
+		return ERR_PTR(-ENOMEM);
 	xprt = &cma_xprt->sc_xprt;
 
 	listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP);
 	if (IS_ERR(listen_id)) {
-		svc_xprt_put(&cma_xprt->sc_xprt);
-		dprintk("svcrdma: rdma_create_id failed = %ld\n",
-			PTR_ERR(listen_id));
-		return (void *)listen_id;
+		ret = PTR_ERR(listen_id);
+		dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
+		goto err0;
 	}
+
 	ret = rdma_bind_addr(listen_id, sa);
 	if (ret) {
-		rdma_destroy_id(listen_id);
-		svc_xprt_put(&cma_xprt->sc_xprt);
 		dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
-		return ERR_PTR(ret);
+		goto err1;
 	}
 	cma_xprt->sc_cm_id = listen_id;
 
 	ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
 	if (ret) {
-		rdma_destroy_id(listen_id);
-		svc_xprt_put(&cma_xprt->sc_xprt);
 		dprintk("svcrdma: rdma_listen failed = %d\n", ret);
-		return ERR_PTR(ret);
+		goto err1;
 	}
 
 	/*
@@ -702,6 +698,12 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 	svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
 
 	return &cma_xprt->sc_xprt;
+
+ err1:
+	rdma_destroy_id(listen_id);
+ err0:
+	kfree(cma_xprt);
+	return ERR_PTR(ret);
 }
 
 /*
-- 
cgit v1.2.3


From d16d40093a95f2b31007d7a7abefc50e6b27e236 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Tue, 6 May 2008 10:04:50 -0500
Subject: svcrdma: Return error from rdma_read_xdr so caller knows to free
 context

The rdma_read_xdr function did not discriminate between no read-list and
an error posting the read-list. This results in a leak of a page if there
is an error posting the read-list.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f3a108a864ad..5e03d95b25e2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -260,11 +260,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
  * On our side, we need to read into a pagelist. The first page immediately
  * follows the RPC header.
  *
- * This function returns 1 to indicate success. The data is not yet in
+ * This function returns:
+ * 0 - No error and no read-list found.
+ *
+ * 1 - Successful read-list processing. The data is not yet in
  * the pagelist and therefore the RPC request must be deferred. The
  * I/O completion will enqueue the transport again and
  * svc_rdma_recvfrom will complete the request.
  *
+ * <0 - Error processing/posting read-list.
+ *
  * NOTE: The ctxt must not be touched after the last WR has been posted
  * because the I/O completion processing may occur on another
  * processor and free / modify the context. Ne touche pas!
@@ -398,7 +403,7 @@ next_sge:
 			svc_rdma_put_context(head, 1);
 			head = ctxt;
 		}
-		return 0;
+		return err;
 	}
 
 	return 1;
@@ -532,14 +537,18 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 		goto close_out;
 	}
 
-	/* Read read-list data. If we would need to wait, defer
-	 * it. Not that in this case, we don't return the RQ credit
-	 * until after the read completes.
-	 */
-	if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) {
+	/* Read read-list data. */
+	ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt);
+	if (ret > 0) {
+		/* read-list posted, defer until data received from client. */
 		svc_xprt_received(xprt);
 		return 0;
 	}
+	if (ret < 0) {
+		/* Post of read-list failed, free context. */
+		svc_rdma_put_context(ctxt, 1);
+		return 0;
+	}
 
 	ret = rqstp->rq_arg.head[0].iov_len
 		+ rqstp->rq_arg.page_len
-- 
cgit v1.2.3


From 10a38c33f46d128d11e299acba744bc325cde420 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 30 Apr 2008 17:32:17 -0500
Subject: svcrdma: Remove unused READ_DONE context flags bit

The RDMACTXT_F_READ_DONE bit is not longer used. Remove it.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 1 -
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 -
 2 files changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 5e03d95b25e2..80c6ee82c34b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -325,7 +325,6 @@ next_sge:
 		}
 		ctxt->next = NULL;
 		ctxt->direction = DMA_FROM_DEVICE;
-		clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
 		clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
 
 		/* Prepare READ WR */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index d9ed5f24c362..4a79dfda1465 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -353,7 +353,6 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
 		case IB_WR_RDMA_READ:
 			if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
 				set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
-				set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
 				spin_lock_bh(&xprt->sc_read_complete_lock);
 				list_add_tail(&ctxt->dto_q,
 					      &xprt->sc_read_complete_q);
-- 
cgit v1.2.3


From 02e7452de74d308ca642f54f7e5ef801ced60a92 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 30 Apr 2008 19:50:56 -0500
Subject: svcrdma: Simplify RDMA_READ deferral buffer management

An NFS_WRITE requires a set of RDMA_READ requests to fetch the write
data from the client. There are two principal pieces of data that
need to be tracked: the list of pages that comprise the completed RPC
and the SGE of dma mapped pages to refer to this list of pages. Previously
this whole bit was managed as a linked list of contexts with the
context containing the page list buried in this list. This patch
simplifies this processing by not keeping a linked list, but rather only
a pionter from the last submitted RDMA_READ's context to the context
that maps the set of pages that describe the RPC.  This significantly
simplifies this code path. SGE contexts are cleaned up inline in the DTO
path instead of at read completion time.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 58 ++++++--------------------------
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  5 ++-
 2 files changed, 15 insertions(+), 48 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 80c6ee82c34b..21a1e625ef03 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -289,7 +289,6 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
 	u64 sgl_offset;
 	struct rpcrdma_read_chunk *ch;
 	struct svc_rdma_op_ctxt *ctxt = NULL;
-	struct svc_rdma_op_ctxt *head;
 	struct svc_rdma_op_ctxt *tmp_sge_ctxt;
 	struct svc_rdma_op_ctxt *tmp_ch_ctxt;
 	struct chunk_sge *ch_sge_ary;
@@ -310,20 +309,13 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
 	sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
 				    sge, ch_sge_ary,
 				    ch_count, byte_count);
-	head = svc_rdma_get_context(xprt);
 	sgl_offset = 0;
 	ch_no = 0;
 
 	for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
 	     ch->rc_discrim != 0; ch++, ch_no++) {
 next_sge:
-		if (!ctxt)
-			ctxt = head;
-		else {
-			ctxt->next = svc_rdma_get_context(xprt);
-			ctxt = ctxt->next;
-		}
-		ctxt->next = NULL;
+		ctxt = svc_rdma_get_context(xprt);
 		ctxt->direction = DMA_FROM_DEVICE;
 		clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
 
@@ -351,20 +343,15 @@ next_sge:
 			 * the client and the RPC needs to be enqueued.
 			 */
 			set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
-			ctxt->next = hdr_ctxt;
-			hdr_ctxt->next = head;
+			ctxt->read_hdr = hdr_ctxt;
 		}
 		/* Post the read */
 		err = svc_rdma_send(xprt, &read_wr);
 		if (err) {
-			printk(KERN_ERR "svcrdma: Error posting send = %d\n",
+			printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
 			       err);
-			/*
-			 * Break the circular list so free knows when
-			 * to stop if the error happened to occur on
-			 * the last read
-			 */
-			ctxt->next = NULL;
+			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+			svc_rdma_put_context(ctxt, 0);
 			goto out;
 		}
 		atomic_inc(&rdma_stat_read);
@@ -375,7 +362,7 @@ next_sge:
 			goto next_sge;
 		}
 		sgl_offset = 0;
-		err = 0;
+		err = 1;
 	}
 
  out:
@@ -393,25 +380,12 @@ next_sge:
 	while (rqstp->rq_resused)
 		rqstp->rq_respages[--rqstp->rq_resused] = NULL;
 
-	if (err) {
-		printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err);
-		set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
-		/* Free the linked list of read contexts */
-		while (head != NULL) {
-			ctxt = head->next;
-			svc_rdma_put_context(head, 1);
-			head = ctxt;
-		}
-		return err;
-	}
-
-	return 1;
+	return err;
 }
 
 static int rdma_read_complete(struct svc_rqst *rqstp,
-			      struct svc_rdma_op_ctxt *data)
+			      struct svc_rdma_op_ctxt *head)
 {
-	struct svc_rdma_op_ctxt *head = data->next;
 	int page_no;
 	int ret;
 
@@ -437,22 +411,12 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
 	rqstp->rq_arg.len = head->arg.len;
 	rqstp->rq_arg.buflen = head->arg.buflen;
 
+	/* Free the context */
+	svc_rdma_put_context(head, 0);
+
 	/* XXX: What should this be? */
 	rqstp->rq_prot = IPPROTO_MAX;
 
-	/*
-	 * Free the contexts we used to build the RDMA_READ. We have
-	 * to be careful here because the context list uses the same
-	 * next pointer used to chain the contexts associated with the
-	 * RDMA_READ
-	 */
-	data->next = NULL;	/* terminate circular list */
-	do {
-		data = head->next;
-		svc_rdma_put_context(head, 0);
-		head = data;
-	} while (head != NULL);
-
 	ret = rqstp->rq_arg.head[0].iov_len
 		+ rqstp->rq_arg.page_len
 		+ rqstp->rq_arg.tail[0].iov_len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4a79dfda1465..34141eaf25a0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -352,13 +352,16 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
 
 		case IB_WR_RDMA_READ:
 			if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+				struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
+				BUG_ON(!read_hdr);
 				set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
 				spin_lock_bh(&xprt->sc_read_complete_lock);
-				list_add_tail(&ctxt->dto_q,
+				list_add_tail(&read_hdr->dto_q,
 					      &xprt->sc_read_complete_q);
 				spin_unlock_bh(&xprt->sc_read_complete_lock);
 				svc_xprt_enqueue(&xprt->sc_xprt);
 			}
+			svc_rdma_put_context(ctxt, 0);
 			break;
 
 		default:
-- 
cgit v1.2.3


From 8740767376b32a7772607e1b2b07cde0c24120cc Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 30 Apr 2008 20:44:39 -0500
Subject: svcrdma: Use standard Linux lists for context cache

Replace the one-off linked list implementation used to implement the
context cache with the standard Linux list_head lists. Add a context
counter to catch resource leaks. A WARN_ON will be added later to
ensure that we've freed all contexts.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 47 ++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 34141eaf25a0..817cf4de746c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -103,8 +103,8 @@ static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
 		spin_lock_bh(&xprt->sc_ctxt_lock);
 		if (ctxt) {
 			at_least_one = 1;
-			ctxt->next = xprt->sc_ctxt_head;
-			xprt->sc_ctxt_head = ctxt;
+			INIT_LIST_HEAD(&ctxt->free_list);
+			list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
 		} else {
 			/* kmalloc failed...give up for now */
 			xprt->sc_ctxt_cnt--;
@@ -123,7 +123,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 
 	while (1) {
 		spin_lock_bh(&xprt->sc_ctxt_lock);
-		if (unlikely(xprt->sc_ctxt_head == NULL)) {
+		if (unlikely(list_empty(&xprt->sc_ctxt_free))) {
 			/* Try to bump my cache. */
 			spin_unlock_bh(&xprt->sc_ctxt_lock);
 
@@ -136,12 +136,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 			schedule_timeout_uninterruptible(msecs_to_jiffies(500));
 			continue;
 		}
-		ctxt = xprt->sc_ctxt_head;
-		xprt->sc_ctxt_head = ctxt->next;
+		ctxt = list_entry(xprt->sc_ctxt_free.next,
+				  struct svc_rdma_op_ctxt,
+				  free_list);
+		list_del_init(&ctxt->free_list);
 		spin_unlock_bh(&xprt->sc_ctxt_lock);
 		ctxt->xprt = xprt;
 		INIT_LIST_HEAD(&ctxt->dto_q);
 		ctxt->count = 0;
+		atomic_inc(&xprt->sc_ctxt_used);
 		break;
 	}
 	return ctxt;
@@ -163,10 +166,11 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
 				 ctxt->sge[i].addr,
 				 ctxt->sge[i].length,
 				 ctxt->direction);
+
 	spin_lock_bh(&xprt->sc_ctxt_lock);
-	ctxt->next = xprt->sc_ctxt_head;
-	xprt->sc_ctxt_head = ctxt;
+	list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
 	spin_unlock_bh(&xprt->sc_ctxt_lock);
+	atomic_dec(&xprt->sc_ctxt_used);
 }
 
 /* ib_cq event handler */
@@ -412,28 +416,29 @@ static void create_context_cache(struct svcxprt_rdma *xprt,
 	xprt->sc_ctxt_max = ctxt_max;
 	xprt->sc_ctxt_bump = ctxt_bump;
 	xprt->sc_ctxt_cnt = 0;
-	xprt->sc_ctxt_head = NULL;
+	atomic_set(&xprt->sc_ctxt_used, 0);
+
+	INIT_LIST_HEAD(&xprt->sc_ctxt_free);
 	for (i = 0; i < ctxt_count; i++) {
 		ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
 		if (ctxt) {
-			ctxt->next = xprt->sc_ctxt_head;
-			xprt->sc_ctxt_head = ctxt;
+			INIT_LIST_HEAD(&ctxt->free_list);
+			list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
 			xprt->sc_ctxt_cnt++;
 		}
 	}
 }
 
-static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt)
+static void destroy_context_cache(struct svcxprt_rdma *xprt)
 {
-	struct svc_rdma_op_ctxt *next;
-	if (!ctxt)
-		return;
-
-	do {
-		next = ctxt->next;
+	while (!list_empty(&xprt->sc_ctxt_free)) {
+		struct svc_rdma_op_ctxt *ctxt;
+		ctxt = list_entry(xprt->sc_ctxt_free.next,
+				  struct svc_rdma_op_ctxt,
+				  free_list);
+		list_del_init(&ctxt->free_list);
 		kfree(ctxt);
-		ctxt = next;
-	} while (next);
+	}
 }
 
 static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
@@ -470,7 +475,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 				     reqs +
 				     cma_xprt->sc_sq_depth +
 				     RPCRDMA_MAX_THREADS + 1); /* max */
-		if (!cma_xprt->sc_ctxt_head) {
+		if (list_empty(&cma_xprt->sc_ctxt_free)) {
 			kfree(cma_xprt);
 			return NULL;
 		}
@@ -976,7 +981,7 @@ static void svc_rdma_free(struct svc_xprt *xprt)
 	if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
 		ib_dealloc_pd(rdma->sc_pd);
 
-	destroy_context_cache(rdma->sc_ctxt_head);
+	destroy_context_cache(rdma);
 	kfree(rdma);
 }
 
-- 
cgit v1.2.3


From 47698e083e40bbd3ef87f5561390ae33abb13cd0 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Tue, 6 May 2008 11:49:05 -0500
Subject: svcrdma: Shrink scope of spinlock on RQ CQ

The rq_cq_reap function is only called from the dto_tasklet. The
only resource shared with other threads is the sc_rq_dto_q. Move the
spin lock to protect only this list.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 817cf4de746c..78303f0fad92 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -292,7 +292,6 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
 	ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
 	atomic_inc(&rdma_stat_rq_poll);
 
-	spin_lock_bh(&xprt->sc_rq_dto_lock);
 	while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
 		ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
 		ctxt->wc_status = wc.status;
@@ -303,9 +302,10 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
 			svc_rdma_put_context(ctxt, 1);
 			continue;
 		}
+		spin_lock_bh(&xprt->sc_rq_dto_lock);
 		list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+		spin_unlock_bh(&xprt->sc_rq_dto_lock);
 	}
-	spin_unlock_bh(&xprt->sc_rq_dto_lock);
 
 	if (ctxt)
 		atomic_inc(&rdma_stat_rq_prod);
-- 
cgit v1.2.3


From 8da91ea8de873ee8be82377ff18637d05e882058 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 30 Apr 2008 22:00:46 -0500
Subject: svcrdma: Move destroy to kernel thread

Some providers may wait while destroying adapter resources.
Since it is possible that the last reference is put on the
dto_tasklet, the actual destroy must be scheduled as a work item.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 78303f0fad92..028c6cf89364 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -963,12 +963,15 @@ static void svc_rdma_detach(struct svc_xprt *xprt)
 	rdma_destroy_id(rdma->sc_cm_id);
 }
 
-static void svc_rdma_free(struct svc_xprt *xprt)
+static void __svc_rdma_free(struct work_struct *work)
 {
-	struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt;
+	struct svcxprt_rdma *rdma =
+		container_of(work, struct svcxprt_rdma, sc_work);
 	dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
+
 	/* We should only be called from kref_put */
-	BUG_ON(atomic_read(&xprt->xpt_ref.refcount) != 0);
+	BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0);
+
 	if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
 		ib_destroy_cq(rdma->sc_sq_cq);
 
@@ -985,6 +988,14 @@ static void svc_rdma_free(struct svc_xprt *xprt)
 	kfree(rdma);
 }
 
+static void svc_rdma_free(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	INIT_WORK(&rdma->sc_work, __svc_rdma_free);
+	schedule_work(&rdma->sc_work);
+}
+
 static int svc_rdma_has_wspace(struct svc_xprt *xprt)
 {
 	struct svcxprt_rdma *rdma =
-- 
cgit v1.2.3


From 0905c0f0a2346516ecd12f0a4f33dca571b0dccd Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Thu, 1 May 2008 10:49:03 -0500
Subject: svcrdma: Add reference for each SQ/RQ WR

Add a reference on the transport for every outstanding WR.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 028c6cf89364..31b1927b5ee6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -279,6 +279,8 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
  *
  * Take all completing WC off the CQE and enqueue the associated DTO
  * context on the dto_q for the transport.
+ *
+ * Note that caller must hold a transport reference.
  */
 static void rq_cq_reap(struct svcxprt_rdma *xprt)
 {
@@ -298,13 +300,16 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
 		ctxt->byte_len = wc.byte_len;
 		if (wc.status != IB_WC_SUCCESS) {
 			/* Close the transport */
+			dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
 			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
 			svc_rdma_put_context(ctxt, 1);
+			svc_xprt_put(&xprt->sc_xprt);
 			continue;
 		}
 		spin_lock_bh(&xprt->sc_rq_dto_lock);
 		list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
 		spin_unlock_bh(&xprt->sc_rq_dto_lock);
+		svc_xprt_put(&xprt->sc_xprt);
 	}
 
 	if (ctxt)
@@ -322,6 +327,8 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
 
 /*
  * Send Queue Completion Handler - potentially called on interrupt context.
+ *
+ * Note that caller must hold a transport reference.
  */
 static void sq_cq_reap(struct svcxprt_rdma *xprt)
 {
@@ -374,6 +381,7 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
 			       wc.opcode, wc.status);
 			break;
 		}
+		svc_xprt_put(&xprt->sc_xprt);
 	}
 
 	if (ctxt)
@@ -530,9 +538,12 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 	recv_wr.num_sge = ctxt->count;
 	recv_wr.wr_id = (u64)(unsigned long)ctxt;
 
+	svc_xprt_get(&xprt->sc_xprt);
 	ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
-	if (ret)
+	if (ret) {
+		svc_xprt_put(&xprt->sc_xprt);
 		svc_rdma_put_context(ctxt, 1);
+	}
 	return ret;
 }
 
@@ -1049,14 +1060,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 			continue;
 		}
 		/* Bumped used SQ WR count and post */
+		svc_xprt_get(&xprt->sc_xprt);
 		ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
 		if (!ret)
 			atomic_inc(&xprt->sc_sq_count);
-		else
+		else {
+			svc_xprt_put(&xprt->sc_xprt);
 			dprintk("svcrdma: failed to post SQ WR rc=%d, "
 			       "sc_sq_count=%d, sc_sq_depth=%d\n",
 			       ret, atomic_read(&xprt->sc_sq_count),
 			       xprt->sc_sq_depth);
+		}
 		spin_unlock_bh(&xprt->sc_lock);
 		break;
 	}
-- 
cgit v1.2.3


From 1711386c62c97f7fb086a2247d44cdb1f8867640 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Thu, 1 May 2008 11:13:50 -0500
Subject: svcrdma: Move the QP and cm_id destruction to svc_rdma_free

Move the destruction of the QP and CM_ID to the free path so that the
QP cleanup code doesn't race with the dto_tasklet handling flushed WR.
The QP reference is not needed because we now have a reference for
every WR.

Also add a guard in the SQ and RQ completion handlers to ignore
calls generated by some providers when the QP is destroyed.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 40 ++++++++++++++++----------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 31b1927b5ee6..b412a49c46fc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -252,11 +252,15 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
 	struct svcxprt_rdma *xprt = cq_context;
 	unsigned long flags;
 
+	/* Guard against unconditional flush call for destroyed QP */
+	if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
+		return;
+
 	/*
 	 * Set the bit regardless of whether or not it's on the list
 	 * because it may be on the list already due to an SQ
 	 * completion.
-	*/
+	 */
 	set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
 
 	/*
@@ -393,11 +397,15 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
 	struct svcxprt_rdma *xprt = cq_context;
 	unsigned long flags;
 
+	/* Guard against unconditional flush call for destroyed QP */
+	if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
+		return;
+
 	/*
 	 * Set the bit regardless of whether or not it's on the list
 	 * because it may be on the list already due to an RQ
 	 * completion.
-	*/
+	 */
 	set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
 
 	/*
@@ -852,7 +860,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
 		newxprt->sc_max_requests = qp_attr.cap.max_recv_wr;
 	}
-	svc_xprt_get(&newxprt->sc_xprt);
 	newxprt->sc_qp = newxprt->sc_cm_id->qp;
 
 	/* Register all of physical memory */
@@ -926,10 +933,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
 	/* Take a reference in case the DTO handler runs */
 	svc_xprt_get(&newxprt->sc_xprt);
-	if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) {
+	if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
 		ib_destroy_qp(newxprt->sc_qp);
-		svc_xprt_put(&newxprt->sc_xprt);
-	}
 	rdma_destroy_id(newxprt->sc_cm_id);
 	/* This call to put will destroy the transport */
 	svc_xprt_put(&newxprt->sc_xprt);
@@ -941,10 +946,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
 }
 
 /*
- * When connected, an svc_xprt has at least three references:
- *
- * - A reference held by the QP. We still hold that here because this
- *   code deletes the QP and puts the reference.
+ * When connected, an svc_xprt has at least two references:
  *
  * - A reference held by the cm_id between the ESTABLISHED and
  *   DISCONNECTED events. If the remote peer disconnected first, this
@@ -953,7 +955,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
  * - A reference held by the svc_recv code that called this function
  *   as part of close processing.
  *
- * At a minimum two references should still be held.
+ * At a minimum one references should still be held.
  */
 static void svc_rdma_detach(struct svc_xprt *xprt)
 {
@@ -963,15 +965,6 @@ static void svc_rdma_detach(struct svc_xprt *xprt)
 
 	/* Disconnect and flush posted WQE */
 	rdma_disconnect(rdma->sc_cm_id);
-
-	/* Destroy the QP if present (not a listener) */
-	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) {
-		ib_destroy_qp(rdma->sc_qp);
-		svc_xprt_put(xprt);
-	}
-
-	/* Destroy the CM ID */
-	rdma_destroy_id(rdma->sc_cm_id);
 }
 
 static void __svc_rdma_free(struct work_struct *work)
@@ -983,6 +976,13 @@ static void __svc_rdma_free(struct work_struct *work)
 	/* We should only be called from kref_put */
 	BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0);
 
+	/* Destroy the QP if present (not a listener) */
+	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+		ib_destroy_qp(rdma->sc_qp);
+
+	/* Destroy the CM ID */
+	rdma_destroy_id(rdma->sc_cm_id);
+
 	if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
 		ib_destroy_cq(rdma->sc_sq_cq);
 
-- 
cgit v1.2.3


From 356d0a1519867422c3f17f79e2183f8c2d44f8ee Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Thu, 1 May 2008 11:25:02 -0500
Subject: svcrdma: Cleanup queued, but unprocessed I/O in svc_rdma_free

When the transport is closing, the DTO tasklet may queue data
that never gets processed. Clean up resources associated with
this I/O.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 38 +++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index b412a49c46fc..b1ff08d7da6c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -976,13 +976,42 @@ static void __svc_rdma_free(struct work_struct *work)
 	/* We should only be called from kref_put */
 	BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0);
 
+	/*
+	 * Destroy queued, but not processed read completions. Note
+	 * that this cleanup has to be done before destroying the
+	 * cm_id because the device ptr is needed to unmap the dma in
+	 * svc_rdma_put_context.
+	 */
+	spin_lock_bh(&rdma->sc_read_complete_lock);
+	while (!list_empty(&rdma->sc_read_complete_q)) {
+		struct svc_rdma_op_ctxt *ctxt;
+		ctxt = list_entry(rdma->sc_read_complete_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+		svc_rdma_put_context(ctxt, 1);
+	}
+	spin_unlock_bh(&rdma->sc_read_complete_lock);
+
+	/* Destroy queued, but not processed recv completions */
+	spin_lock_bh(&rdma->sc_rq_dto_lock);
+	while (!list_empty(&rdma->sc_rq_dto_q)) {
+		struct svc_rdma_op_ctxt *ctxt;
+		ctxt = list_entry(rdma->sc_rq_dto_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+		svc_rdma_put_context(ctxt, 1);
+	}
+	spin_unlock_bh(&rdma->sc_rq_dto_lock);
+
+	/* Warn if we leaked a resource or under-referenced */
+	WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
+
 	/* Destroy the QP if present (not a listener) */
 	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
 		ib_destroy_qp(rdma->sc_qp);
 
-	/* Destroy the CM ID */
-	rdma_destroy_id(rdma->sc_cm_id);
-
 	if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
 		ib_destroy_cq(rdma->sc_sq_cq);
 
@@ -995,6 +1024,9 @@ static void __svc_rdma_free(struct work_struct *work)
 	if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
 		ib_dealloc_pd(rdma->sc_pd);
 
+	/* Destroy the CM ID */
+	rdma_destroy_id(rdma->sc_cm_id);
+
 	destroy_context_cache(rdma);
 	kfree(rdma);
 }
-- 
cgit v1.2.3


From 97a3df382e01c49555ea844bd7c4e5a08f245b9d Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Thu, 1 May 2008 14:02:45 -0500
Subject: svcrdma: Use ib verbs version of dma_unmap

Use the ib_verbs version of the dma_unmap service in the
svc_rdma_put_context function. This should support providers
using software rdma.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index b1ff08d7da6c..0b72c4c7d7cb 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -162,10 +162,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
 			put_page(ctxt->pages[i]);
 
 	for (i = 0; i < ctxt->count; i++)
-		dma_unmap_single(xprt->sc_cm_id->device->dma_device,
-				 ctxt->sge[i].addr,
-				 ctxt->sge[i].length,
-				 ctxt->direction);
+		ib_dma_unmap_single(xprt->sc_cm_id->device,
+				    ctxt->sge[i].addr,
+				    ctxt->sge[i].length,
+				    ctxt->direction);
 
 	spin_lock_bh(&xprt->sc_ctxt_lock);
 	list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
-- 
cgit v1.2.3


From 69500c43b45f7155b72dcadad31cd55cda789c93 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 7 May 2008 13:49:58 -0500
Subject: svcrdma: Set rqstp transport address in rdma_read_complete function

The rdma_read_complete function needs to copy the rqstp transport address
from the transport. Failure to do so can result in using the wrong
authentication method for the RPC or bug checking if the rqstp address
is not valid.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 21a1e625ef03..c016f5ca0ce5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -416,6 +416,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
 
 	/* XXX: What should this be? */
 	rqstp->rq_prot = IPPROTO_MAX;
+	svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
 
 	ret = rqstp->rq_arg.head[0].iov_len
 		+ rqstp->rq_arg.page_len
-- 
cgit v1.2.3


From af261af4db14230fb35bcdc0ba9ef78ed6cf7bc1 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 7 May 2008 13:52:42 -0500
Subject: svcrdma: Copy transport address and arm CQ before calling rdma_accept

This race was found by inspection. Messages can be received from the peer
immediately following the rdma_accept call, however, the CQ have not yet
been armed and the transport address has not yet been set.

Set the transport address in the connect request handler and arm the CQ
prior to calling rdma_accept.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 0b72c4c7d7cb..c7545203f4b3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -570,6 +570,7 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id)
 {
 	struct svcxprt_rdma *listen_xprt = new_cma_id->context;
 	struct svcxprt_rdma *newxprt;
+	struct sockaddr *sa;
 
 	/* Create a new transport */
 	newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
@@ -582,6 +583,12 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id)
 	dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
 		newxprt, newxprt->sc_cm_id, listen_xprt);
 
+	/* Set the local and remote addresses in the transport */
+	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+	svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+	svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+
 	/*
 	 * Enqueue the new transport on the accept queue of the listening
 	 * transport
@@ -750,7 +757,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	struct rdma_conn_param conn_param;
 	struct ib_qp_init_attr qp_attr;
 	struct ib_device_attr devattr;
-	struct sockaddr *sa;
 	int ret;
 	int i;
 
@@ -883,6 +889,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	/* Swap out the handler */
 	newxprt->sc_cm_id->event_handler = rdma_cma_handler;
 
+	/*
+	 * Arm the CQs for the SQ and RQ before accepting so we can't
+	 * miss the first message
+	 */
+	ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
+
 	/* Accept Connection */
 	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
 	memset(&conn_param, 0, sizeof conn_param);
@@ -919,14 +932,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		newxprt->sc_max_requests,
 		newxprt->sc_ord);
 
-	/* Set the local and remote addresses in the transport */
-	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
-	svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
-	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
-	svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
-
-	ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
-	ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
 	return &newxprt->sc_xprt;
 
  errout:
-- 
cgit v1.2.3


From 008fdbc57164b0ac237ad6ee2766944f02ac9c28 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 7 May 2008 15:47:42 -0500
Subject: svcrdma: Change svc_rdma_send_error return type to void

The svc_rdma_send_error function is called when an RPCRDMA protocol
error is detected. This function attempts to post an error reply message.
Since an error posting to a transport in error is ignored, change
the return type to void.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 2 +-
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index c016f5ca0ce5..6b16d8cd5682 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -497,7 +497,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	/* If the request is invalid, reply with an error */
 	if (len < 0) {
 		if (len == -ENOSYS)
-			(void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
+			svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
 		goto close_out;
 	}
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index c7545203f4b3..e132509d1db0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1114,8 +1114,8 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 	return ret;
 }
 
-int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
-			enum rpcrdma_errcode err)
+void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
+			 enum rpcrdma_errcode err)
 {
 	struct ib_send_wr err_wr;
 	struct ib_sge sge;
@@ -1153,9 +1153,8 @@ int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	/* Post It */
 	ret = svc_rdma_send(xprt, &err_wr);
 	if (ret) {
-		dprintk("svcrdma: Error posting send = %d\n", ret);
+		dprintk("svcrdma: Error %d posting send for protocol error\n",
+			ret);
 		svc_rdma_put_context(ctxt, 1);
 	}
-
-	return ret;
 }
-- 
cgit v1.2.3


From a6f911c04e20b98feb4b33d3aba2976851977d6a Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Tue, 13 May 2008 09:16:05 -0500
Subject: svcrdma: Verify read-list fits within RPCSVC_MAXPAGES

A RDMA read-list cannot contain more elements than RPCSVC_MAXPAGES or
it will overflow the DTO context. Verify this when processing the
protocol header.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 6b16d8cd5682..06ab4841537b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -306,6 +306,8 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
 	ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
 
 	svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
+	if (ch_count > RPCSVC_MAXPAGES)
+		return -EINVAL;
 	sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
 				    sge, ch_sge_ary,
 				    ch_count, byte_count);
-- 
cgit v1.2.3


From 0686caa35ed17cf5b9043f453957e702a7eb588d Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 19 May 2008 16:25:42 -0700
Subject: ndisc: Add missing strategies for per-device retrans timer/reachable
 time settings.

Noticed from Al Viro <viro@ftp.linux.org.uk> via David Miller
<davem@davemloft.net>.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 2 +-
 net/ipv6/ndisc.c    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index e591e09e5e4e..8dd9155502b5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4242,7 +4242,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev)
 	neigh_sysctl_register(idev->dev, idev->nd_parms, NET_IPV6,
 			      NET_IPV6_NEIGH, "ipv6",
 			      &ndisc_ifinfo_sysctl_change,
-			      NULL);
+			      ndisc_ifinfo_sysctl_strategy);
 	__addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name,
 			idev->dev->ifindex, idev, &idev->cnf);
 }
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index a55fc05b8125..282fdb31f8ed 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1727,10 +1727,10 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f
 	return ret;
 }
 
-static int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name,
-					int nlen, void __user *oldval,
-					size_t __user *oldlenp,
-					void __user *newval, size_t newlen)
+int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name,
+				 int nlen, void __user *oldval,
+				 size_t __user *oldlenp,
+				 void __user *newval, size_t newlen)
 {
 	struct net_device *dev = ctl->extra1;
 	struct inet6_dev *idev;
-- 
cgit v1.2.3


From a3264435b4ca1ccee54cbef2970f2ba4bef39e2d Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 19 May 2008 16:54:29 -0700
Subject: ipv6 addrconf: Fix route lifetime setting in corner case.

Because of arithmetic overflow avoidance, the actual lifetime setting
(vs the value given by RA) did not increase monotonically around
0x7fffffff/HZ.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 12bba0880345..98aa50c11dd6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -475,7 +475,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 	lifetime = ntohl(rinfo->lifetime);
 	if (lifetime == 0xffffffff) {
 		/* infinity */
-	} else if (lifetime > 0x7fffffff/HZ) {
+	} else if (lifetime > 0x7fffffff/HZ - 1) {
 		/* Avoid arithmetic overflow */
 		lifetime = 0x7fffffff/HZ - 1;
 	}
-- 
cgit v1.2.3


From 69cdf8f92a8dd191eee0e834c631d84a140b1121 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 19 May 2008 16:55:13 -0700
Subject: ipv6 route: Fix lifetime in netlink.

We could not see appropriate lifetime if the route had been scheduled
to expired at 0 (in jiffies).  We should check rt6i_flags instead of
rt6i_expires to determine whether lifetime is valid or not.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 98aa50c11dd6..b45a7c0268c5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2200,7 +2200,9 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
 
 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
 
-	expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
+	expires = (rt->rt6i_flags & RTF_EXPIRES) ?
+			rt->rt6i_expires - jiffies : 0;
+
 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
 			       expires, rt->u.dst.error) < 0)
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 6f704992d3658aadff9e506c7fd80957fce33c5f Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 19 May 2008 16:56:11 -0700
Subject: ipv6 addrconf: Allow infinite prefix lifetime.

We need to handle infinite prefix lifetime specially.
With help from original reporter "Bonitch, Joseph"
<Joseph.Bonitch@xerox.com>.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 73 +++++++++++++++++++++++++++++++++++------------------
 net/ipv6/route.c    |  4 ++-
 2 files changed, 52 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8dd9155502b5..3a835578fd1c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1764,14 +1764,16 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 	 *	2) Configure prefixes with the auto flag set
 	 */
 
-	/* Avoid arithmetic overflow. Really, we could
-	   save rt_expires in seconds, likely valid_lft,
-	   but it would require division in fib gc, that it
-	   not good.
-	 */
-	if (valid_lft >= 0x7FFFFFFF/HZ)
+	if (valid_lft == INFINITY_LIFE_TIME)
+		rt_expires = ~0UL;
+	else if (valid_lft >= 0x7FFFFFFF/HZ) {
+		/* Avoid arithmetic overflow. Really, we could
+		 * save rt_expires in seconds, likely valid_lft,
+		 * but it would require division in fib gc, that it
+		 * not good.
+		 */
 		rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ);
-	else
+	} else
 		rt_expires = valid_lft * HZ;
 
 	/*
@@ -1779,7 +1781,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 	 * Avoid arithmetic overflow there as well.
 	 * Overflow can happen only if HZ < USER_HZ.
 	 */
-	if (HZ < USER_HZ && rt_expires > 0x7FFFFFFF / USER_HZ)
+	if (HZ < USER_HZ && ~rt_expires && rt_expires > 0x7FFFFFFF / USER_HZ)
 		rt_expires = 0x7FFFFFFF / USER_HZ;
 
 	if (pinfo->onlink) {
@@ -1788,17 +1790,28 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 				dev->ifindex, 1);
 
 		if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
-			if (rt->rt6i_flags&RTF_EXPIRES) {
-				if (valid_lft == 0) {
-					ip6_del_rt(rt);
-					rt = NULL;
-				} else {
-					rt->rt6i_expires = jiffies + rt_expires;
-				}
+			/* Autoconf prefix route */
+			if (valid_lft == 0) {
+				ip6_del_rt(rt);
+				rt = NULL;
+			} else if (~rt_expires) {
+				/* not infinity */
+				rt->rt6i_expires = jiffies + rt_expires;
+				rt->rt6i_flags |= RTF_EXPIRES;
+			} else {
+				rt->rt6i_flags &= ~RTF_EXPIRES;
+				rt->rt6i_expires = 0;
 			}
 		} else if (valid_lft) {
+			int flags = RTF_ADDRCONF | RTF_PREFIX_RT;
+			clock_t expires = 0;
+			if (~rt_expires) {
+				/* not infinity */
+				flags |= RTF_EXPIRES;
+				expires = jiffies_to_clock_t(rt_expires);
+			}
 			addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
-					      dev, jiffies_to_clock_t(rt_expires), RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT);
+					      dev, expires, flags);
 		}
 		if (rt)
 			dst_release(&rt->u.dst);
@@ -2021,7 +2034,8 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
 	struct inet6_dev *idev;
 	struct net_device *dev;
 	int scope;
-	u32 flags = RTF_EXPIRES;
+	u32 flags;
+	clock_t expires;
 
 	ASSERT_RTNL();
 
@@ -2041,8 +2055,13 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
 	if (valid_lft == INFINITY_LIFE_TIME) {
 		ifa_flags |= IFA_F_PERMANENT;
 		flags = 0;
-	} else if (valid_lft >= 0x7FFFFFFF/HZ)
-		valid_lft = 0x7FFFFFFF/HZ;
+		expires = 0;
+	} else {
+		if (valid_lft >= 0x7FFFFFFF/HZ)
+			valid_lft = 0x7FFFFFFF/HZ;
+		flags = RTF_EXPIRES;
+		expires = jiffies_to_clock_t(valid_lft * HZ);
+	}
 
 	if (prefered_lft == 0)
 		ifa_flags |= IFA_F_DEPRECATED;
@@ -2060,7 +2079,7 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
 		spin_unlock_bh(&ifp->lock);
 
 		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
-				      jiffies_to_clock_t(valid_lft * HZ), flags);
+				      expires, flags);
 		/*
 		 * Note that section 3.1 of RFC 4429 indicates
 		 * that the Optimistic flag should not be set for
@@ -3148,7 +3167,8 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
 			     u32 prefered_lft, u32 valid_lft)
 {
-	u32 flags = RTF_EXPIRES;
+	u32 flags;
+	clock_t expires;
 
 	if (!valid_lft || (prefered_lft > valid_lft))
 		return -EINVAL;
@@ -3156,8 +3176,13 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
 	if (valid_lft == INFINITY_LIFE_TIME) {
 		ifa_flags |= IFA_F_PERMANENT;
 		flags = 0;
-	} else if (valid_lft >= 0x7FFFFFFF/HZ)
-		valid_lft = 0x7FFFFFFF/HZ;
+		expires = 0;
+	} else {
+		if (valid_lft >= 0x7FFFFFFF/HZ)
+			valid_lft = 0x7FFFFFFF/HZ;
+		flags = RTF_EXPIRES;
+		expires = jiffies_to_clock_t(valid_lft * HZ);
+	}
 
 	if (prefered_lft == 0)
 		ifa_flags |= IFA_F_DEPRECATED;
@@ -3176,7 +3201,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
 		ipv6_ifa_notify(0, ifp);
 
 	addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev,
-			      jiffies_to_clock_t(valid_lft * HZ), flags);
+			      expires, flags);
 	addrconf_verify(0);
 
 	return 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b45a7c0268c5..b7a4a875a26a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1106,7 +1106,9 @@ int ip6_route_add(struct fib6_config *cfg)
 	}
 
 	rt->u.dst.obsolete = -1;
-	rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
+	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
+				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
+				0;
 
 	if (cfg->fc_protocol == RTPROT_UNSPEC)
 		cfg->fc_protocol = RTPROT_BOOT;
-- 
cgit v1.2.3


From 1ac06e0306d0192a7a4d9ea1c9e06d355ce7e7d3 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 20 May 2008 14:32:14 -0700
Subject: ipsec: Use the correct ip_local_out function

Because the IPsec output function xfrm_output_resume does its
own dst_output call it should always call __ip_local_output
instead of ip_local_output as the latter may invoke dst_output
directly.  Otherwise the return values from nf_hook and dst_output
may clash as they both use the value 1 but for different purposes.

When that clash occurs this can cause a packet to be used after
it has been freed which usually leads to a crash.  Because the
offending value is only returned from dst_output with qdiscs
such as HTB, this bug is normally not visible.

Thanks to Marco Berizzi for his perseverance in tracking this
down.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 net/ipv6/route.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 92f90ae46f4a..df41026b60db 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -160,7 +160,7 @@ static struct dst_ops ipv4_dst_ops = {
 	.negative_advice =	ipv4_negative_advice,
 	.link_failure =		ipv4_link_failure,
 	.update_pmtu =		ip_rt_update_pmtu,
-	.local_out =		ip_local_out,
+	.local_out =		__ip_local_out,
 	.entry_size =		sizeof(struct rtable),
 	.entries =		ATOMIC_INIT(0),
 };
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b7a4a875a26a..48534c6c0735 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -109,7 +109,7 @@ static struct dst_ops ip6_dst_ops_template = {
 	.negative_advice	=	ip6_negative_advice,
 	.link_failure		=	ip6_link_failure,
 	.update_pmtu		=	ip6_rt_update_pmtu,
-	.local_out		=	ip6_local_out,
+	.local_out		=	__ip6_local_out,
 	.entry_size		=	sizeof(struct rt6_info),
 	.entries		=	ATOMIC_INIT(0),
 };
-- 
cgit v1.2.3


From f2df824948d559ea818e03486a8583e42ea6ab37 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 20 May 2008 14:34:46 -0700
Subject: net_sched: cls_api: fix return value for non-existant classifiers

cls_api should return ENOENT when the requested classifier doesn't
exist.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1086df7478bc..9360fc81e8c7 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -220,7 +220,7 @@ replay:
 		tp = kzalloc(sizeof(*tp), GFP_KERNEL);
 		if (tp == NULL)
 			goto errout;
-		err = -EINVAL;
+		err = -ENOENT;
 		tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
 		if (tp_ops == NULL) {
 #ifdef CONFIG_KMOD
-- 
cgit v1.2.3


From 0e91796eb46e29edc791131c832a2232bcaed9dd Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 20 May 2008 14:36:14 -0700
Subject: net: Fix call to ->change_rx_flags(dev, IFF_MULTICAST) in
 dev_change_flags()

Am I just being particularly dim today, or can the call to
dev->change_rx_flags(dev, IFF_MULTICAST) in dev_change_flags() never
happen?

We've just set dev->flags = flags & IFF_MULTICAST, effectively. So the
condition '(dev->flags ^ flags) & IFF_MULTICAST' is _never_ going to be
true.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index ce88c0d3e354..582963077877 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3141,7 +3141,7 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
 	 *	Load in the correct multicast list now the flags have changed.
 	 */
 
-	if (dev->change_rx_flags && (dev->flags ^ flags) & IFF_MULTICAST)
+	if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST)
 		dev->change_rx_flags(dev, IFF_MULTICAST);
 
 	dev_set_rx_mode(dev);
-- 
cgit v1.2.3


From 81d85346b3fcd8b3167eac8b5fb415a210bd4345 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 20 May 2008 14:37:36 -0700
Subject: vlan: Correctly handle device notifications for layered VLAN devices

Commit 30688a9 ([VLAN]: Handle vlan devices net namespace changing)
changed the device notifier to special-case notifications for VLAN
devices, effectively disabling state propagation to underlying VLAN
devices. This is needed for layered VLANs though, so restore the
original behaviour.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 2a739adaa92b..b934159a6f07 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -410,10 +410,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 	int i, flgs;
 	struct net_device *vlandev;
 
-	if (is_vlan_dev(dev)) {
+	if (is_vlan_dev(dev))
 		__vlan_device_event(dev, event);
-		goto out;
-	}
 
 	grp = __vlan_find_group(dev);
 	if (!grp)
-- 
cgit v1.2.3


From 5fb13570543f4ae022996c9d7c0c099c8abf22dd Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 20 May 2008 14:54:50 -0700
Subject: [VLAN]: Propagate selected feature bits to VLAN devices

Propagate feature bits from the NETDEV_FEAT_CHANGE notifier. For now
only TSO is propagated for devices that announce their ability to
support TSO in combination with VLAN accel by setting the NETIF_F_VLAN_TSO
flag.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c     | 30 ++++++++++++++++++++++++++++++
 net/8021q/vlan.h     |  2 ++
 net/8021q/vlan_dev.c |  5 +++++
 3 files changed, 37 insertions(+)

(limited to 'net')

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index b934159a6f07..51961300b586 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -382,6 +382,24 @@ static void vlan_sync_address(struct net_device *dev,
 	memcpy(vlan->real_dev_addr, dev->dev_addr, ETH_ALEN);
 }
 
+static void vlan_transfer_features(struct net_device *dev,
+				   struct net_device *vlandev)
+{
+	unsigned long old_features = vlandev->features;
+
+	if (dev->features & NETIF_F_VLAN_TSO) {
+		vlandev->features &= ~VLAN_TSO_FEATURES;
+		vlandev->features |= dev->features & VLAN_TSO_FEATURES;
+	}
+	if (dev->features & NETIF_F_VLAN_CSUM) {
+		vlandev->features &= ~NETIF_F_ALL_CSUM;
+		vlandev->features |= dev->features & NETIF_F_ALL_CSUM;
+	}
+
+	if (old_features != vlandev->features)
+		netdev_features_change(vlandev);
+}
+
 static void __vlan_device_event(struct net_device *dev, unsigned long event)
 {
 	switch (event) {
@@ -448,6 +466,18 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		}
 		break;
 
+	case NETDEV_FEAT_CHANGE:
+		/* Propagate device features to underlying device */
+		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+			vlandev = vlan_group_get_device(grp, i);
+			if (!vlandev)
+				continue;
+
+			vlan_transfer_features(dev, vlandev);
+		}
+
+		break;
+
 	case NETDEV_DOWN:
 		/* Put all VLANs for this dev in the down state too.  */
 		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 5229a72c7ea1..79625696e86a 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -7,6 +7,8 @@
 #define VLAN_GRP_HASH_SIZE	(1 << VLAN_GRP_HASH_SHIFT)
 #define VLAN_GRP_HASH_MASK	(VLAN_GRP_HASH_SIZE - 1)
 
+#define VLAN_TSO_FEATURES	(NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_SG)
+
 /*  Find a VLAN device by the MAC address of its Ethernet device, and
  *  it's VLAN ID.  The default configuration is to have VLAN's scope
  *  to be box-wide, so the MAC will be ignored.  The mac will only be
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index c961f0826005..b1cfbaa88db2 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -663,6 +663,11 @@ static int vlan_dev_init(struct net_device *dev)
 					  (1<<__LINK_STATE_DORMANT))) |
 		      (1<<__LINK_STATE_PRESENT);
 
+	if (real_dev->features & NETIF_F_VLAN_TSO)
+		dev->features |= real_dev->features & VLAN_TSO_FEATURES;
+	if (real_dev->features & NETIF_F_VLAN_CSUM)
+		dev->features |= real_dev->features & NETIF_F_ALL_CSUM;
+
 	/* ipv6 shared card related stuff */
 	dev->dev_id = real_dev->dev_id;
 
-- 
cgit v1.2.3


From 51e779f0daa5c712439d37b907d58543e4fcf12a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 19 May 2008 07:18:10 +0200
Subject: mac80211: don't claim iwspy support

We removed iwspy support a very long time ago because it is useless, but
forgot to stop claiming to support it. Apparently, nobody cares, but
remove it nonetheless.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/wext.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 76e1de1dc735..457ebf9e85ae 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -209,7 +209,6 @@ static int ieee80211_ioctl_giwrange(struct net_device *dev,
 	range->num_frequency = c;
 
 	IW_EVENT_CAPA_SET_KERNEL(range->event_capa);
-	IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWTHRSPY);
 	IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP);
 	IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN);
 
-- 
cgit v1.2.3


From d3ede327e83f202c3a0962e207318f65717c5eb7 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 20 May 2008 15:12:44 -0700
Subject: pktgen: make sure that pktgen_thread_worker has been executed

The following courruption can happen during pktgen stop:
list_del corruption. prev->next should be ffff81007e8a5e70, but was 6b6b6b6b6b6b6b6b
kernel BUG at lib/list_debug.c:67!
      :pktgen:pktgen_thread_worker+0x374/0x10b0
      ? autoremove_wake_function+0x0/0x40
      ? _spin_unlock_irqrestore+0x42/0x80
      ? :pktgen:pktgen_thread_worker+0x0/0x10b0
      kthread+0x4d/0x80
      child_rip+0xa/0x12
      ? restore_args+0x0/0x30
      ? kthread+0x0/0x80
      ? child_rip+0x0/0x12
RIP  list_del+0x48/0x70

The problem is that pktgen_thread_worker can not be executed if kthread_stop
has been called too early. Insert a completion on the normal initialization
path to make sure that pktgen_thread_worker will gain the control for sure.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Acked-by: Alexey Dobriyan <adobriyan@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8dca21110493..fdf537707e51 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -390,6 +390,7 @@ struct pktgen_thread {
 	int cpu;
 
 	wait_queue_head_t queue;
+	struct completion start_done;
 };
 
 #define REMOVE 1
@@ -3414,6 +3415,7 @@ static int pktgen_thread_worker(void *arg)
 	BUG_ON(smp_processor_id() != cpu);
 
 	init_waitqueue_head(&t->queue);
+	complete(&t->start_done);
 
 	pr_debug("pktgen: starting pktgen/%d:  pid=%d\n", cpu, task_pid_nr(current));
 
@@ -3615,6 +3617,7 @@ static int __init pktgen_create_thread(int cpu)
 	INIT_LIST_HEAD(&t->if_list);
 
 	list_add_tail(&t->th_list, &pktgen_threads);
+	init_completion(&t->start_done);
 
 	p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu);
 	if (IS_ERR(p)) {
@@ -3639,6 +3642,7 @@ static int __init pktgen_create_thread(int cpu)
 	}
 
 	wake_up_process(p);
+	wait_for_completion(&t->start_done);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 0d580a774b3682b8b2b5c89ab9b813d149ef28e7 Mon Sep 17 00:00:00 2001
From: Helmut Schaa <hschaa@suse.de>
Date: Tue, 20 May 2008 09:56:37 +0200
Subject: mac80211: fix NULL pointer dereference in ieee80211_compatible_rates

Fix a possible NULL pointer dereference in ieee80211_compatible_rates
introduced in the patch "mac80211: fix association with some APs". If no bss
is available just use all supported rates in the association request.

Signed-off-by: Helmut Schaa <hschaa@suse.de>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e470bf12b765..7cfd12e0d1e2 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -730,7 +730,17 @@ static void ieee80211_send_assoc(struct net_device *dev,
 		if (bss->wmm_ie) {
 			wmm = 1;
 		}
+
+		/* get all rates supported by the device and the AP as
+		 * some APs don't like getting a superset of their rates
+		 * in the association request (e.g. D-Link DAP 1353 in
+		 * b-only mode) */
+		rates_len = ieee80211_compatible_rates(bss, sband, &rates);
+
 		ieee80211_rx_bss_put(dev, bss);
+	} else {
+		rates = ~0;
+		rates_len = sband->n_bitrates;
 	}
 
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
@@ -761,10 +771,7 @@ static void ieee80211_send_assoc(struct net_device *dev,
 	*pos++ = ifsta->ssid_len;
 	memcpy(pos, ifsta->ssid, ifsta->ssid_len);
 
-	/* all supported rates should be added here but some APs
-	 * (e.g. D-Link DAP 1353 in b-only mode) don't like that
-	 * Therefore only add rates the AP supports */
-	rates_len = ieee80211_compatible_rates(bss, sband, &rates);
+	/* add all rates which were marked to be used above */
 	supp_rates_len = rates_len;
 	if (supp_rates_len > 8)
 		supp_rates_len = 8;
-- 
cgit v1.2.3


From 4da5105687e0993a3bbdcffd89b2b94d9377faab Mon Sep 17 00:00:00 2001
From: Kazunori MIYAZAWA <kazunori@miyazawa.org>
Date: Wed, 21 May 2008 13:26:11 -0700
Subject: af_key: Fix selector family initialization.

This propagates the xfrm_user fix made in commit
bcf0dda8d2408fe1c1040cdec5a98e5fcad2ac72 ("[XFRM]: xfrm_user: fix
selector family initialization")

Based upon a bug report from, and tested by, Alan Swanson.

Signed-off-by: Kazunori MIYAZAWA <kazunori@miyazawa.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/key/af_key.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 9e7236ff6bcc..9bba7ac5fee0 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1251,7 +1251,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
 		x->sel.prefixlen_s = addr->sadb_address_prefixlen;
 	}
 
-	if (x->props.mode == XFRM_MODE_TRANSPORT)
+	if (!x->sel.family)
 		x->sel.family = x->props.family;
 
 	if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) {
-- 
cgit v1.2.3


From 7d227cd235c809c36c847d6a597956ad9e9d2bae Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sri@us.ibm.com>
Date: Wed, 21 May 2008 16:42:20 -0700
Subject: tcp: TCP connection times out if ICMP frag needed is delayed

We are seeing an issue with TCP in handling an ICMP frag needed
message that is received after net.ipv4.tcp_retries1 retransmits.
The default value of retries1 is 3. So if the path mtu changes
and ICMP frag needed is lost for the first 3 retransmits or if
it gets delayed until 3 retransmits are done, TCP doesn't update
MSS correctly and continues to retransmit the orginal message
until it timesout after tcp_retries2 retransmits.

I am seeing this issue even with the latest 2.6.25.4 kernel.

In tcp_retransmit_timer(), when retransmits counter exceeds
tcp_retries1 value, the dst cache entry of the socket is reset.
At this time, if we receive an ICMP frag needed message, the
dst entry gets updated with the new MTU, but the TCP sockets
dst_cache entry remains NULL.

So the next time when we try to retransmit after the ICMP frag
needed is received, tcp_retransmit_skb() gets called. Here the
cur_mss value is calculated at the start of the routine with
a NULL sk_dst_cache. Instead we should call tcp_current_mss after
the rebuild_header that caches the dst entry with the updated mtu.
Also the rebuild_header should be called before tcp_fragment
so that skb is fragmented if the mss goes down.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index debf23581606..e399bde7813a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1836,7 +1836,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	unsigned int cur_mss = tcp_current_mss(sk, 0);
+	unsigned int cur_mss;
 	int err;
 
 	/* Inconslusive MTU probe */
@@ -1858,6 +1858,11 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 			return -ENOMEM;
 	}
 
+	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
+		return -EHOSTUNREACH; /* Routing failure or similar. */
+
+	cur_mss = tcp_current_mss(sk, 0);
+
 	/* If receiver has shrunk his window, and skb is out of
 	 * new window, do not retransmit it. The exception is the
 	 * case, when window is shrunk to zero. In this case
@@ -1884,9 +1889,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	    (sysctl_tcp_retrans_collapse != 0))
 		tcp_retrans_try_collapse(sk, skb, cur_mss);
 
-	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
-		return -EHOSTUNREACH; /* Routing failure or similar. */
-
 	/* Some Solaris stacks overoptimize and ignore the FIN on a
 	 * retransmit when old data is attached.  So strip it off
 	 * since it is cheap to do so and saves bytes on the network.
-- 
cgit v1.2.3


From 51f82a2b128131c411880aed2cb802b166fe3445 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Wed, 21 May 2008 17:34:32 -0700
Subject: net/ipv4/arp.c: Use common hex_asc helpers

Here the local hexbuf is a duplicate of global const char hex_asc from
lib/hexdump.c, except the hex letters' cases:

	const char hexbuf[] = "0123456789ABCDEF";

	const char hex_asc[] = "0123456789abcdef";

and here to print HW addresses, the hex cases are not significant.

Thanks to Harvey Harrison to introduce the hex_asc_hi/hex_asc_lo helpers.

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 418862f1bf22..9b539fa9fe18 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1288,7 +1288,6 @@ static void arp_format_neigh_entry(struct seq_file *seq,
 				   struct neighbour *n)
 {
 	char hbuffer[HBUFFERLEN];
-	const char hexbuf[] = "0123456789ABCDEF";
 	int k, j;
 	char tbuf[16];
 	struct net_device *dev = n->dev;
@@ -1302,8 +1301,8 @@ static void arp_format_neigh_entry(struct seq_file *seq,
 	else {
 #endif
 	for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
-		hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
-		hbuffer[k++] = hexbuf[n->ha[j] & 15];
+		hbuffer[k++] = hex_asc_hi(n->ha[j]);
+		hbuffer[k++] = hex_asc_lo(n->ha[j]);
 		hbuffer[k++] = ':';
 	}
 	hbuffer[--k] = 0;
-- 
cgit v1.2.3


From 88860c9ef45963eb69411b0d2ace4e8ba0f7a32f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 21 May 2008 17:36:21 -0700
Subject: xfrm_user: Remove zero length key checks.

The crypto layer will determine whether that is valid
or not.

Suggested by Herbert Xu, based upon a report and patch
by Martin Willi.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/xfrm/xfrm_user.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index a1b0fbe3ea35..b976d9ed10e4 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -50,19 +50,8 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
 
 	switch (type) {
 	case XFRMA_ALG_AUTH:
-		if (!algp->alg_key_len &&
-		    strcmp(algp->alg_name, "digest_null") != 0)
-			return -EINVAL;
-		break;
-
 	case XFRMA_ALG_CRYPT:
-		if (!algp->alg_key_len &&
-		    strcmp(algp->alg_name, "cipher_null") != 0)
-			return -EINVAL;
-		break;
-
 	case XFRMA_ALG_COMP:
-		/* Zero length keys are legal.  */
 		break;
 
 	default:
-- 
cgit v1.2.3


From 071f92d05967a0c8422f1c8587ce0b4d90a8b447 Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Wed, 21 May 2008 17:47:54 -0700
Subject: net: The world is not perfect patch.

  Unless there will be any objection here, I suggest consider the
following patch which simply removes the code for the
-DI_WISH_WORLD_WERE_PERFECT in the three methods which use it.

The compilation errors we get when using -DI_WISH_WORLD_WERE_PERFECT
show that this code was not built and not used for really a long time.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 146 +-----------------------------------------------------
 net/ipv4/ipip.c   | 130 +-----------------------------------------------
 net/ipv6/sit.c    |  89 +--------------------------------
 3 files changed, 3 insertions(+), 362 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2ada033406de..4342cba4ff82 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -313,9 +313,8 @@ static void ipgre_tunnel_uninit(struct net_device *dev)
 
 static void ipgre_err(struct sk_buff *skb, u32 info)
 {
-#ifndef I_WISH_WORLD_WERE_PERFECT
 
-/* It is not :-( All the routers (except for Linux) return only
+/* All the routers (except for Linux) return only
    8 bytes of packet payload. It means, that precise relaying of
    ICMP in the real Internet is absolutely infeasible.
 
@@ -398,149 +397,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 out:
 	read_unlock(&ipgre_lock);
 	return;
-#else
-	struct iphdr *iph = (struct iphdr*)dp;
-	struct iphdr *eiph;
-	__be16	     *p = (__be16*)(dp+(iph->ihl<<2));
-	const int type = icmp_hdr(skb)->type;
-	const int code = icmp_hdr(skb)->code;
-	int rel_type = 0;
-	int rel_code = 0;
-	__be32 rel_info = 0;
-	__u32 n = 0;
-	__be16 flags;
-	int grehlen = (iph->ihl<<2) + 4;
-	struct sk_buff *skb2;
-	struct flowi fl;
-	struct rtable *rt;
-
-	if (p[1] != htons(ETH_P_IP))
-		return;
-
-	flags = p[0];
-	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
-		if (flags&(GRE_VERSION|GRE_ROUTING))
-			return;
-		if (flags&GRE_CSUM)
-			grehlen += 4;
-		if (flags&GRE_KEY)
-			grehlen += 4;
-		if (flags&GRE_SEQ)
-			grehlen += 4;
-	}
-	if (len < grehlen + sizeof(struct iphdr))
-		return;
-	eiph = (struct iphdr*)(dp + grehlen);
-
-	switch (type) {
-	default:
-		return;
-	case ICMP_PARAMETERPROB:
-		n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
-		if (n < (iph->ihl<<2))
-			return;
-
-		/* So... This guy found something strange INSIDE encapsulated
-		   packet. Well, he is fool, but what can we do ?
-		 */
-		rel_type = ICMP_PARAMETERPROB;
-		n -= grehlen;
-		rel_info = htonl(n << 24);
-		break;
-
-	case ICMP_DEST_UNREACH:
-		switch (code) {
-		case ICMP_SR_FAILED:
-		case ICMP_PORT_UNREACH:
-			/* Impossible event. */
-			return;
-		case ICMP_FRAG_NEEDED:
-			/* And it is the only really necessary thing :-) */
-			n = ntohs(icmp_hdr(skb)->un.frag.mtu);
-			if (n < grehlen+68)
-				return;
-			n -= grehlen;
-			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
-			if (n > ntohs(eiph->tot_len))
-				return;
-			rel_info = htonl(n);
-			break;
-		default:
-			/* All others are translated to HOST_UNREACH.
-			   rfc2003 contains "deep thoughts" about NET_UNREACH,
-			   I believe, it is just ether pollution. --ANK
-			 */
-			rel_type = ICMP_DEST_UNREACH;
-			rel_code = ICMP_HOST_UNREACH;
-			break;
-		}
-		break;
-	case ICMP_TIME_EXCEEDED:
-		if (code != ICMP_EXC_TTL)
-			return;
-		break;
-	}
-
-	/* Prepare fake skb to feed it to icmp_send */
-	skb2 = skb_clone(skb, GFP_ATOMIC);
-	if (skb2 == NULL)
-		return;
-	dst_release(skb2->dst);
-	skb2->dst = NULL;
-	skb_pull(skb2, skb->data - (u8*)eiph);
-	skb_reset_network_header(skb2);
-
-	/* Try to guess incoming interface */
-	memset(&fl, 0, sizeof(fl));
-	fl.fl4_dst = eiph->saddr;
-	fl.fl4_tos = RT_TOS(eiph->tos);
-	fl.proto = IPPROTO_GRE;
-	if (ip_route_output_key(dev_net(skb->dev), &rt, &fl)) {
-		kfree_skb(skb2);
-		return;
-	}
-	skb2->dev = rt->u.dst.dev;
-
-	/* route "incoming" packet */
-	if (rt->rt_flags&RTCF_LOCAL) {
-		ip_rt_put(rt);
-		rt = NULL;
-		fl.fl4_dst = eiph->daddr;
-		fl.fl4_src = eiph->saddr;
-		fl.fl4_tos = eiph->tos;
-		if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) ||
-		    rt->u.dst.dev->type != ARPHRD_IPGRE) {
-			ip_rt_put(rt);
-			kfree_skb(skb2);
-			return;
-		}
-	} else {
-		ip_rt_put(rt);
-		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
-		    skb2->dst->dev->type != ARPHRD_IPGRE) {
-			kfree_skb(skb2);
-			return;
-		}
-	}
-
-	/* change mtu on this route */
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
-		if (n > dst_mtu(skb2->dst)) {
-			kfree_skb(skb2);
-			return;
-		}
-		skb2->dst->ops->update_pmtu(skb2->dst, n);
-	} else if (type == ICMP_TIME_EXCEEDED) {
-		struct ip_tunnel *t = netdev_priv(skb2->dev);
-		if (t->parms.iph.ttl) {
-			rel_type = ICMP_DEST_UNREACH;
-			rel_code = ICMP_HOST_UNREACH;
-		}
-	}
-
-	icmp_send(skb2, rel_type, rel_code, rel_info);
-	kfree_skb(skb2);
-#endif
 }
 
 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 149111f08e8d..af5cb53da5cc 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -278,9 +278,8 @@ static void ipip_tunnel_uninit(struct net_device *dev)
 
 static int ipip_err(struct sk_buff *skb, u32 info)
 {
-#ifndef I_WISH_WORLD_WERE_PERFECT
 
-/* It is not :-( All the routers (except for Linux) return only
+/* All the routers (except for Linux) return only
    8 bytes of packet payload. It means, that precise relaying of
    ICMP in the real Internet is absolutely infeasible.
  */
@@ -337,133 +336,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 out:
 	read_unlock(&ipip_lock);
 	return err;
-#else
-	struct iphdr *iph = (struct iphdr*)dp;
-	int hlen = iph->ihl<<2;
-	struct iphdr *eiph;
-	const int type = icmp_hdr(skb)->type;
-	const int code = icmp_hdr(skb)->code;
-	int rel_type = 0;
-	int rel_code = 0;
-	__be32 rel_info = 0;
-	__u32 n = 0;
-	struct sk_buff *skb2;
-	struct flowi fl;
-	struct rtable *rt;
-
-	if (len < hlen + sizeof(struct iphdr))
-		return 0;
-	eiph = (struct iphdr*)(dp + hlen);
-
-	switch (type) {
-	default:
-		return 0;
-	case ICMP_PARAMETERPROB:
-		n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
-		if (n < hlen)
-			return 0;
-
-		/* So... This guy found something strange INSIDE encapsulated
-		   packet. Well, he is fool, but what can we do ?
-		 */
-		rel_type = ICMP_PARAMETERPROB;
-		rel_info = htonl((n - hlen) << 24);
-		break;
-
-	case ICMP_DEST_UNREACH:
-		switch (code) {
-		case ICMP_SR_FAILED:
-		case ICMP_PORT_UNREACH:
-			/* Impossible event. */
-			return 0;
-		case ICMP_FRAG_NEEDED:
-			/* And it is the only really necessary thing :-) */
-			n = ntohs(icmp_hdr(skb)->un.frag.mtu);
-			if (n < hlen+68)
-				return 0;
-			n -= hlen;
-			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
-			if (n > ntohs(eiph->tot_len))
-				return 0;
-			rel_info = htonl(n);
-			break;
-		default:
-			/* All others are translated to HOST_UNREACH.
-			   rfc2003 contains "deep thoughts" about NET_UNREACH,
-			   I believe, it is just ether pollution. --ANK
-			 */
-			rel_type = ICMP_DEST_UNREACH;
-			rel_code = ICMP_HOST_UNREACH;
-			break;
-		}
-		break;
-	case ICMP_TIME_EXCEEDED:
-		if (code != ICMP_EXC_TTL)
-			return 0;
-		break;
-	}
-
-	/* Prepare fake skb to feed it to icmp_send */
-	skb2 = skb_clone(skb, GFP_ATOMIC);
-	if (skb2 == NULL)
-		return 0;
-	dst_release(skb2->dst);
-	skb2->dst = NULL;
-	skb_pull(skb2, skb->data - (u8*)eiph);
-	skb_reset_network_header(skb2);
-
-	/* Try to guess incoming interface */
-	memset(&fl, 0, sizeof(fl));
-	fl.fl4_daddr = eiph->saddr;
-	fl.fl4_tos = RT_TOS(eiph->tos);
-	fl.proto = IPPROTO_IPIP;
-	if (ip_route_output_key(dev_net(skb->dev), &rt, &key)) {
-		kfree_skb(skb2);
-		return 0;
-	}
-	skb2->dev = rt->u.dst.dev;
-
-	/* route "incoming" packet */
-	if (rt->rt_flags&RTCF_LOCAL) {
-		ip_rt_put(rt);
-		rt = NULL;
-		fl.fl4_daddr = eiph->daddr;
-		fl.fl4_src = eiph->saddr;
-		fl.fl4_tos = eiph->tos;
-		if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) ||
-		    rt->u.dst.dev->type != ARPHRD_TUNNEL) {
-			ip_rt_put(rt);
-			kfree_skb(skb2);
-			return 0;
-		}
-	} else {
-		ip_rt_put(rt);
-		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
-		    skb2->dst->dev->type != ARPHRD_TUNNEL) {
-			kfree_skb(skb2);
-			return 0;
-		}
-	}
-
-	/* change mtu on this route */
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
-		if (n > dst_mtu(skb2->dst)) {
-			kfree_skb(skb2);
-			return 0;
-		}
-		skb2->dst->ops->update_pmtu(skb2->dst, n);
-	} else if (type == ICMP_TIME_EXCEEDED) {
-		struct ip_tunnel *t = netdev_priv(skb2->dev);
-		if (t->parms.iph.ttl) {
-			rel_type = ICMP_DEST_UNREACH;
-			rel_code = ICMP_HOST_UNREACH;
-		}
-	}
-
-	icmp_send(skb2, rel_type, rel_code, rel_info);
-	kfree_skb(skb2);
-	return 0;
-#endif
 }
 
 static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 5a6fab95569f..3de6ffdaedf2 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -403,9 +403,8 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
 
 static int ipip6_err(struct sk_buff *skb, u32 info)
 {
-#ifndef I_WISH_WORLD_WERE_PERFECT
 
-/* It is not :-( All the routers (except for Linux) return only
+/* All the routers (except for Linux) return only
    8 bytes of packet payload. It means, that precise relaying of
    ICMP in the real Internet is absolutely infeasible.
  */
@@ -462,92 +461,6 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
 out:
 	read_unlock(&ipip6_lock);
 	return err;
-#else
-	struct iphdr *iph = (struct iphdr*)dp;
-	int hlen = iph->ihl<<2;
-	struct ipv6hdr *iph6;
-	const int type = icmp_hdr(skb)->type;
-	const int code = icmp_hdr(skb)->code;
-	int rel_type = 0;
-	int rel_code = 0;
-	int rel_info = 0;
-	struct sk_buff *skb2;
-	struct rt6_info *rt6i;
-
-	if (len < hlen + sizeof(struct ipv6hdr))
-		return;
-	iph6 = (struct ipv6hdr*)(dp + hlen);
-
-	switch (type) {
-	default:
-		return;
-	case ICMP_PARAMETERPROB:
-		if (icmp_hdr(skb)->un.gateway < hlen)
-			return;
-
-		/* So... This guy found something strange INSIDE encapsulated
-		   packet. Well, he is fool, but what can we do ?
-		 */
-		rel_type = ICMPV6_PARAMPROB;
-		rel_info = icmp_hdr(skb)->un.gateway - hlen;
-		break;
-
-	case ICMP_DEST_UNREACH:
-		switch (code) {
-		case ICMP_SR_FAILED:
-		case ICMP_PORT_UNREACH:
-			/* Impossible event. */
-			return;
-		case ICMP_FRAG_NEEDED:
-			/* Too complicated case ... */
-			return;
-		default:
-			/* All others are translated to HOST_UNREACH.
-			   rfc2003 contains "deep thoughts" about NET_UNREACH,
-			   I believe, it is just ether pollution. --ANK
-			 */
-			rel_type = ICMPV6_DEST_UNREACH;
-			rel_code = ICMPV6_ADDR_UNREACH;
-			break;
-		}
-		break;
-	case ICMP_TIME_EXCEEDED:
-		if (code != ICMP_EXC_TTL)
-			return;
-		rel_type = ICMPV6_TIME_EXCEED;
-		rel_code = ICMPV6_EXC_HOPLIMIT;
-		break;
-	}
-
-	/* Prepare fake skb to feed it to icmpv6_send */
-	skb2 = skb_clone(skb, GFP_ATOMIC);
-	if (skb2 == NULL)
-		return 0;
-	dst_release(skb2->dst);
-	skb2->dst = NULL;
-	skb_pull(skb2, skb->data - (u8*)iph6);
-	skb_reset_network_header(skb2);
-
-	/* Try to guess incoming interface */
-	rt6i = rt6_lookup(dev_net(skb->dev), &iph6->saddr, NULL, NULL, 0);
-	if (rt6i && rt6i->rt6i_dev) {
-		skb2->dev = rt6i->rt6i_dev;
-
-		rt6i = rt6_lookup(dev_net(skb->dev),
-				&iph6->daddr, &iph6->saddr, NULL, 0);
-
-		if (rt6i && rt6i->rt6i_dev && rt6i->rt6i_dev->type == ARPHRD_SIT) {
-			struct ip_tunnel *t = netdev_priv(rt6i->rt6i_dev);
-			if (rel_type == ICMPV6_TIME_EXCEED && t->parms.iph.ttl) {
-				rel_type = ICMPV6_DEST_UNREACH;
-				rel_code = ICMPV6_ADDR_UNREACH;
-			}
-			icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev);
-		}
-	}
-	kfree_skb(skb2);
-	return 0;
-#endif
 }
 
 static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
-- 
cgit v1.2.3


From 289c79a4bd350e8a25065102563ad1a183d1b402 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 23 May 2008 00:22:04 -0700
Subject: vlan: Use bitmask of feature flags instead of seperate feature bits

Herbert Xu points out that the use of seperate feature bits for features
to be propagated to VLAN devices is going to get messy real soon.
Replace the VLAN feature bits by a bitmask of feature flags to be
propagated and restore the old GSO_SHIFT/MASK values.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c     | 10 ++--------
 net/8021q/vlan.h     |  2 --
 net/8021q/vlan_dev.c |  5 +----
 3 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 51961300b586..ab2225da0ee2 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -387,14 +387,8 @@ static void vlan_transfer_features(struct net_device *dev,
 {
 	unsigned long old_features = vlandev->features;
 
-	if (dev->features & NETIF_F_VLAN_TSO) {
-		vlandev->features &= ~VLAN_TSO_FEATURES;
-		vlandev->features |= dev->features & VLAN_TSO_FEATURES;
-	}
-	if (dev->features & NETIF_F_VLAN_CSUM) {
-		vlandev->features &= ~NETIF_F_ALL_CSUM;
-		vlandev->features |= dev->features & NETIF_F_ALL_CSUM;
-	}
+	vlandev->features &= ~dev->vlan_features;
+	vlandev->features |= dev->features & dev->vlan_features;
 
 	if (old_features != vlandev->features)
 		netdev_features_change(vlandev);
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 79625696e86a..5229a72c7ea1 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -7,8 +7,6 @@
 #define VLAN_GRP_HASH_SIZE	(1 << VLAN_GRP_HASH_SHIFT)
 #define VLAN_GRP_HASH_MASK	(VLAN_GRP_HASH_SIZE - 1)
 
-#define VLAN_TSO_FEATURES	(NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_SG)
-
 /*  Find a VLAN device by the MAC address of its Ethernet device, and
  *  it's VLAN ID.  The default configuration is to have VLAN's scope
  *  to be box-wide, so the MAC will be ignored.  The mac will only be
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index b1cfbaa88db2..5d055c242ed8 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -663,10 +663,7 @@ static int vlan_dev_init(struct net_device *dev)
 					  (1<<__LINK_STATE_DORMANT))) |
 		      (1<<__LINK_STATE_PRESENT);
 
-	if (real_dev->features & NETIF_F_VLAN_TSO)
-		dev->features |= real_dev->features & VLAN_TSO_FEATURES;
-	if (real_dev->features & NETIF_F_VLAN_CSUM)
-		dev->features |= real_dev->features & NETIF_F_ALL_CSUM;
+	dev->features |= real_dev->features & real_dev->vlan_features;
 
 	/* ipv6 shared card related stuff */
 	dev->dev_id = real_dev->dev_id;
-- 
cgit v1.2.3


From 6079a463cf95fafcc704a4e5e92a4da12444bd3c Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 27 May 2008 06:22:38 -0700
Subject: dccp: Fix to handle short sequence numbers packet correctly

RFC4340 said:
  8.5.  Pseudocode
       ...
       If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
             has short sequence numbers), drop packet and return

But DCCP has some mistake to handle short sequence numbers packet, now
it drop packet only if P.type is Data, Ack, or DataAck and P.X == 0.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Acked-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b348dd70c685..c22a3780c14e 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -739,8 +739,8 @@ int dccp_invalid_packet(struct sk_buff *skb)
 	 * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
 	 * has short sequence numbers), drop packet and return
 	 */
-	if (dh->dccph_type >= DCCP_PKT_DATA    &&
-	    dh->dccph_type <= DCCP_PKT_DATAACK && dh->dccph_x == 0)  {
+	if ((dh->dccph_type < DCCP_PKT_DATA    ||
+	    dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0)  {
 		DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n",
 			  dccp_packet_name(dh->dccph_type));
 		return 1;
-- 
cgit v1.2.3


From 825de27d9e40b3117b29a79d412b7a4b78c5d815 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Tue, 27 May 2008 06:33:54 -0700
Subject: dccp ccid-3: Fix "t_ipi explosion" bug

The identification of this bug is thanks to Cheng Wei and Tomasz
Grobelny.

To avoid divide-by-zero, the implementation previously ignored RTTs
smaller than 4 microseconds when performing integer division RTT/4.

When the RTT reached a value less than 4 microseconds (as observed on
loopback), this prevented the Window Counter CCVal value from
advancing. As a result, the receiver stopped sending feedback. This in
turn caused non-ending expiries of the nofeedback timer at the sender,
so that the sending rate was progressively reduced until reaching the
minimum of one packet per 64 seconds.

The patch fixes this bug by handling integer division more
intelligently. Due to consistent use of dccp_sample_rtt(),
divide-by-zero-RTT is avoided.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ccids/ccid3.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index cd61dea2eea1..f813077234b7 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -193,22 +193,17 @@ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
 
 /*
  *	Update Window Counter using the algorithm from [RFC 4342, 8.1].
- *	The algorithm is not applicable if RTT < 4 microseconds.
+ *	As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
  */
 static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
 						ktime_t now)
 {
-	u32 quarter_rtts;
-
-	if (unlikely(hctx->ccid3hctx_rtt < 4))	/* avoid divide-by-zero */
-		return;
-
-	quarter_rtts = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
-	quarter_rtts /= hctx->ccid3hctx_rtt / 4;
+	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count),
+	    quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt;
 
 	if (quarter_rtts > 0) {
 		hctx->ccid3hctx_t_last_win_count = now;
-		hctx->ccid3hctx_last_win_count	+= min_t(u32, quarter_rtts, 5);
+		hctx->ccid3hctx_last_win_count  += min(quarter_rtts, 5U);
 		hctx->ccid3hctx_last_win_count	&= 0xF;		/* mod 16 */
 	}
 }
-- 
cgit v1.2.3


From 679fda1aa49fddf938bb699df7867c01988371ab Mon Sep 17 00:00:00 2001
From: Nicolas Kaiser <nikai@nikai.net>
Date: Tue, 20 May 2008 18:42:54 +0200
Subject: net/mac80211: always true conditionals

Correct always true conditionals.

Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/cfg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 699d97b8de5e..a9fce4afdf21 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -672,7 +672,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
 	if (params->vlan) {
 		sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
 
-		if (sdata->vif.type != IEEE80211_IF_TYPE_VLAN ||
+		if (sdata->vif.type != IEEE80211_IF_TYPE_VLAN &&
 		    sdata->vif.type != IEEE80211_IF_TYPE_AP)
 			return -EINVAL;
 	} else
@@ -760,7 +760,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
 	if (params->vlan && params->vlan != sta->sdata->dev) {
 		vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
 
-		if (vlansdata->vif.type != IEEE80211_IF_TYPE_VLAN ||
+		if (vlansdata->vif.type != IEEE80211_IF_TYPE_VLAN &&
 		    vlansdata->vif.type != IEEE80211_IF_TYPE_AP) {
 			rcu_read_unlock();
 			return -EINVAL;
-- 
cgit v1.2.3


From 167ad6f7a2b2ae58dfaa46620b9b3212594f38e6 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Wed, 21 May 2008 18:17:05 +0300
Subject: mac80211: fix ieee80211_rx_bss_put/get imbalance

This patch fixes iee80211_rx_bss_put/get imbalance
introduced by 'mac80211: enable IBSS merging' patch.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 7cfd12e0d1e2..0ef5993e785b 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2479,8 +2479,6 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
 	ifsta->state = IEEE80211_IBSS_JOINED;
 	mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
 
-	ieee80211_rx_bss_put(dev, bss);
-
 	return res;
 }
 
@@ -3523,6 +3521,7 @@ static int ieee80211_sta_create_ibss(struct net_device *dev,
 	struct ieee80211_supported_band *sband;
 	u8 bssid[ETH_ALEN], *pos;
 	int i;
+	int ret;
 	DECLARE_MAC_BUF(mac);
 
 #if 0
@@ -3567,7 +3566,9 @@ static int ieee80211_sta_create_ibss(struct net_device *dev,
 		*pos++ = (u8) (rate / 5);
 	}
 
-	return ieee80211_sta_join_ibss(dev, ifsta, bss);
+	ret = ieee80211_sta_join_ibss(dev, ifsta, bss);
+	ieee80211_rx_bss_put(dev, bss);
+	return ret;
 }
 
 
@@ -3615,10 +3616,13 @@ static int ieee80211_sta_find_ibss(struct net_device *dev,
 	    (bss = ieee80211_rx_bss_get(dev, bssid,
 					local->hw.conf.channel->center_freq,
 					ifsta->ssid, ifsta->ssid_len))) {
+		int ret;
 		printk(KERN_DEBUG "%s: Selected IBSS BSSID %s"
 		       " based on configured SSID\n",
 		       dev->name, print_mac(mac, bssid));
-		return ieee80211_sta_join_ibss(dev, ifsta, bss);
+		ret = ieee80211_sta_join_ibss(dev, ifsta, bss);
+		ieee80211_rx_bss_put(dev, bss);
+		return ret;
 	}
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
 	printk(KERN_DEBUG "   did not try to join ibss\n");
-- 
cgit v1.2.3


From 9381be059bf5831d259e8735005cfa35b7488543 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Fri, 23 May 2008 01:36:36 +0300
Subject: mac80211: reorder channel and freq reporting in wext scan report

This patch switch order of channel and freq (SIOCGIWFREQ) reports
in scan results in order to overcome wpa_supplicant inability
to handle channel numbers in 5.2Ghz band.
Wext reporting channel number is ambiguous as channels 7-12 (802.11j)
exist on both bands.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Acked-by: Dan Williams <dcbw@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 0ef5993e785b..c29927c4977a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -4099,18 +4099,17 @@ ieee80211_sta_scan_result(struct net_device *dev,
 
 	memset(&iwe, 0, sizeof(iwe));
 	iwe.cmd = SIOCGIWFREQ;
-	iwe.u.freq.m = bss->freq;
-	iwe.u.freq.e = 6;
+	iwe.u.freq.m = ieee80211_frequency_to_channel(bss->freq);
+	iwe.u.freq.e = 0;
 	current_ev = iwe_stream_add_event(current_ev, end_buf, &iwe,
 					  IW_EV_FREQ_LEN);
 
 	memset(&iwe, 0, sizeof(iwe));
 	iwe.cmd = SIOCGIWFREQ;
-	iwe.u.freq.m = ieee80211_frequency_to_channel(bss->freq);
-	iwe.u.freq.e = 0;
+	iwe.u.freq.m = bss->freq;
+	iwe.u.freq.e = 6;
 	current_ev = iwe_stream_add_event(current_ev, end_buf, &iwe,
 					  IW_EV_FREQ_LEN);
-
 	memset(&iwe, 0, sizeof(iwe));
 	iwe.cmd = IWEVQUAL;
 	iwe.u.qual.qual = bss->signal;
-- 
cgit v1.2.3


From d4231ca3e162387a2b6964dacaa83604e065c4e9 Mon Sep 17 00:00:00 2001
From: Abhijeet Kolekar <abhijeet.kolekar@intel.com>
Date: Fri, 23 May 2008 10:15:26 -0700
Subject: mac80211 : Fixes the status message for iwconfig

iwconfig was showing incorrect status messages when disassociated.
Patch fixes this by always checking for association status in
ioctl calls for getting ap address.

Signed-off-by: Abhijeet Kolekar <abhijeet.kolekar@intel.com>
Acked-by: Dan Williams <dcbw@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/wext.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 457ebf9e85ae..8311bb24f9f3 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -489,9 +489,14 @@ static int ieee80211_ioctl_giwap(struct net_device *dev,
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
 	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
-		ap_addr->sa_family = ARPHRD_ETHER;
-		memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN);
-		return 0;
+		if (sdata->u.sta.state == IEEE80211_ASSOCIATED) {
+			ap_addr->sa_family = ARPHRD_ETHER;
+			memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN);
+			return 0;
+		} else {
+			memset(&ap_addr->sa_data, 0, ETH_ALEN);
+			return 0;
+		}
 	} else if (sdata->vif.type == IEEE80211_IF_TYPE_WDS) {
 		ap_addr->sa_family = ARPHRD_ETHER;
 		memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN);
-- 
cgit v1.2.3


From f6d97104890203ba9c2cf8e34894c4c8e64cb880 Mon Sep 17 00:00:00 2001
From: Yi Zhu <yi.zhu@intel.com>
Date: Tue, 27 May 2008 17:50:50 +0300
Subject: mac80211: fix a typo in ieee80211_handle_filtered_frame comment

fix a typo in ieee80211_handle_filtered_frame comment

Signed-off-by: Yi Zhu <yi.zhu@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 915afadb0602..5c876450b14c 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1313,7 +1313,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
 	/*
 	 * Clear the TX filter mask for this STA when sending the next
 	 * packet. If the STA went to power save mode, this will happen
-	 * happen when it wakes up for the next time.
+	 * when it wakes up for the next time.
 	 */
 	sta->flags |= WLAN_STA_CLEAR_PS_FILT;
 
-- 
cgit v1.2.3


From 70d251b24c44ab2fcba1807a5206e844cf10eb38 Mon Sep 17 00:00:00 2001
From: Senthil Balasubramanian <senthilkumar@atheros.com>
Date: Wed, 28 May 2008 20:08:12 +0530
Subject: mac80211: Fix for NULL pointer dereference in sta_info_get()

This addresses a NULL pointer dereference in sta_info_get().
TID and sta_info are extracted in ADDBA Timer expiry function
through the timer handler's argument.

The problem is extracging the TID (which was stored in
timer_to_tid[] array of type "u8") through "int *" typecast which
may also yield unwanted bytes for the MSB of TID that results
in incorrect sta_info and ieee80211_local pointers.

ieee80211_local pointer is NULL as illustrated below, it crashes in
sta_info_get(). The problem started when extracting ieee80211_local
pointer out of sta_info iteself and eventually crashed in
stat_info_get().

The proper way to fix is to change the data type of TID to u8
instead of u16. However changing all the occurences requires
some prototype changes as well. We should fix this in upcoming
patches.

Signed-off-by: Senthil Balasubramanian <senthilkumar@atheros.com>
Signed-off-by: Luis Rodriguez <lrodriguez@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index c29927c4977a..33a356e7b66f 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1614,7 +1614,7 @@ void sta_addba_resp_timer_expired(unsigned long data)
 	 * only one argument, and both sta_info and TID are needed, so init
 	 * flow in sta_info_create gives the TID as data, while the timer_to_id
 	 * array gives the sta through container_of */
-	u16 tid = *(int *)data;
+	u16 tid = *(u8 *)data;
 	struct sta_info *temp_sta = container_of((void *)data,
 		struct sta_info, timer_to_tid[tid]);
 
@@ -1662,7 +1662,7 @@ timer_expired_exit:
 void sta_rx_agg_session_timer_expired(unsigned long data)
 {
 	/* not an elegant detour, but there is no choice as the timer passes
-	 * only one argument, and verious sta_info are needed here, so init
+	 * only one argument, and various sta_info are needed here, so init
 	 * flow in sta_info_create gives the TID as data, while the timer_to_id
 	 * array gives the sta through container_of */
 	u8 *ptid = (u8 *)data;
-- 
cgit v1.2.3


From c97c23e38625f59e3e9869664eeeb0cab1822948 Mon Sep 17 00:00:00 2001
From: Senthil Balasubramanian <senthilkumar@atheros.com>
Date: Wed, 28 May 2008 23:15:32 +0530
Subject: mac80211: fix alignment issue with compare_ether_addr()

This addresses an alignment issue with compare_ether_addr().
The addresses passed to compare_ether_addr should be two bytes aligned.
It may function properly in x86 platform. However may not work properly
on IA-64 or ARM processor.

This also fixes a typo in mlme.c where the sk_buff struct name is incorect.
Though sizeof() works for any incorrect structure pointer name as its just
a pointer length that we want, lets just fix it.

Signed-off-by: Senthil Balasubramanian <senthilkumar@atheros.com>
Signed-off-by: Luis R. Rodriguez <lrodriguez@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 4 ++--
 net/mac80211/rx.c   | 4 ++--
 net/mac80211/util.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 33a356e7b66f..841278f1df8e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1325,7 +1325,7 @@ static void ieee80211_sta_process_addba_request(struct net_device *dev,
 
 	/* prepare reordering buffer */
 	tid_agg_rx->reorder_buf =
-		kmalloc(buf_size * sizeof(struct sk_buf *), GFP_ATOMIC);
+		kmalloc(buf_size * sizeof(struct sk_buff *), GFP_ATOMIC);
 	if (!tid_agg_rx->reorder_buf) {
 		if (net_ratelimit())
 			printk(KERN_ERR "can not allocate reordering buffer "
@@ -1334,7 +1334,7 @@ static void ieee80211_sta_process_addba_request(struct net_device *dev,
 		goto end;
 	}
 	memset(tid_agg_rx->reorder_buf, 0,
-		buf_size * sizeof(struct sk_buf *));
+		buf_size * sizeof(struct sk_buff *));
 
 	if (local->ops->ampdu_action)
 		ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_START,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1958bfb361c6..0941e5d6a522 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1091,7 +1091,7 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
 	u16 fc, hdrlen, ethertype;
 	u8 *payload;
 	u8 dst[ETH_ALEN];
-	u8 src[ETH_ALEN];
+	u8 src[ETH_ALEN] __aligned(2);
 	struct sk_buff *skb = rx->skb;
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	DECLARE_MAC_BUF(mac);
@@ -1234,7 +1234,7 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
  */
 static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx)
 {
-	static const u8 pae_group_addr[ETH_ALEN]
+	static const u8 pae_group_addr[ETH_ALEN] __aligned(2)
 		= { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x03 };
 	struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 131e9e6c8a32..4e97b266f907 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -34,11 +34,11 @@ void *mac80211_wiphy_privid = &mac80211_wiphy_privid;
 
 /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
 /* Ethernet-II snap header (RFC1042 for most EtherTypes) */
-const unsigned char rfc1042_header[] =
+const unsigned char rfc1042_header[] __aligned(2) =
 	{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 };
 
 /* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */
-const unsigned char bridge_tunnel_header[] =
+const unsigned char bridge_tunnel_header[] __aligned(2) =
 	{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
 
 
-- 
cgit v1.2.3


From 4c8411f8c115def968820a4df6658ccfd55d7f1a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 29 May 2008 01:32:47 -0700
Subject: bluetooth: fix locking bug in the rfcomm socket cleanup handling

in net/bluetooth/rfcomm/sock.c, rfcomm_sk_state_change() does the
following operation:

        if (parent && sock_flag(sk, SOCK_ZAPPED)) {
                /* We have to drop DLC lock here, otherwise
                 * rfcomm_sock_destruct() will dead lock. */
                rfcomm_dlc_unlock(d);
                rfcomm_sock_kill(sk);
                rfcomm_dlc_lock(d);
        }
}

which is fine, since rfcomm_sock_kill() will call sk_free() which will call
rfcomm_sock_destruct() which takes the rfcomm_dlc_lock()... so far so good.

HOWEVER, this assumes that the rfcomm_sk_state_change() function always gets
called with the rfcomm_dlc_lock() taken. This is the case for all but one
case, and in that case where we don't have the lock, we do a double unlock
followed by an attempt to take the lock, which due to underflow isn't
going anywhere fast.

This patch fixes this by moving the stragling case inside the lock, like
the other usages of the same call are doing in this code.

This was found with the help of the www.kerneloops.org project, where this
deadlock was observed 51 times at this point in time:
http://www.kerneloops.org/search.php?search=rfcomm_sock_destruct

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bluetooth/rfcomm/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index eb62558e9b09..0c2c93735e93 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -423,8 +423,8 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
 
 		rfcomm_dlc_lock(d);
 		d->state = BT_CLOSED;
-		rfcomm_dlc_unlock(d);
 		d->state_change(d, err);
+		rfcomm_dlc_unlock(d);
 
 		skb_queue_purge(&d->tx_queue);
 		rfcomm_dlc_unlink(d);
-- 
cgit v1.2.3


From 12293bf91126ad253a25e2840b307fdc7c2754c3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@parallels.com>
Date: Thu, 29 May 2008 03:19:37 -0700
Subject: netfilter: nf_conntrack_expect: fix error path unwind in
 nf_conntrack_expect_init()

Signed-off-by: Alexey Dobriyan <adobriyan@parallels.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_expect.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index e31beeb33b2b..e8f0dead267f 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -587,10 +587,10 @@ int __init nf_conntrack_expect_init(void)
 	return 0;
 
 err3:
+	kmem_cache_destroy(nf_ct_expect_cachep);
+err2:
 	nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc,
 			     nf_ct_expect_hsize);
-err2:
-	kmem_cache_destroy(nf_ct_expect_cachep);
 err1:
 	return err;
 }
-- 
cgit v1.2.3


From 3446b9d57edd0b96a89715fef222879e4919a115 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 30 May 2008 02:57:29 -0700
Subject: llc: Fix double accounting of received packets

llc_sap_rcv was being preceded by skb_set_owner_r, then calling
llc_state_process that calls sock_queue_rcv_skb, that in turn calls
skb_set_owner_r again making the space allowed to be used by the socket to be
leaked, making the socket to get stuck.

Fix it by setting skb->sk at llc_sap_rcv and leave the accounting to be done
only at sock_queue_rcv_skb.

Reported-by: Dmitry Petukhov <dmgenp@gmail.com>
Tested-by: Dmitry Petukhov <dmgenp@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/llc_sap.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index e2ddde755019..008de1fc42ca 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -286,12 +286,14 @@ void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb,
  *
  *	Sends received pdus to the sap state machine.
  */
-static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb)
+static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb,
+			struct sock *sk)
 {
 	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
 
 	ev->type   = LLC_SAP_EV_TYPE_PDU;
 	ev->reason = 0;
+	skb->sk = sk;
 	llc_sap_state_process(sap, skb);
 }
 
@@ -360,8 +362,7 @@ static void llc_sap_mcast(struct llc_sap *sap,
 			break;
 
 		sock_hold(sk);
-		skb_set_owner_r(skb1, sk);
-		llc_sap_rcv(sap, skb1);
+		llc_sap_rcv(sap, skb1, sk);
 		sock_put(sk);
 	}
 	read_unlock_bh(&sap->sk_list.lock);
@@ -381,8 +382,7 @@ void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
 	} else {
 		struct sock *sk = llc_lookup_dgram(sap, &laddr);
 		if (sk) {
-			skb_set_owner_r(skb, sk);
-			llc_sap_rcv(sap, skb);
+			llc_sap_rcv(sap, skb, sk);
 			sock_put(sk);
 		} else
 			kfree_skb(skb);
-- 
cgit v1.2.3


From 537d59af73d894750cff14f90fe2b6d77fbab15b Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Sun, 1 Jun 2008 23:50:52 -0700
Subject: bluetooth: rfcomm_dev_state_change deadlock fix

There's logic in __rfcomm_dlc_close:
	rfcomm_dlc_lock(d);
	d->state = BT_CLOSED;
	d->state_changed(d, err);
	rfcomm_dlc_unlock(d);

In rfcomm_dev_state_change, it's possible that rfcomm_dev_put try to
take the dlc lock, then we will deadlock.

Here fixed it by unlock dlc before rfcomm_dev_get in
rfcomm_dev_state_change.

why not unlock just before rfcomm_dev_put? it's because there's
another problem.  rfcomm_dev_get/rfcomm_dev_del will take
rfcomm_dev_lock, but in rfcomm_dev_add the lock order is :
rfcomm_dev_lock --> dlc lock

so I unlock dlc before the taken of rfcomm_dev_lock.

Actually it's a regression caused by commit
1905f6c736cb618e07eca0c96e60e3c024023428 ("bluetooth :
__rfcomm_dlc_close lock fix"), the dlc state_change could be two
callbacks : rfcomm_sk_state_change and rfcomm_dev_state_change. I
missed the rfcomm_sk_state_change that time.

Thanks Arjan van de Ven <arjan@linux.intel.com> for the effort in
commit 4c8411f8c115def968820a4df6658ccfd55d7f1a ("bluetooth: fix
locking bug in the rfcomm socket cleanup handling") but he missed the
rfcomm_dev_state_change lock issue.

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bluetooth/rfcomm/tty.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index c3f749abb2d0..c9191871c1e0 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -566,11 +566,22 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
 	if (dlc->state == BT_CLOSED) {
 		if (!dev->tty) {
 			if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
-				if (rfcomm_dev_get(dev->id) == NULL)
+				/* Drop DLC lock here to avoid deadlock
+				 * 1. rfcomm_dev_get will take rfcomm_dev_lock
+				 *    but in rfcomm_dev_add there's lock order:
+				 *    rfcomm_dev_lock -> dlc lock
+				 * 2. rfcomm_dev_put will deadlock if it's
+				 *    the last reference
+				 */
+				rfcomm_dlc_unlock(dlc);
+				if (rfcomm_dev_get(dev->id) == NULL) {
+					rfcomm_dlc_lock(dlc);
 					return;
+				}
 
 				rfcomm_dev_del(dev);
 				rfcomm_dev_put(dev);
+				rfcomm_dlc_lock(dlc);
 			}
 		} else
 			tty_hangup(dev->tty);
-- 
cgit v1.2.3


From 7dccf1f4e1696c79bff064c3770867cc53cbc71c Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Tue, 3 Jun 2008 14:53:46 -0700
Subject: ax25: Fix NULL pointer dereference and lockup.

From: Jarek Poplawski <jarkao2@gmail.com>

There is only one function in AX25 calling skb_append(), and it really
looks suspicious: appends skb after previously enqueued one, but in
the meantime this previous skb could be removed from the queue.

This patch Fixes it the simple way, so this is not fully compatible with
the current method, but testing hasn't shown any problems.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ax25/ax25_subr.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index d8f215733175..034aa10a5198 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -64,20 +64,15 @@ void ax25_frames_acked(ax25_cb *ax25, unsigned short nr)
 
 void ax25_requeue_frames(ax25_cb *ax25)
 {
-	struct sk_buff *skb, *skb_prev = NULL;
+	struct sk_buff *skb;
 
 	/*
 	 * Requeue all the un-ack-ed frames on the output queue to be picked
 	 * up by ax25_kick called from the timer. This arrangement handles the
 	 * possibility of an empty output queue.
 	 */
-	while ((skb = skb_dequeue(&ax25->ack_queue)) != NULL) {
-		if (skb_prev == NULL)
-			skb_queue_head(&ax25->write_queue, skb);
-		else
-			skb_append(skb_prev, skb, &ax25->write_queue);
-		skb_prev = skb;
-	}
+	while ((skb = skb_dequeue_tail(&ax25->ack_queue)) != NULL)
+		skb_queue_head(&ax25->write_queue, skb);
 }
 
 /*
-- 
cgit v1.2.3


From 9ecad877948deb2871d29e03786a7d7911687009 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 3 Jun 2008 15:18:36 -0700
Subject: irda: Sock leak on error path in irda_create.

Bad type/protocol specified result in sk leak.

Fix is simple - release the sk if bad values are given,
but to make it possible just to call sk_free(), I move
some sk initialization a bit lower.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/irda/af_irda.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index ae54b20d0470..3eb5bcc75f99 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1093,11 +1093,6 @@ static int irda_create(struct net *net, struct socket *sock, int protocol)
 
 	init_waitqueue_head(&self->query_wait);
 
-	/* Initialise networking socket struct */
-	sock_init_data(sock, sk);	/* Note : set sk->sk_refcnt to 1 */
-	sk->sk_family = PF_IRDA;
-	sk->sk_protocol = protocol;
-
 	switch (sock->type) {
 	case SOCK_STREAM:
 		sock->ops = &irda_stream_ops;
@@ -1124,13 +1119,20 @@ static int irda_create(struct net *net, struct socket *sock, int protocol)
 			self->max_sdu_size_rx = TTP_SAR_UNBOUND;
 			break;
 		default:
+			sk_free(sk);
 			return -ESOCKTNOSUPPORT;
 		}
 		break;
 	default:
+		sk_free(sk);
 		return -ESOCKTNOSUPPORT;
 	}
 
+	/* Initialise networking socket struct */
+	sock_init_data(sock, sk);	/* Note : set sk->sk_refcnt to 1 */
+	sk->sk_family = PF_IRDA;
+	sk->sk_protocol = protocol;
+
 	/* Register as a client with IrLMP */
 	self->ckey = irlmp_register_client(0, NULL, NULL, NULL);
 	self->mask.word = 0xffff;
-- 
cgit v1.2.3


From b9f5f52cca3e94f1e7509f366aa250ebbe1ed0b5 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 3 Jun 2008 16:03:15 -0700
Subject: net: neighbour table ABI problem

The neighbor table time of last use information is returned in the
incorrect unit. Kernel to user space ABI's need to use USER_HZ (or
milliseconds), otherwise the application has to try and discover the
real system HZ value which is problematic.  Linux has standardized on
keeping USER_HZ consistent (100hz) even when kernel is running
internally at some other value.

This change is small, but it breaks the ABI for older version of
iproute2 utilities.  But these utilities are already broken since they
are looking at the psched_hz values which are completely different. So
let's just go ahead and fix both kernel and user space. Older
utilities will just print wrong values.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 5d9d7130bd6e..3896de79dfbf 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2057,9 +2057,9 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
 		goto nla_put_failure;
 	}
 
-	ci.ndm_used	 = now - neigh->used;
-	ci.ndm_confirmed = now - neigh->confirmed;
-	ci.ndm_updated	 = now - neigh->updated;
+	ci.ndm_used	 = jiffies_to_clock_t(now - neigh->used);
+	ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
+	ci.ndm_updated	 = jiffies_to_clock_t(now - neigh->updated);
 	ci.ndm_refcnt	 = atomic_read(&neigh->refcnt) - 1;
 	read_unlock_bh(&neigh->lock);
 
-- 
cgit v1.2.3


From 7557af25155a82ac2dad73eec6b0166868bf8ea2 Mon Sep 17 00:00:00 2001
From: Brice Goglin <Brice.Goglin@inria.fr>
Date: Tue, 3 Jun 2008 16:07:45 -0700
Subject: net_dma: remove duplicate assignment in dma_skb_copy_datagram_iovec

No need to compute copy twice in the frags loop in
dma_skb_copy_datagram_iovec().

Signed-off-by: Brice Goglin <Brice.Goglin@inria.fr>
Acked-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/user_dma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 0ad1cd57bc39..c77aff9c6eb3 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -75,7 +75,7 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
 
 		end = start + skb_shinfo(skb)->frags[i].size;
 		copy = end - offset;
-		if ((copy = end - offset) > 0) {
+		if (copy > 0) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 			struct page *page = frag->page;
 
-- 
cgit v1.2.3


From 51b77cae0d5aa8e1546fca855dcfe48ddfadfa9c Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 3 Jun 2008 16:36:01 -0700
Subject: route: Mark unused route cache flags as such.

Also removes an obsolete check for the unused flag RTCF_MASQ.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index df41026b60db..96be336064fb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1792,7 +1792,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	if (err)
 		flags |= RTCF_DIRECTSRC;
 
-	if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
+	if (out_dev == in_dev && err &&
 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
 		flags |= RTCF_DOREDIRECT;
-- 
cgit v1.2.3


From 1f9d11c7c99da706e33646c3a9080dd5a8ef9a0b Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 3 Jun 2008 16:36:27 -0700
Subject: route: Mark unused routing attributes as such

Also removes an unused policy entry for an attribute which is
only used in kernel->user direction.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 0f1557a4ac7a..0b2ac6a3d903 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -506,7 +506,6 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
 	[RTA_PREFSRC]		= { .type = NLA_U32 },
 	[RTA_METRICS]		= { .type = NLA_NESTED },
 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
-	[RTA_PROTOINFO]		= { .type = NLA_U32 },
 	[RTA_FLOW]		= { .type = NLA_U32 },
 };
 
-- 
cgit v1.2.3


From bc3ed28caaef55e7e3a9316464256353c5f9b1df Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 3 Jun 2008 16:36:54 -0700
Subject: netlink: Improve returned error codes

Make nlmsg_trim(), nlmsg_cancel(), genlmsg_cancel(), and
nla_nest_cancel() void functions.

Return -EMSGSIZE instead of -1 if the provided message buffer is not
big enough.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c    |  3 ++-
 net/core/rtnetlink.c    |  3 ++-
 net/netlink/attr.c      | 12 ++++++------
 net/netlink/genetlink.c |  6 ++++--
 net/sched/sch_dsmark.c  |  6 ++++--
 net/sched/sch_gred.c    |  3 ++-
 net/sched/sch_hfsc.c    |  2 +-
 net/sched/sch_red.c     |  3 ++-
 net/wireless/nl80211.c  | 12 ++++++++----
 9 files changed, 31 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 3896de79dfbf..65f01f71b3f3 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1714,7 +1714,8 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
 	return nla_nest_end(skb, nest);
 
 nla_put_failure:
-	return nla_nest_cancel(skb, nest);
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
 }
 
 static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index cf857c4dc7b1..a9a77216310e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -498,7 +498,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
 	return nla_nest_end(skb, mx);
 
 nla_put_failure:
-	return nla_nest_cancel(skb, mx);
+	nla_nest_cancel(skb, mx);
+	return -EMSGSIZE;
 }
 
 int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
diff --git a/net/netlink/attr.c b/net/netlink/attr.c
index feb326f4a752..47bbf45ae5d7 100644
--- a/net/netlink/attr.c
+++ b/net/netlink/attr.c
@@ -400,13 +400,13 @@ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
  * @attrlen: length of attribute payload
  * @data: head of attribute payload
  *
- * Returns -1 if the tailroom of the skb is insufficient to store
+ * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
  * the attribute header and payload.
  */
 int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
 {
 	if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen)))
-		return -1;
+		return -EMSGSIZE;
 
 	__nla_put(skb, attrtype, attrlen, data);
 	return 0;
@@ -418,13 +418,13 @@ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
  * @attrlen: length of attribute payload
  * @data: head of attribute payload
  *
- * Returns -1 if the tailroom of the skb is insufficient to store
+ * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
  * the attribute payload.
  */
 int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
 {
 	if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
-		return -1;
+		return -EMSGSIZE;
 
 	__nla_put_nohdr(skb, attrlen, data);
 	return 0;
@@ -436,13 +436,13 @@ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
  * @attrlen: length of attribute payload
  * @data: head of attribute payload
  *
- * Returns -1 if the tailroom of the skb is insufficient to store
+ * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
  * the attribute payload.
  */
 int nla_append(struct sk_buff *skb, int attrlen, const void *data)
 {
 	if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
-		return -1;
+		return -EMSGSIZE;
 
 	memcpy(skb_put(skb, attrlen), data, attrlen);
 	return 0;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index d16929c9b4bc..f5aa23c3e886 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -554,7 +554,8 @@ static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq,
 	return genlmsg_end(skb, hdr);
 
 nla_put_failure:
-	return genlmsg_cancel(skb, hdr);
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
 }
 
 static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid,
@@ -590,7 +591,8 @@ static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid,
 	return genlmsg_end(skb, hdr);
 
 nla_put_failure:
-	return genlmsg_cancel(skb, hdr);
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
 }
 
 static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 0df911fd67b1..64465bacbe79 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -444,7 +444,8 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
-	return nla_nest_cancel(skb, opts);
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
 }
 
 static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -466,7 +467,8 @@ static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
-	return nla_nest_cancel(skb, opts);
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
 }
 
 static const struct Qdisc_class_ops dsmark_class_ops = {
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 3a9d226ff1e4..c89fba56db56 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -582,7 +582,8 @@ append_opt:
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
-	return nla_nest_cancel(skb, opts);
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
 }
 
 static void gred_destroy(struct Qdisc *sch)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 87293d0db1d7..fdfaa3fcc16d 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1360,7 +1360,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
 
  nla_put_failure:
 	nla_nest_cancel(skb, nest);
-	return -1;
+	return -EMSGSIZE;
 }
 
 static int
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 3dcd493f4f4a..5c569853b9c0 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -281,7 +281,8 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
-	return nla_nest_cancel(skb, opts);
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
 }
 
 static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 2bdd4dddc0e1..fb75f265b39c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -187,7 +187,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 	return genlmsg_end(msg, hdr);
 
  nla_put_failure:
-	return genlmsg_cancel(msg, hdr);
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
 }
 
 static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
@@ -273,7 +274,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 	return genlmsg_end(msg, hdr);
 
  nla_put_failure:
-	return genlmsg_cancel(msg, hdr);
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
 }
 
 static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *cb)
@@ -928,7 +930,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
 	return genlmsg_end(msg, hdr);
 
  nla_put_failure:
-	return genlmsg_cancel(msg, hdr);
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
 }
 
 static int nl80211_dump_station(struct sk_buff *skb,
@@ -1267,7 +1270,8 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq,
 	return genlmsg_end(msg, hdr);
 
  nla_put_failure:
-	return genlmsg_cancel(msg, hdr);
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
 }
 
 static int nl80211_dump_mpath(struct sk_buff *skb,
-- 
cgit v1.2.3


From ab32cd793dca21eec846a8204390d9594ed994d5 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 3 Jun 2008 16:37:33 -0700
Subject: route: Remove unused ifa_anycast field

The field was supposed to allow the creation of an anycast route by
assigning an anycast address to an address prefix. It was never
implemented so this field is unused and serves no purpose. Remove it.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 6848e4760f34..79a7ef6209ff 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -90,7 +90,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LOCAL]     	= { .type = NLA_U32 },
 	[IFA_ADDRESS]   	= { .type = NLA_U32 },
 	[IFA_BROADCAST] 	= { .type = NLA_U32 },
-	[IFA_ANYCAST]   	= { .type = NLA_U32 },
 	[IFA_LABEL]     	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 };
 
@@ -536,9 +535,6 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
 	if (tb[IFA_BROADCAST])
 		ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]);
 
-	if (tb[IFA_ANYCAST])
-		ifa->ifa_anycast = nla_get_be32(tb[IFA_ANYCAST]);
-
 	if (tb[IFA_LABEL])
 		nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
 	else
@@ -745,7 +741,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 				break;
 			inet_del_ifa(in_dev, ifap, 0);
 			ifa->ifa_broadcast = 0;
-			ifa->ifa_anycast = 0;
 			ifa->ifa_scope = 0;
 		}
 
@@ -1113,7 +1108,6 @@ static inline size_t inet_nlmsg_size(void)
 	       + nla_total_size(4) /* IFA_ADDRESS */
 	       + nla_total_size(4) /* IFA_LOCAL */
 	       + nla_total_size(4) /* IFA_BROADCAST */
-	       + nla_total_size(4) /* IFA_ANYCAST */
 	       + nla_total_size(IFNAMSIZ); /* IFA_LABEL */
 }
 
@@ -1143,9 +1137,6 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
 	if (ifa->ifa_broadcast)
 		NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
 
-	if (ifa->ifa_anycast)
-		NLA_PUT_BE32(skb, IFA_ANYCAST, ifa->ifa_anycast);
-
 	if (ifa->ifa_label[0])
 		NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
 
-- 
cgit v1.2.3


From d430a227d272fa514bade388bf511dba4ec2962a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Mon, 2 Jun 2008 10:59:02 +0100
Subject: bogus format in ip6mr

ptrdiff_t is %t..., not %Z...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/ipv6/ip6mr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 2de3c464fe75..14796181e8b5 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -197,7 +197,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
 		const char *name = vif->dev ? vif->dev->name : "none";
 
 		seq_printf(seq,
-			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X\n",
+			   "%2td %-10s %8ld %7ld  %8ld %7ld %05X\n",
 			   vif - vif6_table,
 			   name, vif->bytes_in, vif->pkt_in,
 			   vif->bytes_out, vif->pkt_out,
-- 
cgit v1.2.3


From d2ee3f2c4b1db1320c1efb4dcaceeaf6c7e6c2d3 Mon Sep 17 00:00:00 2001
From: Dong Wei <dwei.zh@gmail.com>
Date: Wed, 4 Jun 2008 09:57:51 -0700
Subject: netfilter: xt_connlimit: fix accouning when receive RST packet in
 ESTABLISHED state

In xt_connlimit match module, the counter of an IP is decreased when
the TCP packet is go through the chain with ip_conntrack state TW.
Well, it's very natural that the server and client close the socket
with FIN packet. But when the client/server close the socket with RST
packet(using so_linger), the counter for this connection still exsit.
The following patch can fix it which is based on linux-2.6.25.4

Signed-off-by: Dong Wei <dwei.zh@gmail.com>
Acked-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/xt_connlimit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 2e89a00df92c..70907f6baac3 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -73,7 +73,8 @@ connlimit_iphash6(const union nf_inet_addr *addr,
 static inline bool already_closed(const struct nf_conn *conn)
 {
 	if (nf_ct_protonum(conn) == IPPROTO_TCP)
-		return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT;
+		return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
+		       conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
 	else
 		return 0;
 }
-- 
cgit v1.2.3


From b9c698964614f71b9c8afeca163a945b4c2e2d20 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Wed, 4 Jun 2008 09:58:27 -0700
Subject: netfilter: nf_conntrack_ipv6: fix inconsistent lock state in
 nf_ct_frag6_gather()

[   63.531438] =================================
[   63.531520] [ INFO: inconsistent lock state ]
[   63.531520] 2.6.26-rc4 #7
[   63.531520] ---------------------------------
[   63.531520] inconsistent {softirq-on-W} -> {in-softirq-W} usage.
[   63.531520] tcpsic6/3864 [HC0[0]:SC1[1]:HE1:SE0] takes:
[   63.531520]  (&q->lock#2){-+..}, at: [<c07175b0>] ipv6_frag_rcv+0xd0/0xbd0
[   63.531520] {softirq-on-W} state was registered at:
[   63.531520]   [<c0143bba>] __lock_acquire+0x3aa/0x1080
[   63.531520]   [<c0144906>] lock_acquire+0x76/0xa0
[   63.531520]   [<c07a8f0b>] _spin_lock+0x2b/0x40
[   63.531520]   [<c0727636>] nf_ct_frag6_gather+0x3f6/0x910
 ...

According to this and another similar lockdep report inet_fragment
locks are taken from nf_ct_frag6_gather() with softirqs enabled, but
these locks are mainly used in softirq context, so disabling BHs is
necessary.

Reported-and-tested-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 2dccad48058c..e65e26e210ee 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -209,7 +209,9 @@ fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst)
 	arg.dst = dst;
 	hash = ip6qhashfn(id, src, dst);
 
+	local_bh_disable();
 	q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash);
+	local_bh_enable();
 	if (q == NULL)
 		goto oom;
 
@@ -638,10 +640,10 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
 		goto ret_orig;
 	}
 
-	spin_lock(&fq->q.lock);
+	spin_lock_bh(&fq->q.lock);
 
 	if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
-		spin_unlock(&fq->q.lock);
+		spin_unlock_bh(&fq->q.lock);
 		pr_debug("Can't insert skb to queue\n");
 		fq_put(fq);
 		goto ret_orig;
@@ -653,7 +655,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
 		if (ret_skb == NULL)
 			pr_debug("Can't reassemble fragmented packets\n");
 	}
-	spin_unlock(&fq->q.lock);
+	spin_unlock_bh(&fq->q.lock);
 
 	fq_put(fq);
 	return ret_skb;
-- 
cgit v1.2.3


From 8aca6cb1179ed9bef9351028c8d8af852903eae2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Wed, 4 Jun 2008 11:34:22 -0700
Subject: tcp: Fix inconsistency source (CA_Open only when !tcp_left_out(tp))
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is possible that this skip path causes TCP to end up into an
invalid state where ca_state was left to CA_Open while some
segments already came into sacked_out. If next valid ACK doesn't
contain new SACK information TCP fails to enter into
tcp_fastretrans_alert(). Thus at least high_seq is set
incorrectly to a too high seqno because some new data segments
could be sent in between (and also, limited transmit is not
being correctly invoked there). Reordering in both directions
can easily cause this situation to occur.

I guess we would want to use tcp_moderate_cwnd(tp) there as well
as it may be possible to use this to trigger oversized burst to
network by sending an old ACK with huge amount of SACK info, but
I'm a bit unsure about its effects (mainly to FlightSize), so to
be on the safe side I just currently fixed it minimally to keep
TCP's state consistent (obviously, such nasty ACKs have been
possible this far). Though it seems that FlightSize is already
underestimated by some amount, so probably on the long term we
might want to trigger recovery there too, if appropriate, to make
FlightSize calculation to resemble reality at the time when the
losses where discovered (but such change scares me too much now
and requires some more thinking anyway how to do that as it
likely involves some code shuffling).

This bug was found by Brian Vowell while running my TCP debug
patch to find cause of another TCP issue (fackets_out
miscount).

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b54d9d37b636..54a0b7412782 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2483,6 +2483,20 @@ static inline void tcp_complete_cwr(struct sock *sk)
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
+static void tcp_try_keep_open(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int state = TCP_CA_Open;
+
+	if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
+		state = TCP_CA_Disorder;
+
+	if (inet_csk(sk)->icsk_ca_state != state) {
+		tcp_set_ca_state(sk, state);
+		tp->high_seq = tp->snd_nxt;
+	}
+}
+
 static void tcp_try_to_open(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2496,15 +2510,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
 		tcp_enter_cwr(sk, 1);
 
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
-		int state = TCP_CA_Open;
-
-		if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
-			state = TCP_CA_Disorder;
-
-		if (inet_csk(sk)->icsk_ca_state != state) {
-			tcp_set_ca_state(sk, state);
-			tp->high_seq = tp->snd_nxt;
-		}
+		tcp_try_keep_open(sk);
 		tcp_moderate_cwnd(tp);
 	} else {
 		tcp_cwnd_down(sk, flag);
@@ -3310,8 +3316,11 @@ no_queue:
 	return 1;
 
 old_ack:
-	if (TCP_SKB_CB(skb)->sacked)
+	if (TCP_SKB_CB(skb)->sacked) {
 		tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+		if (icsk->icsk_ca_state == TCP_CA_Open)
+			tcp_try_keep_open(sk);
+	}
 
 uninteresting_ack:
 	SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
-- 
cgit v1.2.3


From e51171019bb0e1f9fb57c25bd2e38ce652eaea27 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Thu, 29 May 2008 19:55:05 +0900
Subject: [SCTP]: Fix NULL dereference of asoc.

Commit 7cbca67c073263c179f605bdbbdc565ab29d801d ("[IPV6]: Support
Source Address Selection API (RFC5014)") introduced NULL dereference
of asoc to sctp_v6_get_saddr in net/sctp/ipv6.c.
Pointed out by Johann Felix Soden <johfel@users.sourceforge.net>.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/sctp/ipv6.c      | 5 +++--
 net/sctp/protocol.c  | 3 ++-
 net/sctp/transport.c | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e45e44c60635..e4aac3266fcd 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -299,7 +299,8 @@ static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
 /* Fills in the source address(saddr) based on the destination address(daddr)
  * and asoc's bind address list.
  */
-static void sctp_v6_get_saddr(struct sctp_association *asoc,
+static void sctp_v6_get_saddr(struct sctp_sock *sk,
+			      struct sctp_association *asoc,
 			      struct dst_entry *dst,
 			      union sctp_addr *daddr,
 			      union sctp_addr *saddr)
@@ -318,7 +319,7 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc,
 	if (!asoc) {
 		ipv6_dev_get_saddr(dst ? ip6_dst_idev(dst)->dev : NULL,
 				   &daddr->v6.sin6_addr,
-				   inet6_sk(asoc->base.sk)->srcprefs,
+				   inet6_sk(&sk->inet.sk)->srcprefs,
 				   &saddr->v6.sin6_addr);
 		SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: " NIP6_FMT "\n",
 				  NIP6(saddr->v6.sin6_addr));
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 0ec234b762c2..13ee7fa92e07 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -519,7 +519,8 @@ out:
 /* For v4, the source address is cached in the route entry(dst). So no need
  * to cache it separately and hence this is an empty routine.
  */
-static void sctp_v4_get_saddr(struct sctp_association *asoc,
+static void sctp_v4_get_saddr(struct sctp_sock *sk,
+			      struct sctp_association *asoc,
 			      struct dst_entry *dst,
 			      union sctp_addr *daddr,
 			      union sctp_addr *saddr)
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index f4938f6c5abe..62082e7b7972 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -291,7 +291,7 @@ void sctp_transport_route(struct sctp_transport *transport,
 	if (saddr)
 		memcpy(&transport->saddr, saddr, sizeof(union sctp_addr));
 	else
-		af->get_saddr(asoc, dst, daddr, &transport->saddr);
+		af->get_saddr(opt, asoc, dst, daddr, &transport->saddr);
 
 	transport->dst = dst;
 	if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
-- 
cgit v1.2.3


From a3c960899e042bc1c2b730a2115fa32da7802039 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 4 Jun 2008 01:30:25 +0900
Subject: [IPV6] UDP: Possible dst leak in udpv6_sendmsg.

ip6_sk_dst_lookup returns held dst entry. It should be released
on all paths beyond this point. Add missed release when up->pending
is set.

Bug report and initial patch by Denis V. Lunev <den@openvz.org>.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Acked-by: Denis V. Lunev <den@openvz.org>
---
 net/ipv6/udp.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1fd784f3e2ec..47123bf5eb0f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -848,12 +848,14 @@ do_append_data:
 		} else {
 			dst_release(dst);
 		}
+		dst = NULL;
 	}
 
 	if (err > 0)
 		err = np->recverr ? net_xmit_errno(err) : 0;
 	release_sock(sk);
 out:
+	dst_release(dst);
 	fl6_sock_release(flowlabel);
 	if (!err)
 		return len;
-- 
cgit v1.2.3


From 24ef0da7b864435f221f668bc8a324160d063e78 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 28 May 2008 16:54:22 +0200
Subject: [IPV6] ADDRCONF: Check range of prefix length

As of now, the prefix length is not vaildated when adding or deleting
addresses. The value is passed directly into the inet6_ifaddr structure
and later passed on to memcmp() as length indicator which relies on
the value never to exceed 128 (bits).

Due to the missing check, the currently code allows for any 8 bit
value to be passed on as prefix length while using the netlink
interface, and any 32 bit value while using the ioctl interface.

[Use unsigned int instead to generate better code - yoshfuji]

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/addrconf.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 3a835578fd1c..c3b20c5afa3e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2027,7 +2027,7 @@ err_exit:
  *	Manual configuration of address on an interface
  */
 static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
-			  int plen, __u8 ifa_flags, __u32 prefered_lft,
+			  unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
 			  __u32 valid_lft)
 {
 	struct inet6_ifaddr *ifp;
@@ -2039,6 +2039,9 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
 
 	ASSERT_RTNL();
 
+	if (plen > 128)
+		return -EINVAL;
+
 	/* check the lifetime */
 	if (!valid_lft || prefered_lft > valid_lft)
 		return -EINVAL;
@@ -2095,12 +2098,15 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
 }
 
 static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx,
-			  int plen)
+			  unsigned int plen)
 {
 	struct inet6_ifaddr *ifp;
 	struct inet6_dev *idev;
 	struct net_device *dev;
 
+	if (plen > 128)
+		return -EINVAL;
+
 	dev = __dev_get_by_index(net, ifindex);
 	if (!dev)
 		return -ENODEV;
-- 
cgit v1.2.3


From 82836372311a5cbf9cc5f4f47f9b56cb9edfe90d Mon Sep 17 00:00:00 2001
From: Colin <colins@sjtu.edu.cn>
Date: Tue, 27 May 2008 00:04:43 +0800
Subject: [IPV6] TUNNEL6: Fix incoming packet length check for inter-protocol
 tunnel.

I discover a strange behavior in [ipv4 in ipv6] tunnel. When IPv6 tunnel
payload is less than 40(0x28), packet can be sent to network, received in
physical interface, but not seen in IP tunnel interface. No counter increase
in tunnel interface.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/tunnel6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index 6323921b40be..669f280989c3 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -109,7 +109,7 @@ static int tunnel46_rcv(struct sk_buff *skb)
 {
 	struct xfrm6_tunnel *handler;
 
-	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 		goto drop;
 
 	for (handler = tunnel46_handlers; handler; handler = handler->next)
-- 
cgit v1.2.3


From baa2bfb8aef24bb7fe1875b256918724b3884662 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Fri, 30 May 2008 11:35:03 +0900
Subject: [IPV4] TUNNEL4: Fix incoming packet length check for inter-protocol
 tunnel.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv4/tunnel4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index d3b709a6f264..cb1f0e83830b 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -97,7 +97,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
 {
 	struct xfrm_tunnel *handler;
 
-	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		goto drop;
 
 	for (handler = tunnel64_handlers; handler; handler = handler->next)
-- 
cgit v1.2.3


From 4bed72e4f5502ea3322f0a00794815fa58951abe Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Tue, 27 May 2008 17:37:49 +0900
Subject: [IPV6] ADDRCONF: Allow longer lifetime on 64bit archs.

- Allow longer lifetimes (>= 0x7fffffff/HZ) on 64bit archs
  by using unsigned long.
- Shadow this arithmetic overflow workaround by introducing
  helper functions: addrconf_timeout_fixup() and
  addrconf_finite_timeout().

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/addrconf.c | 97 +++++++++++++++++++++++++++--------------------------
 net/ipv6/route.c    | 12 ++-----
 2 files changed, 53 insertions(+), 56 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c3b20c5afa3e..147588f4c7c0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -731,8 +731,13 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 						onlink = -1;
 
 					spin_lock(&ifa->lock);
-					lifetime = min_t(unsigned long,
-							 ifa->valid_lft, 0x7fffffffUL/HZ);
+
+					lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ);
+					/*
+					 * Note: Because this address is
+					 * not permanent, lifetime <
+					 * LONG_MAX / HZ here.
+					 */
 					if (time_before(expires,
 							ifa->tstamp + lifetime * HZ))
 						expires = ifa->tstamp + lifetime * HZ;
@@ -1722,7 +1727,6 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 	__u32 valid_lft;
 	__u32 prefered_lft;
 	int addr_type;
-	unsigned long rt_expires;
 	struct inet6_dev *in6_dev;
 
 	pinfo = (struct prefix_info *) opt;
@@ -1764,28 +1768,23 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 	 *	2) Configure prefixes with the auto flag set
 	 */
 
-	if (valid_lft == INFINITY_LIFE_TIME)
-		rt_expires = ~0UL;
-	else if (valid_lft >= 0x7FFFFFFF/HZ) {
+	if (pinfo->onlink) {
+		struct rt6_info *rt;
+		unsigned long rt_expires;
+
 		/* Avoid arithmetic overflow. Really, we could
 		 * save rt_expires in seconds, likely valid_lft,
 		 * but it would require division in fib gc, that it
 		 * not good.
 		 */
-		rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ);
-	} else
-		rt_expires = valid_lft * HZ;
+		if (HZ > USER_HZ)
+			rt_expires = addrconf_timeout_fixup(valid_lft, HZ);
+		else
+			rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ);
 
-	/*
-	 * We convert this (in jiffies) to clock_t later.
-	 * Avoid arithmetic overflow there as well.
-	 * Overflow can happen only if HZ < USER_HZ.
-	 */
-	if (HZ < USER_HZ && ~rt_expires && rt_expires > 0x7FFFFFFF / USER_HZ)
-		rt_expires = 0x7FFFFFFF / USER_HZ;
+		if (addrconf_finite_timeout(rt_expires))
+			rt_expires *= HZ;
 
-	if (pinfo->onlink) {
-		struct rt6_info *rt;
 		rt = rt6_lookup(dev_net(dev), &pinfo->prefix, NULL,
 				dev->ifindex, 1);
 
@@ -1794,7 +1793,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 			if (valid_lft == 0) {
 				ip6_del_rt(rt);
 				rt = NULL;
-			} else if (~rt_expires) {
+			} else if (addrconf_finite_timeout(rt_expires)) {
 				/* not infinity */
 				rt->rt6i_expires = jiffies + rt_expires;
 				rt->rt6i_flags |= RTF_EXPIRES;
@@ -1803,9 +1802,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 				rt->rt6i_expires = 0;
 			}
 		} else if (valid_lft) {
-			int flags = RTF_ADDRCONF | RTF_PREFIX_RT;
 			clock_t expires = 0;
-			if (~rt_expires) {
+			int flags = RTF_ADDRCONF | RTF_PREFIX_RT;
+			if (addrconf_finite_timeout(rt_expires)) {
 				/* not infinity */
 				flags |= RTF_EXPIRES;
 				expires = jiffies_to_clock_t(rt_expires);
@@ -2036,6 +2035,7 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
 	int scope;
 	u32 flags;
 	clock_t expires;
+	unsigned long timeout;
 
 	ASSERT_RTNL();
 
@@ -2055,22 +2055,23 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
 
 	scope = ipv6_addr_scope(pfx);
 
-	if (valid_lft == INFINITY_LIFE_TIME) {
-		ifa_flags |= IFA_F_PERMANENT;
-		flags = 0;
-		expires = 0;
-	} else {
-		if (valid_lft >= 0x7FFFFFFF/HZ)
-			valid_lft = 0x7FFFFFFF/HZ;
+	timeout = addrconf_timeout_fixup(valid_lft, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		expires = jiffies_to_clock_t(timeout * HZ);
+		valid_lft = timeout;
 		flags = RTF_EXPIRES;
-		expires = jiffies_to_clock_t(valid_lft * HZ);
+	} else {
+		expires = 0;
+		flags = 0;
+		ifa_flags |= IFA_F_PERMANENT;
 	}
 
-	if (prefered_lft == 0)
-		ifa_flags |= IFA_F_DEPRECATED;
-	else if ((prefered_lft >= 0x7FFFFFFF/HZ) &&
-		 (prefered_lft != INFINITY_LIFE_TIME))
-		prefered_lft = 0x7FFFFFFF/HZ;
+	timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		if (timeout == 0)
+			ifa_flags |= IFA_F_DEPRECATED;
+		prefered_lft = timeout;
+	}
 
 	ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags);
 
@@ -3175,26 +3176,28 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
 {
 	u32 flags;
 	clock_t expires;
+	unsigned long timeout;
 
 	if (!valid_lft || (prefered_lft > valid_lft))
 		return -EINVAL;
 
-	if (valid_lft == INFINITY_LIFE_TIME) {
-		ifa_flags |= IFA_F_PERMANENT;
-		flags = 0;
-		expires = 0;
-	} else {
-		if (valid_lft >= 0x7FFFFFFF/HZ)
-			valid_lft = 0x7FFFFFFF/HZ;
+	timeout = addrconf_timeout_fixup(valid_lft, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		expires = jiffies_to_clock_t(timeout * HZ);
+		valid_lft = timeout;
 		flags = RTF_EXPIRES;
-		expires = jiffies_to_clock_t(valid_lft * HZ);
+	} else {
+		expires = 0;
+		flags = 0;
+		ifa_flags |= IFA_F_PERMANENT;
 	}
 
-	if (prefered_lft == 0)
-		ifa_flags |= IFA_F_DEPRECATED;
-	else if ((prefered_lft >= 0x7FFFFFFF/HZ) &&
-		 (prefered_lft != INFINITY_LIFE_TIME))
-		prefered_lft = 0x7FFFFFFF/HZ;
+	timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		if (timeout == 0)
+			ifa_flags |= IFA_F_DEPRECATED;
+		prefered_lft = timeout;
+	}
 
 	spin_lock_bh(&ifp->lock);
 	ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 48534c6c0735..220cffe9e63b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -446,7 +446,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 	struct route_info *rinfo = (struct route_info *) opt;
 	struct in6_addr prefix_buf, *prefix;
 	unsigned int pref;
-	u32 lifetime;
+	unsigned long lifetime;
 	struct rt6_info *rt;
 
 	if (len < sizeof(struct route_info)) {
@@ -472,13 +472,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
 
-	lifetime = ntohl(rinfo->lifetime);
-	if (lifetime == 0xffffffff) {
-		/* infinity */
-	} else if (lifetime > 0x7fffffff/HZ - 1) {
-		/* Avoid arithmetic overflow */
-		lifetime = 0x7fffffff/HZ - 1;
-	}
+	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 
 	if (rinfo->length == 3)
 		prefix = (struct in6_addr *)rinfo->prefix;
@@ -506,7 +500,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 
 	if (rt) {
-		if (lifetime == 0xffffffff) {
+		if (!addrconf_finite_timeout(lifetime)) {
 			rt->rt6i_flags &= ~RTF_EXPIRES;
 		} else {
 			rt->rt6i_expires = jiffies + HZ * lifetime;
-- 
cgit v1.2.3


From 05335c2220c4911b69cb1bdd79e603ab08088372 Mon Sep 17 00:00:00 2001
From: Yang Hongyang <yanghy@cn.fujitsu.com>
Date: Wed, 28 May 2008 16:23:47 +0800
Subject: [IPV6]: Fix the return value of get destination options with NULL
 data pointer

If we pass NULL data buffer to getsockopt(), it will return 0,
and the option length is set to -EFAULT:
    getsockopt(sk, IPPROTO_IPV6, IPV6_DSTOPTS, NULL, &len);

This is because ipv6_getsockopt_sticky() will return -EFAULT or
-EINVAL if some error occur.

This patch fix this problem.

Signed-off-by: Yang Hongyang <yanghy@cn.fujitsu.com>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/ipv6_sockglue.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 56d55fecf8ec..aa7bedf780e5 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -975,6 +975,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		len = ipv6_getsockopt_sticky(sk, np->opt,
 					     optname, optval, len);
 		release_sock(sk);
+		/* check if ipv6_getsockopt_sticky() returns err code */
+		if (len < 0)
+			return len;
 		return put_user(len, optlen);
 	}
 
-- 
cgit v1.2.3


From 95b496b66615d8c43f77702049b1bd01e2f06595 Mon Sep 17 00:00:00 2001
From: Yang Hongyang <yanghy@cn.fujitsu.com>
Date: Wed, 28 May 2008 16:27:28 +0800
Subject: [IPV6]: Fix the data length of get destination options with short
 length

 If get destination options with length which is not enough for that
option,getsockopt() will still return the real length of the option,
which is larger then the buffer space.
 This is because ipv6_getsockopt_sticky() returns the real length of
the option.

This patch fix this problem.

Signed-off-by: Yang Hongyang <yanghy@cn.fujitsu.com>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/ipv6_sockglue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index aa7bedf780e5..9293b9f0ac23 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -832,7 +832,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
 	len = min_t(unsigned int, len, ipv6_optlen(hdr));
 	if (copy_to_user(optval, hdr, len))
 		return -EFAULT;
-	return ipv6_optlen(hdr);
+	return len;
 }
 
 static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
-- 
cgit v1.2.3


From 187e38384c4abfbbb1b880fab234d16c2df23a25 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 4 Jun 2008 13:01:37 +0900
Subject: [IPV6]: Check outgoing interface even if source address is
 unspecified.

The outgoing interface index (ipi6_ifindex) in IPV6_PKTINFO
ancillary data, is not checked if the source address (ipi6_addr)
is unspecified.  If the ipi6_ifindex is the not-exist interface,
it should be fail.

Based on patch from Shan Wei <shanwei@cn.fujitsu.com> and
Brian Haley <brian.haley@hp.com>.

Signed-off-by: Shan Wei <shanwei@cn.fujitsu.com>
Signed-off-by: Brian Haley <brian.haley@hp.com>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/datagram.c | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 94fa6ae77cfe..53e3883f7666 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -509,7 +509,6 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
 
 	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
 		int addr_type;
-		struct net_device *dev = NULL;
 
 		if (!CMSG_OK(msg, cmsg)) {
 			err = -EINVAL;
@@ -522,6 +521,9 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
 		switch (cmsg->cmsg_type) {
 		case IPV6_PKTINFO:
 		case IPV6_2292PKTINFO:
+		    {
+			struct net_device *dev = NULL;
+
 			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
 				err = -EINVAL;
 				goto exit_f;
@@ -535,32 +537,32 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
 				fl->oif = src_info->ipi6_ifindex;
 			}
 
-			addr_type = ipv6_addr_type(&src_info->ipi6_addr);
+			addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
 
-			if (addr_type == IPV6_ADDR_ANY)
-				break;
+			if (fl->oif) {
+				dev = dev_get_by_index(&init_net, fl->oif);
+				if (!dev)
+					return -ENODEV;
+			} else if (addr_type & IPV6_ADDR_LINKLOCAL)
+				return -EINVAL;
 
-			if (addr_type & IPV6_ADDR_LINKLOCAL) {
-				if (!src_info->ipi6_ifindex)
-					return -EINVAL;
-				else {
-					dev = dev_get_by_index(&init_net, src_info->ipi6_ifindex);
-					if (!dev)
-						return -ENODEV;
-				}
-			}
-			if (!ipv6_chk_addr(&init_net, &src_info->ipi6_addr,
-					   dev, 0)) {
-				if (dev)
-					dev_put(dev);
-				err = -EINVAL;
-				goto exit_f;
+			if (addr_type != IPV6_ADDR_ANY) {
+				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
+				if (!ipv6_chk_addr(&init_net, &src_info->ipi6_addr,
+						   strict ? dev : NULL, 0))
+					err = -EINVAL;
+				else
+					ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr);
 			}
+
 			if (dev)
 				dev_put(dev);
 
-			ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr);
+			if (err)
+				goto exit_f;
+
 			break;
+		    }
 
 		case IPV6_FLOWINFO:
 			if (cmsg->cmsg_len < CMSG_LEN(4)) {
-- 
cgit v1.2.3


From 91e1908f569dd96a25a3947de8771e6cc93999dd Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 4 Jun 2008 13:02:49 +0900
Subject: [IPV6] NETNS: Handle ancillary data in appropriate namespace.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/datagram.c      | 7 ++++---
 net/ipv6/ip6_flowlabel.c | 2 +-
 net/ipv6/ipv6_sockglue.c | 2 +-
 net/ipv6/raw.c           | 2 +-
 net/ipv6/udp.c           | 2 +-
 5 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 53e3883f7666..b9c2de84a8a2 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -496,7 +496,8 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
 	return 0;
 }
 
-int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
+int datagram_send_ctl(struct net *net,
+		      struct msghdr *msg, struct flowi *fl,
 		      struct ipv6_txoptions *opt,
 		      int *hlimit, int *tclass)
 {
@@ -540,7 +541,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
 			addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
 
 			if (fl->oif) {
-				dev = dev_get_by_index(&init_net, fl->oif);
+				dev = dev_get_by_index(net, fl->oif);
 				if (!dev)
 					return -ENODEV;
 			} else if (addr_type & IPV6_ADDR_LINKLOCAL)
@@ -548,7 +549,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
 
 			if (addr_type != IPV6_ADDR_ANY) {
 				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
-				if (!ipv6_chk_addr(&init_net, &src_info->ipi6_addr,
+				if (!ipv6_chk_addr(net, &src_info->ipi6_addr,
 						   strict ? dev : NULL, 0))
 					err = -EINVAL;
 				else
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index eb7a940310f4..37a4e777e347 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -354,7 +354,7 @@ fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval,
 		msg.msg_control = (void*)(fl->opt+1);
 		flowi.oif = 0;
 
-		err = datagram_send_ctl(&msg, &flowi, fl->opt, &junk, &junk);
+		err = datagram_send_ctl(net, &msg, &flowi, fl->opt, &junk, &junk);
 		if (err)
 			goto done;
 		err = -EINVAL;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 9293b9f0ac23..3eef8e5b3636 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -416,7 +416,7 @@ sticky_done:
 		msg.msg_controllen = optlen;
 		msg.msg_control = (void*)(opt+1);
 
-		retv = datagram_send_ctl(&msg, &fl, opt, &junk, &junk);
+		retv = datagram_send_ctl(net, &msg, &fl, opt, &junk, &junk);
 		if (retv)
 			goto done;
 update:
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 232e0dc45bf5..603df76e0522 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -813,7 +813,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		memset(opt, 0, sizeof(struct ipv6_txoptions));
 		opt->tot_len = sizeof(struct ipv6_txoptions);
 
-		err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass);
+		err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, &tclass);
 		if (err < 0) {
 			fl6_sock_release(flowlabel);
 			return err;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 47123bf5eb0f..1b35c4722004 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -731,7 +731,7 @@ do_udp_sendmsg:
 		memset(opt, 0, sizeof(struct ipv6_txoptions));
 		opt->tot_len = sizeof(*opt);
 
-		err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass);
+		err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, &tclass);
 		if (err < 0) {
 			fl6_sock_release(flowlabel);
 			return err;
-- 
cgit v1.2.3


From 49d074f4009a7b5ce9c17b040f978abcb4d7f6f6 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 4 Jun 2008 15:49:06 +0400
Subject: [IPV6]: Do not change protocol for raw IPv6 sockets.

It is not allowed to change underlying protocol for
   int fd = socket(PF_INET6, SOCK_RAW, IPPROTO_UDP);

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/ipv6_sockglue.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3eef8e5b3636..1afe210d6286 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -161,6 +161,9 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			struct ipv6_txoptions *opt;
 			struct sk_buff *pktopt;
 
+			if (sk->sk_type == SOCK_RAW)
+				break;
+
 			if (sk->sk_protocol != IPPROTO_UDP &&
 			    sk->sk_protocol != IPPROTO_UDPLITE &&
 			    sk->sk_protocol != IPPROTO_TCP)
-- 
cgit v1.2.3


From 36d926b94a9908937593e5669162305a071b9cc3 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 4 Jun 2008 15:49:07 +0400
Subject: [IPV6]: inet_sk(sk)->cork.opt leak

IPv6 UDP sockets wth IPv4 mapped address use udp_sendmsg to send the data
actually. In this case ip_flush_pending_frames should be called instead
of ip6_flush_pending_frames.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv4/udp.c | 3 ++-
 net/ipv6/udp.c | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index db1cb7c96d63..56fcda3694ba 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -420,7 +420,7 @@ void udp_err(struct sk_buff *skb, u32 info)
 /*
  * Throw away all pending data and cancel the corking. Socket is locked.
  */
-static void udp_flush_pending_frames(struct sock *sk)
+void udp_flush_pending_frames(struct sock *sk)
 {
 	struct udp_sock *up = udp_sk(sk);
 
@@ -430,6 +430,7 @@ static void udp_flush_pending_frames(struct sock *sk)
 		ip_flush_pending_frames(sk);
 	}
 }
+EXPORT_SYMBOL(udp_flush_pending_frames);
 
 /**
  * 	udp4_hwcsum_outgoing  -  handle outgoing HW checksumming
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1b35c4722004..dd309626ae9a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -534,7 +534,9 @@ static void udp_v6_flush_pending_frames(struct sock *sk)
 {
 	struct udp_sock *up = udp_sk(sk);
 
-	if (up->pending) {
+	if (up->pending == AF_INET)
+		udp_flush_pending_frames(sk);
+	else if (up->pending) {
 		up->len = 0;
 		up->pending = 0;
 		ip6_flush_pending_frames(sk);
-- 
cgit v1.2.3


From 9596cc826e2e52bfc318ca37a6c52fe3d72990a3 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 4 Jun 2008 15:49:08 +0400
Subject: [IPV6]: Do not change protocol for UDPv6 sockets with pending sent
 data.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/ipv6_sockglue.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 1afe210d6286..26b83e512a09 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -164,9 +164,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			if (sk->sk_type == SOCK_RAW)
 				break;
 
-			if (sk->sk_protocol != IPPROTO_UDP &&
-			    sk->sk_protocol != IPPROTO_UDPLITE &&
-			    sk->sk_protocol != IPPROTO_TCP)
+			if (sk->sk_protocol == IPPROTO_UDP ||
+			    sk->sk_protocol == IPPROTO_UDPLITE) {
+				struct udp_sock *up = udp_sk(sk);
+				if (up->pending == AF_INET6) {
+					retv = -EBUSY;
+					break;
+				}
+			} else if (sk->sk_protocol != IPPROTO_TCP)
 				break;
 
 			if (sk->sk_state != TCP_ESTABLISHED) {
-- 
cgit v1.2.3


From a13366c632132bb9f8f2950a79773d8f68f4871e Mon Sep 17 00:00:00 2001
From: Adrian-Ken Rueegsegger <rueegsegger@swiss-it.ch>
Date: Wed, 4 Jun 2008 12:04:55 -0700
Subject: xfrm: xfrm_algo: correct usage of RIPEMD-160

This patch fixes the usage of RIPEMD-160 in xfrm_algo which in turn
allows hmac(rmd160) to be used as authentication mechanism in IPsec
ESP and AH (see RFC 2857).

Signed-off-by: Adrian-Ken Rueegsegger <rueegsegger@swiss-it.ch>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_algo.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index ac765dd9c7f5..23a2cc04b8cd 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -200,8 +200,8 @@ static struct xfrm_algo_desc aalg_list[] = {
 	}
 },
 {
-	.name = "hmac(ripemd160)",
-	.compat = "ripemd160",
+	.name = "hmac(rmd160)",
+	.compat = "rmd160",
 
 	.uinfo = {
 		.auth = {
-- 
cgit v1.2.3


From a6604471db5e7a33474a7f16c64d6b118fae3e74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Wed, 4 Jun 2008 12:07:44 -0700
Subject: tcp: fix skb vs fack_count out-of-sync condition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This bug is able to corrupt fackets_out in very rare cases.
In order for this to cause corruption:
  1) DSACK in the middle of previous SACK block must be generated.
  2) In order to take that particular branch, part or all of the
     DSACKed segment must already be SACKed so that we have that
     in cache in the first place.
  3) The new info must be top enough so that fackets_out will be
     updated on this iteration.
...then fack_count is updated while skb wasn't, then we walk again
that particular segment thus updating fack_count twice for
a single skb and finally that value is assigned to fackets_out
by tcp_sacktag_one.

It is safe to call tcp_sacktag_one just once for a segment (at
DSACK), no need to call again for plain SACK.

Potential problem of the miscount are limited to premature entry
to recovery and to inflated reordering metric (which could even
cancel each other out in the most the luckiest scenarios :-)).
Both are quite insignificant in worst case too and there exists
also code to reset them (fackets_out once sacked_out becomes zero
and reordering metric on RTO).

This has been reported by a number of people, because it occurred
quite rarely, it has been very evasive. Andy Furniss was able to
get it to occur couple of times so that a bit more info was
collected about the problem using a debug patch, though it still
required lot of checking around. Thanks also to others who have
tried to help here.

This is listed as Bugzilla #10346. The bug was introduced by
me in commit 68f8353b48 ([TCP]: Rewrite SACK block processing &
sack_recv_cache use), I probably thought back then that there's
need to scan that entry twice or didn't dare to make it go
through it just once there. Going through twice would have
required restoring fack_count after the walk but as noted above,
I chose to drop the additional walk step altogether here.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 54a0b7412782..eba873e9b560 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1392,9 +1392,9 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
 
 	if (before(next_dup->start_seq, skip_to_seq)) {
 		skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count);
-		tcp_sacktag_walk(skb, sk, NULL,
-				 next_dup->start_seq, next_dup->end_seq,
-				 1, fack_count, reord, flag);
+		skb = tcp_sacktag_walk(skb, sk, NULL,
+				     next_dup->start_seq, next_dup->end_seq,
+				     1, fack_count, reord, flag);
 	}
 
 	return skb;
-- 
cgit v1.2.3


From 4141ddc02a92a6e3e5793601554c6033e83c25b9 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Wed, 4 Jun 2008 12:37:33 -0700
Subject: sctp: retran_path update bug fix

If the current retran_path is the only active one, it should
update it to the the next inactive one.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/associola.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index b4cd2b71953f..532634861db1 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1203,6 +1203,9 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
 	struct list_head *head = &asoc->peer.transport_addr_list;
 	struct list_head *pos;
 
+	if (asoc->peer.transport_count == 1)
+		return;
+
 	/* Find the next transport in a round-robin fashion. */
 	t = asoc->peer.retran_path;
 	pos = &t->transports;
@@ -1217,6 +1220,15 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
 
 		t = list_entry(pos, struct sctp_transport, transports);
 
+		/* We have exhausted the list, but didn't find any
+		 * other active transports.  If so, use the next
+		 * transport.
+		 */
+		if (t == asoc->peer.retran_path) {
+			t = next;
+			break;
+		}
+
 		/* Try to find an active transport. */
 
 		if ((t->state == SCTP_ACTIVE) ||
@@ -1229,15 +1241,6 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
 			if (!next)
 				next = t;
 		}
-
-		/* We have exhausted the list, but didn't find any
-		 * other active transports.  If so, use the next
-		 * transport.
-		 */
-		if (t == asoc->peer.retran_path) {
-			t = next;
-			break;
-		}
 	}
 
 	asoc->peer.retran_path = t;
-- 
cgit v1.2.3


From 159c6bea37c54dfae44409467e0f17600722d541 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Wed, 4 Jun 2008 12:38:07 -0700
Subject: sctp: Move sctp_v4_dst_saddr out of loop

There's no need to execute sctp_v4_dst_saddr() for each
iteration, just move it out of loop.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/protocol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 13ee7fa92e07..56bdaf7fc425 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -470,11 +470,11 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
 		/* Walk through the bind address list and look for a bind
 		 * address that matches the source address of the returned dst.
 		 */
+		sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port));
 		rcu_read_lock();
 		list_for_each_entry_rcu(laddr, &bp->address_list, list) {
 			if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC))
 				continue;
-			sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port));
 			if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
 				goto out_unlock;
 		}
-- 
cgit v1.2.3


From a6465234814efda9ed1dccdba852953f7508e827 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Wed, 4 Jun 2008 12:38:43 -0700
Subject: sctp: Correctly implement Fast Recovery cwnd manipulations.

Correctly keep track of Fast Recovery state and do not reduce
congestion window multiple times during sucht state.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Tested-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/transport.c | 44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 62082e7b7972..9647fb277221 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -79,6 +79,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 	peer->rttvar = 0;
 	peer->srtt = 0;
 	peer->rto_pending = 0;
+	peer->fast_recovery = 0;
 
 	peer->last_time_heard = jiffies;
 	peer->last_time_used = jiffies;
@@ -403,11 +404,16 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport,
 	cwnd = transport->cwnd;
 	flight_size = transport->flight_size;
 
+	/* See if we need to exit Fast Recovery first */
+	if (transport->fast_recovery &&
+	    TSN_lte(transport->fast_recovery_exit, sack_ctsn))
+		transport->fast_recovery = 0;
+
 	/* The appropriate cwnd increase algorithm is performed if, and only
-	 * if the cumulative TSN has advanced and the congestion window is
+	 * if the cumulative TSN whould advanced and the congestion window is
 	 * being fully utilized.
 	 */
-	if ((transport->asoc->ctsn_ack_point >= sack_ctsn) ||
+	if (TSN_lte(sack_ctsn, transport->asoc->ctsn_ack_point) ||
 	    (flight_size < cwnd))
 		return;
 
@@ -416,17 +422,23 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport,
 	pmtu = transport->asoc->pathmtu;
 
 	if (cwnd <= ssthresh) {
-		/* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less
-		 * than or equal to ssthresh an SCTP endpoint MUST use the
-		 * slow start algorithm to increase cwnd only if the current
-		 * congestion window is being fully utilized and an incoming
-		 * SACK advances the Cumulative TSN Ack Point. Only when these
-		 * two conditions are met can the cwnd be increased otherwise
-		 * the cwnd MUST not be increased. If these conditions are met
-		 * then cwnd MUST be increased by at most the lesser of
-		 * 1) the total size of the previously outstanding DATA
-		 * chunk(s) acknowledged, and 2) the destination's path MTU.
+		/* RFC 4960 7.2.1
+		 * o  When cwnd is less than or equal to ssthresh, an SCTP
+		 *    endpoint MUST use the slow-start algorithm to increase
+		 *    cwnd only if the current congestion window is being fully
+		 *    utilized, an incoming SACK advances the Cumulative TSN
+		 *    Ack Point, and the data sender is not in Fast Recovery.
+		 *    Only when these three conditions are met can the cwnd be
+		 *    increased; otherwise, the cwnd MUST not be increased.
+		 *    If these conditions are met, then cwnd MUST be increased
+		 *    by, at most, the lesser of 1) the total size of the
+		 *    previously outstanding DATA chunk(s) acknowledged, and
+		 *    2) the destination's path MTU.  This upper bound protects
+		 *    against the ACK-Splitting attack outlined in [SAVAGE99].
 		 */
+		if (transport->fast_recovery)
+			return;
+
 		if (bytes_acked > pmtu)
 			cwnd += pmtu;
 		else
@@ -502,6 +514,13 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
 		 *      cwnd = ssthresh
 		 *      partial_bytes_acked = 0
 		 */
+		if (transport->fast_recovery)
+			return;
+
+		/* Mark Fast recovery */
+		transport->fast_recovery = 1;
+		transport->fast_recovery_exit = transport->asoc->next_tsn - 1;
+
 		transport->ssthresh = max(transport->cwnd/2,
 					  4*transport->asoc->pathmtu);
 		transport->cwnd = transport->ssthresh;
@@ -586,6 +605,7 @@ void sctp_transport_reset(struct sctp_transport *t)
 	t->flight_size = 0;
 	t->error_count = 0;
 	t->rto_pending = 0;
+	t->fast_recovery = 0;
 
 	/* Initialize the state information for SFR-CACC */
 	t->cacc.changeover_active = 0;
-- 
cgit v1.2.3


From 62aeaff5ccd96462b7077046357a6d7886175a57 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Wed, 4 Jun 2008 12:39:11 -0700
Subject: sctp: Start T3-RTX timer when fast retransmitting lowest TSN

When we are trying to fast retransmit the lowest outstanding TSN, we
need to restart the T3-RTX timer, so that subsequent timeouts will
correctly tag all the packets necessary for retransmissions.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Tested-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c  | 42 +++++++++++++++++++++++++++++++-----------
 net/sctp/transport.c |  4 ++--
 2 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 59edfd25a19c..5d3c441e84d3 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -208,6 +208,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
 	INIT_LIST_HEAD(&q->sacked);
 	INIT_LIST_HEAD(&q->abandoned);
 
+	q->fast_rtx = 0;
 	q->outstanding_bytes = 0;
 	q->empty = 1;
 	q->cork  = 0;
@@ -500,6 +501,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
 	case SCTP_RTXR_FAST_RTX:
 		SCTP_INC_STATS(SCTP_MIB_FAST_RETRANSMITS);
 		sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX);
+		q->fast_rtx = 1;
 		break;
 	case SCTP_RTXR_PMTUD:
 		SCTP_INC_STATS(SCTP_MIB_PMTUD_RETRANSMITS);
@@ -543,10 +545,13 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 	sctp_xmit_t status;
 	struct sctp_chunk *chunk, *chunk1;
 	struct sctp_association *asoc;
+	int fast_rtx;
 	int error = 0;
+	int timer = 0;
 
 	asoc = q->asoc;
 	lqueue = &q->retransmit;
+	fast_rtx = q->fast_rtx;
 
 	/* RFC 2960 6.3.3 Handle T3-rtx Expiration
 	 *
@@ -587,13 +592,12 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 		switch (status) {
 		case SCTP_XMIT_PMTU_FULL:
 			/* Send this packet.  */
-			if ((error = sctp_packet_transmit(pkt)) == 0)
-				*start_timer = 1;
+			error = sctp_packet_transmit(pkt);
 
 			/* If we are retransmitting, we should only
 			 * send a single packet.
 			 */
-			if (rtx_timeout) {
+			if (rtx_timeout || fast_rtx) {
 				list_add(lchunk, lqueue);
 				lchunk = NULL;
 			}
@@ -603,8 +607,7 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 
 		case SCTP_XMIT_RWND_FULL:
 			/* Send this packet. */
-			if ((error = sctp_packet_transmit(pkt)) == 0)
-				*start_timer = 1;
+			error = sctp_packet_transmit(pkt);
 
 			/* Stop sending DATA as there is no more room
 			 * at the receiver.
@@ -615,8 +618,7 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 
 		case SCTP_XMIT_NAGLE_DELAY:
 			/* Send this packet. */
-			if ((error = sctp_packet_transmit(pkt)) == 0)
-				*start_timer = 1;
+			error = sctp_packet_transmit(pkt);
 
 			/* Stop sending DATA because of nagle delay. */
 			list_add(lchunk, lqueue);
@@ -635,7 +637,14 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 			if (chunk->fast_retransmit > 0)
 				chunk->fast_retransmit = -1;
 
-			*start_timer = 1;
+			/* Force start T3-rtx timer when fast retransmitting
+			 * the earliest outstanding TSN
+			 */
+			if (!timer && fast_rtx &&
+			    ntohl(chunk->subh.data_hdr->tsn) ==
+					     asoc->ctsn_ack_point + 1)
+				timer = 2;
+
 			q->empty = 0;
 
 			/* Retrieve a new chunk to bundle. */
@@ -643,12 +652,16 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 			break;
 		}
 
+		/* Set the timer if there were no errors */
+		if (!error && !timer)
+			timer = 1;
+
 		/* If we are here due to a retransmit timeout or a fast
 		 * retransmit and if there are any chunks left in the retransmit
 		 * queue that could not fit in the PMTU sized packet, they need
 		 * to be marked as ineligible for a subsequent fast retransmit.
 		 */
-		if (rtx_timeout && !lchunk) {
+		if (rtx_timeout && fast_rtx) {
 			list_for_each_entry(chunk1, lqueue, transmitted_list) {
 				if (chunk1->fast_retransmit > 0)
 					chunk1->fast_retransmit = -1;
@@ -656,6 +669,12 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 		}
 	}
 
+	*start_timer = timer;
+
+	/* Clear fast retransmit hint */
+	if (fast_rtx)
+		q->fast_rtx = 0;
+
 	return error;
 }
 
@@ -862,7 +881,8 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 						    rtx_timeout, &start_timer);
 
 			if (start_timer)
-				sctp_transport_reset_timers(transport);
+				sctp_transport_reset_timers(transport,
+							    start_timer-1);
 
 			/* This can happen on COOKIE-ECHO resend.  Only
 			 * one chunk can get bundled with a COOKIE-ECHO.
@@ -977,7 +997,7 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			list_add_tail(&chunk->transmitted_list,
 				      &transport->transmitted);
 
-			sctp_transport_reset_timers(transport);
+			sctp_transport_reset_timers(transport, start_timer-1);
 
 			q->empty = 0;
 
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 9647fb277221..3f34f61221ec 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -191,7 +191,7 @@ static void sctp_transport_destroy(struct sctp_transport *transport)
 /* Start T3_rtx timer if it is not already running and update the heartbeat
  * timer.  This routine is called every time a DATA chunk is sent.
  */
-void sctp_transport_reset_timers(struct sctp_transport *transport)
+void sctp_transport_reset_timers(struct sctp_transport *transport, int force)
 {
 	/* RFC 2960 6.3.2 Retransmission Timer Rules
 	 *
@@ -201,7 +201,7 @@ void sctp_transport_reset_timers(struct sctp_transport *transport)
 	 * address.
 	 */
 
-	if (!timer_pending(&transport->T3_rtx_timer))
+	if (force || !timer_pending(&transport->T3_rtx_timer))
 		if (!mod_timer(&transport->T3_rtx_timer,
 			       jiffies + transport->rto))
 			sctp_transport_hold(transport);
-- 
cgit v1.2.3


From 8b750ce54bd8ab5f75d519ee450e1b0c5226ebe9 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Wed, 4 Jun 2008 12:39:36 -0700
Subject: sctp: Flush the queue only once during fast retransmit.

When fast retransmit is triggered by a sack, we should flush the queue
only once so that only 1 retransmit happens.  Also, since we could
potentially have non-fast-rtx chunks on the retransmit queue, we need
make sure any chunks eligable for fast retransmit are sent first
during fast retransmission.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Tested-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 82 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 48 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 5d3c441e84d3..ace6770e9048 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -520,9 +520,15 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
 	 * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by
 	 * following the procedures outlined in C1 - C5.
 	 */
-	sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point);
+	if (reason == SCTP_RTXR_T3_RTX)
+		sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point);
 
-	error = sctp_outq_flush(q, /* rtx_timeout */ 1);
+	/* Flush the queues only on timeout, since fast_rtx is only
+	 * triggered during sack processing and the queue
+	 * will be flushed at the end.
+	 */
+	if (reason != SCTP_RTXR_FAST_RTX)
+		error = sctp_outq_flush(q, /* rtx_timeout */ 1);
 
 	if (error)
 		q->asoc->base.sk->sk_err = -error;
@@ -540,7 +546,6 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 			       int rtx_timeout, int *start_timer)
 {
 	struct list_head *lqueue;
-	struct list_head *lchunk;
 	struct sctp_transport *transport = pkt->transport;
 	sctp_xmit_t status;
 	struct sctp_chunk *chunk, *chunk1;
@@ -548,12 +553,16 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 	int fast_rtx;
 	int error = 0;
 	int timer = 0;
+	int done = 0;
 
 	asoc = q->asoc;
 	lqueue = &q->retransmit;
 	fast_rtx = q->fast_rtx;
 
-	/* RFC 2960 6.3.3 Handle T3-rtx Expiration
+	/* This loop handles time-out retransmissions, fast retransmissions,
+	 * and retransmissions due to opening of whindow.
+	 *
+	 * RFC 2960 6.3.3 Handle T3-rtx Expiration
 	 *
 	 * E3) Determine how many of the earliest (i.e., lowest TSN)
 	 * outstanding DATA chunks for the address for which the
@@ -568,12 +577,12 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 	 * [Just to be painfully clear, if we are retransmitting
 	 * because a timeout just happened, we should send only ONE
 	 * packet of retransmitted data.]
+	 *
+	 * For fast retransmissions we also send only ONE packet.  However,
+	 * if we are just flushing the queue due to open window, we'll
+	 * try to send as much as possible.
 	 */
-	lchunk = sctp_list_dequeue(lqueue);
-
-	while (lchunk) {
-		chunk = list_entry(lchunk, struct sctp_chunk,
-				   transmitted_list);
+	list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) {
 
 		/* Make sure that Gap Acked TSNs are not retransmitted.  A
 		 * simple approach is just to move such TSNs out of the
@@ -581,11 +590,18 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 		 * next chunk.
 		 */
 		if (chunk->tsn_gap_acked) {
-			list_add_tail(lchunk, &transport->transmitted);
-			lchunk = sctp_list_dequeue(lqueue);
+			list_del(&chunk->transmitted_list);
+			list_add_tail(&chunk->transmitted_list,
+					&transport->transmitted);
 			continue;
 		}
 
+		/* If we are doing fast retransmit, ignore non-fast_rtransmit
+		 * chunks
+		 */
+		if (fast_rtx && !chunk->fast_retransmit)
+			continue;
+
 		/* Attempt to append this chunk to the packet. */
 		status = sctp_packet_append_chunk(pkt, chunk);
 
@@ -597,12 +613,10 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 			/* If we are retransmitting, we should only
 			 * send a single packet.
 			 */
-			if (rtx_timeout || fast_rtx) {
-				list_add(lchunk, lqueue);
-				lchunk = NULL;
-			}
+			if (rtx_timeout || fast_rtx)
+				done = 1;
 
-			/* Bundle lchunk in the next round.  */
+			/* Bundle next chunk in the next round.  */
 			break;
 
 		case SCTP_XMIT_RWND_FULL:
@@ -612,8 +626,7 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 			/* Stop sending DATA as there is no more room
 			 * at the receiver.
 			 */
-			list_add(lchunk, lqueue);
-			lchunk = NULL;
+			done = 1;
 			break;
 
 		case SCTP_XMIT_NAGLE_DELAY:
@@ -621,15 +634,16 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 			error = sctp_packet_transmit(pkt);
 
 			/* Stop sending DATA because of nagle delay. */
-			list_add(lchunk, lqueue);
-			lchunk = NULL;
+			done = 1;
 			break;
 
 		default:
 			/* The append was successful, so add this chunk to
 			 * the transmitted list.
 			 */
-			list_add_tail(lchunk, &transport->transmitted);
+			list_del(&chunk->transmitted_list);
+			list_add_tail(&chunk->transmitted_list,
+					&transport->transmitted);
 
 			/* Mark the chunk as ineligible for fast retransmit
 			 * after it is retransmitted.
@@ -646,9 +660,6 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 				timer = 2;
 
 			q->empty = 0;
-
-			/* Retrieve a new chunk to bundle. */
-			lchunk = sctp_list_dequeue(lqueue);
 			break;
 		}
 
@@ -656,16 +667,19 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 		if (!error && !timer)
 			timer = 1;
 
-		/* If we are here due to a retransmit timeout or a fast
-		 * retransmit and if there are any chunks left in the retransmit
-		 * queue that could not fit in the PMTU sized packet, they need
-		 * to be marked as ineligible for a subsequent fast retransmit.
-		 */
-		if (rtx_timeout && fast_rtx) {
-			list_for_each_entry(chunk1, lqueue, transmitted_list) {
-				if (chunk1->fast_retransmit > 0)
-					chunk1->fast_retransmit = -1;
-			}
+		if (done)
+			break;
+	}
+
+	/* If we are here due to a retransmit timeout or a fast
+	 * retransmit and if there are any chunks left in the retransmit
+	 * queue that could not fit in the PMTU sized packet, they need
+	 * to be marked as ineligible for a subsequent fast retransmit.
+	 */
+	if (rtx_timeout || fast_rtx) {
+		list_for_each_entry(chunk1, lqueue, transmitted_list) {
+			if (chunk1->fast_retransmit > 0)
+				chunk1->fast_retransmit = -1;
 		}
 	}
 
-- 
cgit v1.2.3


From b9031d9d87b24e24cd32ea15b5f4220a1e8da909 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Wed, 4 Jun 2008 12:40:15 -0700
Subject: sctp: Fix ECN markings for IPv6

Commit e9df2e8fd8fbc95c57dbd1d33dada66c4627b44c ("[IPV6]: Use
appropriate sock tclass setting for routing lookup.") also changed the
way that ECN capable transports mark this capability in IPv6.  As a
result, SCTP was not marking ECN capablity because the traffic class
was never set.  This patch brings back the markings for IPv6 traffic.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ipv6.c     | 6 ++++++
 net/sctp/output.c   | 2 +-
 net/sctp/protocol.c | 6 ++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e4aac3266fcd..a2f4d4d51593 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -727,6 +727,11 @@ static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
 	seq_printf(seq, NIP6_FMT " ", NIP6(addr->v6.sin6_addr));
 }
 
+static void sctp_v6_ecn_capable(struct sock *sk)
+{
+	inet6_sk(sk)->tclass |= INET_ECN_ECT_0;
+}
+
 /* Initialize a PF_INET6 socket msg_name. */
 static void sctp_inet6_msgname(char *msgname, int *addr_len)
 {
@@ -997,6 +1002,7 @@ static struct sctp_af sctp_af_inet6 = {
 	.skb_iif	   = sctp_v6_skb_iif,
 	.is_ce		   = sctp_v6_is_ce,
 	.seq_dump_addr	   = sctp_v6_seq_dump_addr,
+	.ecn_capable	   = sctp_v6_ecn_capable,
 	.net_header_len	   = sizeof(struct ipv6hdr),
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
 #ifdef CONFIG_COMPAT
diff --git a/net/sctp/output.c b/net/sctp/output.c
index cf4f9fb6819d..6d45bae93b46 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -548,7 +548,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	 * Note: The works for IPv6 layer checks this bit too later
 	 * in transmission.  See IP6_ECN_flow_xmit().
 	 */
-	INET_ECN_xmit(nskb->sk);
+	(*tp->af_specific->ecn_capable)(nskb->sk);
 
 	/* Set up the IP options.  */
 	/* BUG: not implemented
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 56bdaf7fc425..b435a193c5df 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -617,6 +617,11 @@ static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
 	seq_printf(seq, "%d.%d.%d.%d ", NIPQUAD(addr->v4.sin_addr));
 }
 
+static void sctp_v4_ecn_capable(struct sock *sk)
+{
+	INET_ECN_xmit(sk);
+}
+
 /* Event handler for inet address addition/deletion events.
  * The sctp_local_addr_list needs to be protocted by a spin lock since
  * multiple notifiers (say IPv4 and IPv6) may be running at the same
@@ -935,6 +940,7 @@ static struct sctp_af sctp_af_inet = {
 	.skb_iif	   = sctp_v4_skb_iif,
 	.is_ce		   = sctp_v4_is_ce,
 	.seq_dump_addr	   = sctp_v4_seq_dump_addr,
+	.ecn_capable	   = sctp_v4_ecn_capable,
 	.net_header_len	   = sizeof(struct iphdr),
 	.sockaddr_len	   = sizeof(struct sockaddr_in),
 #ifdef CONFIG_COMPAT
-- 
cgit v1.2.3


From 22dd485022f3d0b162ceb5e67d85de7c3806aa20 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 4 Jun 2008 15:16:12 -0700
Subject: raw: Raw socket leak.

The program below just leaks the raw kernel socket

int main() {
        int fd = socket(PF_INET, SOCK_RAW, IPPROTO_UDP);
        struct sockaddr_in addr;

        memset(&addr, 0, sizeof(addr));
        inet_aton("127.0.0.1", &addr.sin_addr);
        addr.sin_family = AF_INET;
        addr.sin_port = htons(2048);
        sendto(fd,  "a", 1, MSG_MORE, &addr, sizeof(addr));
        return 0;
}

Corked packet is allocated via sock_wmalloc which holds the owner socket,
so one should uncork it and flush all pending data on close. Do this in the
same way as in UDP.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Acked-by: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 9 +++++++++
 net/ipv6/raw.c | 9 +++++++++
 2 files changed, 18 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index fead049daf43..e7e091d365ff 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -608,6 +608,14 @@ static void raw_close(struct sock *sk, long timeout)
 	sk_common_release(sk);
 }
 
+static int raw_destroy(struct sock *sk)
+{
+	lock_sock(sk);
+	ip_flush_pending_frames(sk);
+	release_sock(sk);
+	return 0;
+}
+
 /* This gets rid of all the nasties in af_inet. -DaveM */
 static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
@@ -820,6 +828,7 @@ struct proto raw_prot = {
 	.name		   = "RAW",
 	.owner		   = THIS_MODULE,
 	.close		   = raw_close,
+	.destroy	   = raw_destroy,
 	.connect	   = ip4_datagram_connect,
 	.disconnect	   = udp_disconnect,
 	.ioctl		   = raw_ioctl,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 603df76e0522..8fee9a15b2d3 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1164,6 +1164,14 @@ static void rawv6_close(struct sock *sk, long timeout)
 	sk_common_release(sk);
 }
 
+static int raw6_destroy(struct sock *sk)
+{
+	lock_sock(sk);
+	ip6_flush_pending_frames(sk);
+	release_sock(sk);
+	return 0;
+}
+
 static int rawv6_init_sk(struct sock *sk)
 {
 	struct raw6_sock *rp = raw6_sk(sk);
@@ -1187,6 +1195,7 @@ struct proto rawv6_prot = {
 	.name		   = "RAWv6",
 	.owner		   = THIS_MODULE,
 	.close		   = rawv6_close,
+	.destroy	   = raw6_destroy,
 	.connect	   = ip6_datagram_connect,
 	.disconnect	   = udp_disconnect,
 	.ioctl		   = rawv6_ioctl,
-- 
cgit v1.2.3


From 26af65cbeb2467a486ae4fc7242c94e470c67c50 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sri@us.ibm.com>
Date: Wed, 4 Jun 2008 15:19:35 -0700
Subject: tcp: Increment OUTRSTS in tcp_send_active_reset()

TCP "resets sent" counter is not incremented when a TCP Reset is
sent via tcp_send_active_reset().

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e399bde7813a..ad993ecb4810 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2131,6 +2131,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	if (tcp_transmit_skb(sk, skb, 0, priority))
 		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
+
+	TCP_INC_STATS(TCP_MIB_OUTRSTS);
 }
 
 /* WARNING: This routine must only be called when we have already sent
-- 
cgit v1.2.3


From 293ad60401da621b8b329abbe8c388edb25f658a Mon Sep 17 00:00:00 2001
From: Octavian Purdila <opurdila@ixiacom.com>
Date: Wed, 4 Jun 2008 15:45:58 -0700
Subject: tcp: Fix for race due to temporary drop of the socket lock in
 skb_splice_bits.

skb_splice_bits temporary drops the socket lock while iterating over
the socket queue in order to break a reverse locking condition which
happens with sendfile. This, however, opens a window of opportunity
for tcp_collapse() to aggregate skbs and thus potentially free the
current skb used in skb_splice_bits and tcp_read_sock.

This patch fixes the problem by (re-)getting the same "logical skb"
after the lock has been temporary dropped.

Based on idea and initial patch from Evgeniy Polyakov.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Acked-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 5 +++--
 net/ipv4/tcp.c    | 9 ++++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5c459f2b7985..1e556d312117 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1445,6 +1445,7 @@ done:
 
 	if (spd.nr_pages) {
 		int ret;
+		struct sock *sk = __skb->sk;
 
 		/*
 		 * Drop the socket lock, otherwise we have reverse
@@ -1455,9 +1456,9 @@ done:
 		 * we call into ->sendpage() with the i_mutex lock held
 		 * and networking will grab the socket lock.
 		 */
-		release_sock(__skb->sk);
+		release_sock(sk);
 		ret = splice_to_pipe(pipe, &spd);
-		lock_sock(__skb->sk);
+		lock_sock(sk);
 		return ret;
 	}
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f88653138621..ab66683b8043 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1227,7 +1227,14 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 				copied += used;
 				offset += used;
 			}
-			if (offset != skb->len)
+			/*
+			 * If recv_actor drops the lock (e.g. TCP splice
+			 * receive) the skb pointer might be invalid when
+			 * getting here: tcp_collapse might have deleted it
+			 * while aggregating skbs from the socket queue.
+			 */
+			skb = tcp_recv_skb(sk, seq-1, &offset);
+			if (!skb || (offset+1 != skb->len))
 				break;
 		}
 		if (tcp_hdr(skb)->fin) {
-- 
cgit v1.2.3


From ddb2c43594f22843e9f3153da151deaba1a834c5 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Wed, 4 Jun 2008 09:16:33 -0700
Subject: asn1: additional sanity checking during BER decoding

- Don't trust a length which is greater than the working buffer.
  An invalid length could cause overflow when calculating buffer size
  for decoding oid.

- An oid length of zero is invalid and allows for an off-by-one error when
  decoding oid because the first subid actually encodes first 2 subids.

- A primitive encoding may not have an indefinite length.

Thanks to Wei Wang from McAfee for report.

Cc: Steven French <sfrench@us.ibm.com>
Cc: stable@kernel.org
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/ipv4/netfilter/nf_nat_snmp_basic.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 5daefad3d193..7750c97fde7b 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -232,6 +232,11 @@ static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
 			}
 		}
 	}
+
+	/* don't trust len bigger than ctx buffer */
+	if (*len > ctx->end - ctx->pointer)
+		return 0;
+
 	return 1;
 }
 
@@ -250,6 +255,10 @@ static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
 	if (!asn1_length_decode(ctx, &def, &len))
 		return 0;
 
+	/* primitive shall be definite, indefinite shall be constructed */
+	if (*con == ASN1_PRI && !def)
+		return 0;
+
 	if (def)
 		*eoc = ctx->pointer + len;
 	else
@@ -434,6 +443,11 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
 	unsigned long *optr;
 
 	size = eoc - ctx->pointer + 1;
+
+	/* first subid actually encodes first two subids */
+	if (size < 2 || size > ULONG_MAX/sizeof(unsigned long))
+		return 0;
+
 	*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
 	if (*oid == NULL) {
 		if (net_ratelimit())
-- 
cgit v1.2.3


From 507b06d0622480f8026d49a94f86068bb0fd6ed6 Mon Sep 17 00:00:00 2001
From: Dan Williams <dcbw@redhat.com>
Date: Tue, 3 Jun 2008 23:39:55 -0400
Subject: mac80211: send association event on IBSS create

Otherwise userspace has no idea the IBSS creation succeeded.

Signed-off-by: Dan Williams <dcbw@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 841278f1df8e..af375da9df21 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2336,6 +2336,7 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
 	u8 *pos;
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_supported_band *sband;
+	union iwreq_data wrqu;
 
 	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
 
@@ -2479,6 +2480,10 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
 	ifsta->state = IEEE80211_IBSS_JOINED;
 	mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
 
+	memset(&wrqu, 0, sizeof(wrqu));
+	memcpy(wrqu.ap_addr.sa_data, bss->bssid, ETH_ALEN);
+	wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
+
 	return res;
 }
 
-- 
cgit v1.2.3


From ad81b2f97d42e13ef78bb3798e046cd5f0492980 Mon Sep 17 00:00:00 2001
From: Assaf Krauss <assaf.krauss@intel.com>
Date: Wed, 4 Jun 2008 20:27:59 +0300
Subject: mac80211: Fixing slow IBSS rejoin

This patch fixes the issue of slow reconnection to an IBSS cell after
disconnection from it. Now the interface's bssid is reset upon ifdown.

ieee80211_sta_find_ibss:
if (found && memcmp(ifsta->bssid, bssid, ETH_ALEN) != 0 &&
	    (bss = ieee80211_rx_bss_get(dev, bssid,
					local->hw.conf.channel->center_freq,
					ifsta->ssid, ifsta->ssid_len)))

Note:
In general disconnection is still not handled properly in mac80211

Signed-off-by: Assaf Krauss <assaf.krauss@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/main.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 5c876450b14c..98c0b5e56ecc 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -511,6 +511,7 @@ static int ieee80211_stop(struct net_device *dev)
 	case IEEE80211_IF_TYPE_STA:
 	case IEEE80211_IF_TYPE_IBSS:
 		sdata->u.sta.state = IEEE80211_DISABLED;
+		memset(sdata->u.sta.bssid, 0, ETH_ALEN);
 		del_timer_sync(&sdata->u.sta.timer);
 		/*
 		 * When we get here, the interface is marked down.
-- 
cgit v1.2.3


From 872ba53395b2a8be08c3ea2d39e225e5b4a8cb40 Mon Sep 17 00:00:00 2001
From: Dan Williams <dcbw@redhat.com>
Date: Wed, 4 Jun 2008 13:59:34 -0400
Subject: mac80211: decrease IBSS creation latency

Sufficient scans (at least 2 or 3) should have been done within 7
seconds to find an existing IBSS to join.  This should improve IBSS
creation latency; and since IBSS merging is still in effect, shouldn't
have detrimental effects on eventual IBSS convergence.

Signed-off-by: Dan Williams <dcbw@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index af375da9df21..affe42f8484c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -44,7 +44,7 @@
 #define IEEE80211_RETRY_AUTH_INTERVAL (1 * HZ)
 #define IEEE80211_SCAN_INTERVAL (2 * HZ)
 #define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ)
-#define IEEE80211_IBSS_JOIN_TIMEOUT (20 * HZ)
+#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
 
 #define IEEE80211_PROBE_DELAY (HZ / 33)
 #define IEEE80211_CHANNEL_TIME (HZ / 33)
-- 
cgit v1.2.3


From be038b376465953c358d675cb38a611898a49dc2 Mon Sep 17 00:00:00 2001
From: Assaf Krauss <assaf.krauss@intel.com>
Date: Thu, 5 Jun 2008 19:55:21 +0300
Subject: mac80211: Checking IBSS support while changing channel in ad-hoc mode

This patch adds a check to the set_channel flow. When attempting to change
the channel while in IBSS mode, and the new channel does not support IBSS
mode, the flow return with an error value with no consequences on the
mac80211 and driver state.

Signed-off-by: Assaf Krauss <assaf.krauss@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_i.h |  2 +-
 net/mac80211/mlme.c        | 11 ++++-------
 net/mac80211/wext.c        | 15 +++++++++++----
 3 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c7314bf4bec2..006486b26726 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -899,7 +899,7 @@ extern const struct iw_handler_def ieee80211_iw_handler_def;
 
 
 /* ieee80211_ioctl.c */
-int ieee80211_set_freq(struct ieee80211_local *local, int freq);
+int ieee80211_set_freq(struct net_device *dev, int freq);
 /* ieee80211_sta.c */
 void ieee80211_sta_timer(unsigned long data);
 void ieee80211_sta_work(struct work_struct *work);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index affe42f8484c..4d2b582dd055 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2359,13 +2359,10 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
 	sdata->drop_unencrypted = bss->capability &
 		WLAN_CAPABILITY_PRIVACY ? 1 : 0;
 
-	res = ieee80211_set_freq(local, bss->freq);
+	res = ieee80211_set_freq(dev, bss->freq);
 
-	if (local->oper_channel->flags & IEEE80211_CHAN_NO_IBSS) {
-		printk(KERN_DEBUG "%s: IBSS not allowed on frequency "
-		       "%d MHz\n", dev->name, local->oper_channel->center_freq);
-		return -1;
-	}
+	if (res)
+		return res;
 
 	/* Set beacon template */
 	skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
@@ -3491,7 +3488,7 @@ static int ieee80211_sta_config_auth(struct net_device *dev,
 	spin_unlock_bh(&local->sta_bss_lock);
 
 	if (selected) {
-		ieee80211_set_freq(local, selected->freq);
+		ieee80211_set_freq(dev, selected->freq);
 		if (!(ifsta->flags & IEEE80211_STA_SSID_SET))
 			ieee80211_sta_set_ssid(dev, selected->ssid,
 					       selected->ssid_len);
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 8311bb24f9f3..a8bb8e31b1ec 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -290,14 +290,22 @@ static int ieee80211_ioctl_giwmode(struct net_device *dev,
 	return 0;
 }
 
-int ieee80211_set_freq(struct ieee80211_local *local, int freqMHz)
+int ieee80211_set_freq(struct net_device *dev, int freqMHz)
 {
 	int ret = -EINVAL;
 	struct ieee80211_channel *chan;
+	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	chan = ieee80211_get_channel(local->hw.wiphy, freqMHz);
 
 	if (chan && !(chan->flags & IEEE80211_CHAN_DISABLED)) {
+		if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
+		    chan->flags & IEEE80211_CHAN_NO_IBSS) {
+			printk(KERN_DEBUG "%s: IBSS not allowed on frequency "
+				"%d MHz\n", dev->name, chan->center_freq);
+			return ret;
+		}
 		local->oper_channel = chan;
 
 		if (local->sta_sw_scanning || local->sta_hw_scanning)
@@ -315,7 +323,6 @@ static int ieee80211_ioctl_siwfreq(struct net_device *dev,
 				   struct iw_request_info *info,
 				   struct iw_freq *freq, char *extra)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
 	if (sdata->vif.type == IEEE80211_IF_TYPE_STA)
@@ -329,14 +336,14 @@ static int ieee80211_ioctl_siwfreq(struct net_device *dev,
 					IEEE80211_STA_AUTO_CHANNEL_SEL;
 			return 0;
 		} else
-			return ieee80211_set_freq(local,
+			return ieee80211_set_freq(dev,
 				ieee80211_channel_to_frequency(freq->m));
 	} else {
 		int i, div = 1000000;
 		for (i = 0; i < freq->e; i++)
 			div /= 10;
 		if (div > 0)
-			return ieee80211_set_freq(local, freq->m / div);
+			return ieee80211_set_freq(dev, freq->m / div);
 		else
 			return -EINVAL;
 	}
-- 
cgit v1.2.3


From 2e761e0532a784816e7e822dbaaece8c5d4be14d Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 9 Jun 2008 15:53:30 -0700
Subject: ipv6 netns: init net is used to set bindv6only for new sock

The bindv6only is tuned via sysctl. It is already on a struct net
and per-net sysctls allow for its modification (ipv6_sysctl_net_init).

Despite this the value configured in the init net is used for the
rest of them.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3c6aafb02183..e84b3fd17fb4 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -191,7 +191,7 @@ lookup_protocol:
 	np->mcast_hops	= -1;
 	np->mc_loop	= 1;
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
-	np->ipv6only	= init_net.ipv6.sysctl.bindv6only;
+	np->ipv6only	= net->ipv6.sysctl.bindv6only;
 
 	/* Init the ipv4 part of the socket since we can have sockets
 	 * using v6 API for ipv4.
-- 
cgit v1.2.3


From ce4a7d0d48bbaed78ccbb0bafb9229651a40303a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 10 Jun 2008 12:39:35 -0700
Subject: inet{6}_request_sock: Init ->opt and ->pktopts in the constructor

Wei Yongjun noticed that we may call reqsk_free on request sock objects where
the opt fields may not be initialized, fix it by introducing inet_reqsk_alloc
where we initialize ->opt to NULL and set ->pktopts to NULL in
inet6_reqsk_alloc.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c       | 3 +--
 net/dccp/ipv6.c       | 1 -
 net/ipv4/syncookies.c | 3 +--
 net/ipv4/tcp_ipv4.c   | 2 +-
 net/ipv6/syncookies.c | 1 -
 net/ipv6/tcp_ipv6.c   | 1 -
 6 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index c22a3780c14e..37d27bcb361f 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -589,7 +589,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;
 
-	req = reqsk_alloc(&dccp_request_sock_ops);
+	req = inet_reqsk_alloc(&dccp_request_sock_ops);
 	if (req == NULL)
 		goto drop;
 
@@ -605,7 +605,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	ireq = inet_rsk(req);
 	ireq->loc_addr = ip_hdr(skb)->daddr;
 	ireq->rmt_addr = ip_hdr(skb)->saddr;
-	ireq->opt	= NULL;
 
 	/*
 	 * Step 3: Process LISTEN state
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 9b1129bb7ece..f7fe2a572d7b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -421,7 +421,6 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	ireq6 = inet6_rsk(req);
 	ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr);
 	ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr);
-	ireq6->pktopts	= NULL;
 
 	if (ipv6_opt_accepted(sk, skb) ||
 	    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 73ba98921d64..d182a2a26291 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -285,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 		cookie_check_timestamp(&tcp_opt);
 
 	ret = NULL;
-	req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */
+	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
 	if (!req)
 		goto out;
 
@@ -301,7 +301,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->rmt_port		= th->source;
 	ireq->loc_addr		= ip_hdr(skb)->daddr;
 	ireq->rmt_addr		= ip_hdr(skb)->saddr;
-	ireq->opt		= NULL;
 	ireq->snd_wscale	= tcp_opt.snd_wscale;
 	ireq->rcv_wscale	= tcp_opt.rcv_wscale;
 	ireq->sack_ok		= tcp_opt.sack_ok;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cd601a866c2f..4f8485c67d1a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1285,7 +1285,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;
 
-	req = reqsk_alloc(&tcp_request_sock_ops);
+	req = inet_reqsk_alloc(&tcp_request_sock_ops);
 	if (!req)
 		goto drop;
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 938ce4ecde55..3ecc1157994e 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -198,7 +198,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	ireq = inet_rsk(req);
 	ireq6 = inet6_rsk(req);
 	treq = tcp_rsk(req);
-	ireq6->pktopts = NULL;
 
 	if (security_inet_conn_request(sk, skb, req)) {
 		reqsk_free(req);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 715965f0fac0..cb46749d4c32 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1299,7 +1299,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	treq = inet6_rsk(req);
 	ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr);
 	ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr);
-	treq->pktopts = NULL;
 	if (!want_cookie)
 		TCP_ECN_create_request(req, tcp_hdr(skb));
 
-- 
cgit v1.2.3


From 99c6f60e72f112b57ddb07abb2e5f771ee211f43 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Tue, 10 Jun 2008 14:25:34 -0700
Subject: ipsec: pfkey should ignore events when no listeners

When pfkey has no km listeners, it still does a lot of work
before finding out there aint nobody out there.
If a tree falls in a forest and no one is around to hear it, does it make
a sound? In this case it makes a lot of noise:
With this short-circuit adding 10s of thousands of SAs using
netlink improves performance by ~10%.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/key/af_key.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 9bba7ac5fee0..7470e367272b 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3030,6 +3030,9 @@ static int key_notify_sa_expire(struct xfrm_state *x, struct km_event *c)
 
 static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
 {
+	if (atomic_read(&pfkey_socks_nr) == 0)
+		return 0;
+
 	switch (c->event) {
 	case XFRM_MSG_EXPIRE:
 		return key_notify_sa_expire(x, c);
-- 
cgit v1.2.3


From 709772e6e06564ed94ba740de70185ac3d792773 Mon Sep 17 00:00:00 2001
From: Krzysztof Piotr Oledzki <ole@ans.pl>
Date: Tue, 10 Jun 2008 15:44:49 -0700
Subject: net: Fix routing tables with id > 255 for legacy software

Most legacy software do not like tables > 255 as rtm_table is u8
so tb_id is sent &0xff and it is possible to mismatch for example
table 510 with table 254 (main).

This patch introduces RT_TABLE_COMPAT=252 so the code uses it if
tb_id > 255. It makes such old applications happy, new
ones are still able to use RTA_TABLE to get a proper table id.

Signed-off-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3b83c34019fc..0d4d72827e4b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -960,7 +960,10 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 	rtm->rtm_dst_len = dst_len;
 	rtm->rtm_src_len = 0;
 	rtm->rtm_tos = tos;
-	rtm->rtm_table = tb_id;
+	if (tb_id < 256)
+		rtm->rtm_table = tb_id;
+	else
+		rtm->rtm_table = RT_TABLE_COMPAT;
 	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
 	rtm->rtm_type = type;
 	rtm->rtm_flags = fi->fib_flags;
-- 
cgit v1.2.3


From 3294f202dc1acd82223e83ef59f272bd87bb06b2 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Wed, 11 Jun 2008 11:19:09 +0100
Subject: dccp ccid-3: Bug-Fix - Zero RTT is possible

In commit $(825de27d9e40b3117b29a79d412b7a4b78c5d815) (from 27th May, commit
message `dccp ccid-3: Fix "t_ipi explosion" bug'), the CCID-3 window counter
computation was fixed to cope with RTTs < 4 microseconds.

Such RTTs can be found e.g. when running CCID-3 over loopback. The fix removed
a check against RTT < 4, but introduced a divide-by-zero bug.

All steady-state RTTs in DCCP are filtered using dccp_sample_rtt(), which
ensures non-zero samples. However, a zero RTT is possible on initialisation,
when there is no RTT sample from the Request/Response exchange.

The fix is to use the fallback-RTT from RFC 4340, 3.4.

This is also better than just fixing update_win_count() since it allows other
parts of the code to always assume that the RTT is non-zero during the time
that the CCID is used.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/ccid3.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index f813077234b7..0474f4c5707a 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -329,8 +329,14 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
 			hctx->ccid3hctx_t_ld = now;
 		} else {
-			/* Sender does not have RTT sample: X_pps = 1 pkt/sec */
-			hctx->ccid3hctx_x = hctx->ccid3hctx_s;
+			/*
+			 * Sender does not have RTT sample:
+			 * - set fallback RTT (RFC 4340, 3.4) since a RTT value
+			 *   is needed in several parts (e.g.  window counter);
+			 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
+			 */
+			hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT;
+			hctx->ccid3hctx_x   = hctx->ccid3hctx_s;
 			hctx->ccid3hctx_x <<= 6;
 		}
 		ccid3_update_send_interval(hctx);
-- 
cgit v1.2.3


From 1e2f0e5e8376f2a0ada8760fc9d3104e1a81382b Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Wed, 11 Jun 2008 11:19:09 +0100
Subject: dccp: Fix sparse warnings

This patch fixes the following sparse warnings:
 * nested min(max()) expression:
   net/dccp/ccids/ccid3.c:91:21: warning: symbol '__x' shadows an earlier one
   net/dccp/ccids/ccid3.c:91:21: warning: symbol '__y' shadows an earlier one

 * Declaration of function prototypes in .c instead of .h file, resulting in
   "should it be static?" warnings.

 * Declared "struct dccpw" static (local to dccp_probe).

 * Disabled dccp_delayed_ack() - not fully removed due to RFC 4340, 11.3
   ("Receivers SHOULD implement delayed acknowledgement timers ...").

 * Used a different local variable name to avoid
   net/dccp/ackvec.c:293:13: warning: symbol 'state' shadows an earlier one
   net/dccp/ackvec.c:238:33: originally declared here

 * Removed unused functions `dccp_ackvector_print' and `dccp_ackvec_print'.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.c         | 29 ++---------------------------
 net/dccp/ccids/ccid3.c    |  4 ++--
 net/dccp/ccids/lib/tfrc.c |  8 --------
 net/dccp/ccids/lib/tfrc.h | 11 +++++++++--
 net/dccp/output.c         |  2 ++
 net/dccp/probe.c          |  2 +-
 6 files changed, 16 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 6de4bd195d28..1e8be246ad15 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -290,12 +290,12 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 
 		while (1) {
 			const u8 len = dccp_ackvec_len(av, index);
-			const u8 state = dccp_ackvec_state(av, index);
+			const u8 av_state = dccp_ackvec_state(av, index);
 			/*
 			 * valid packets not yet in av_buf have a reserved
 			 * entry, with a len equal to 0.
 			 */
-			if (state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
+			if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
 			    len == 0 && delta == 0) { /* Found our
 							 reserved seat! */
 				dccp_pr_debug("Found %llu reserved seat!\n",
@@ -325,31 +325,6 @@ out_duplicate:
 	return -EILSEQ;
 }
 
-#ifdef CONFIG_IP_DCCP_DEBUG
-void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len)
-{
-	dccp_pr_debug_cat("ACK vector len=%d, ackno=%llu |", len,
-			 (unsigned long long)ackno);
-
-	while (len--) {
-		const u8 state = (*vector & DCCP_ACKVEC_STATE_MASK) >> 6;
-		const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
-
-		dccp_pr_debug_cat("%d,%d|", state, rl);
-		++vector;
-	}
-
-	dccp_pr_debug_cat("\n");
-}
-
-void dccp_ackvec_print(const struct dccp_ackvec *av)
-{
-	dccp_ackvector_print(av->av_buf_ackno,
-			     av->av_buf + av->av_buf_head,
-			     av->av_vec_len);
-}
-#endif
-
 static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
 				     struct dccp_ackvec_record *avr)
 {
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 0474f4c5707a..a1929f33d703 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -159,8 +159,8 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 	} else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
 				- (s64)hctx->ccid3hctx_rtt >= 0) {
 
-		hctx->ccid3hctx_x =
-			max(min(2 * hctx->ccid3hctx_x, min_rate),
+		hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate);
+		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
 			    scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
 				       hctx->ccid3hctx_rtt));
 		hctx->ccid3hctx_t_ld = now;
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
index d1dfbb8de64c..97ecec0a8e76 100644
--- a/net/dccp/ccids/lib/tfrc.c
+++ b/net/dccp/ccids/lib/tfrc.c
@@ -14,14 +14,6 @@ module_param(tfrc_debug, bool, 0444);
 MODULE_PARM_DESC(tfrc_debug, "Enable debug messages");
 #endif
 
-extern int  tfrc_tx_packet_history_init(void);
-extern void tfrc_tx_packet_history_exit(void);
-extern int  tfrc_rx_packet_history_init(void);
-extern void tfrc_rx_packet_history_exit(void);
-
-extern int  tfrc_li_init(void);
-extern void tfrc_li_exit(void);
-
 static int __init tfrc_module_init(void)
 {
 	int rc = tfrc_li_init();
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index 1fb1187bbf1c..ddd8107b927f 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -58,7 +58,14 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
 	return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
 }
 
-extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
-extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+extern u32  tfrc_calc_x(u16 s, u32 R, u32 p);
+extern u32  tfrc_calc_x_reverse_lookup(u32 fvalue);
 
+extern int  tfrc_tx_packet_history_init(void);
+extern void tfrc_tx_packet_history_exit(void);
+extern int  tfrc_rx_packet_history_init(void);
+extern void tfrc_rx_packet_history_exit(void);
+
+extern int  tfrc_li_init(void);
+extern void tfrc_li_exit(void);
 #endif /* _TFRC_H_ */
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 1f8a9b64c083..fe20068c5d8e 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -508,6 +508,7 @@ void dccp_send_ack(struct sock *sk)
 
 EXPORT_SYMBOL_GPL(dccp_send_ack);
 
+#if 0
 /* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */
 void dccp_send_delayed_ack(struct sock *sk)
 {
@@ -538,6 +539,7 @@ void dccp_send_delayed_ack(struct sock *sk)
 	icsk->icsk_ack.timeout = timeout;
 	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
 }
+#endif
 
 void dccp_send_sync(struct sock *sk, const u64 ackno,
 		    const enum dccp_pkt_type pkt_type)
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 0bcdc9250279..81368a7f5379 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -42,7 +42,7 @@ static int bufsize = 64 * 1024;
 
 static const char procname[] = "dccpprobe";
 
-struct {
+static struct {
 	struct kfifo	  *fifo;
 	spinlock_t	  lock;
 	wait_queue_head_t wait;
-- 
cgit v1.2.3


From 65907a433ac0ca450c4408080f24c6e4743386b2 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Wed, 11 Jun 2008 11:19:09 +0100
Subject: dccp ccid-2: Bug-Fix - Ack Vectors need to be ignored on request
 sockets

This fixes an oversight from an earlier patch, ensuring that Ack Vectors
are not processed on request sockets.

The issue is that Ack Vectors must not be parsed on request sockets, since
the Ack Vector feature depends on the selection of the (TX) CCID. During the
initial handshake the CCIDs are undefined, and so RFC 4340, 10.3 applies:

 "Using CCID-specific options and feature options during a negotiation
  for the corresponding CCID feature is NOT RECOMMENDED [...]"

And it is not even possible: when the server receives the Request from the
client, the CCID and Ack vector features are undefined; when the Ack finalising
the 3-way hanshake arrives, the request socket has not been cloned yet into a
full socket. (This order is necessary, since otherwise the newly created socket
would have to be destroyed whenever an option error occurred - a malicious
hacker could simply send garbage options and exploit this.)

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/options.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dccp/options.c b/net/dccp/options.c
index d2a84a2fecee..43bc24e761d0 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -107,9 +107,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 		 *
 		 * CCID-specific options are ignored during connection setup, as
 		 * negotiation may still be in progress (see RFC 4340, 10.3).
+		 * The same applies to Ack Vectors, as these depend on the CCID.
 		 *
 		 */
-		if (dreq != NULL && opt >= 128)
+		if (dreq != NULL && (opt >= 128 ||
+		    opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
 			goto ignore_option;
 
 		switch (opt) {
-- 
cgit v1.2.3


From 1e8a287c79f64226541f5c44aa52d4698bb84cf5 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Wed, 11 Jun 2008 11:19:10 +0100
Subject: dccp ccid-3: TFRC reverse-lookup Bug-Fix

This fixes a bug in the reverse lookup of p: given a value f(p), instead of p,
the function returned the smallest tabulated value f(p).

The smallest tabulated value of

   10^6 * f(p) =  sqrt(2*p/3) + 12 * sqrt(3*p/8) * (32 * p^3 + p)

for p=0.0001 is 8172.

Since this value is scaled by 10^6, the outcome of this bug is that a loss
of 8172/10^6 = 0.8172% was reported whenever the input was below the table
resolution of 0.01%.

This means that the value was over 80 times too high, resulting in large spikes
of the initial loss interval, thus unnecessarily reducing the throughput.

Also corrected the printk format (%u for u32).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/lib/tfrc_equation.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index e4e64b76c10c..2f20a29cffe4 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -661,7 +661,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 
 EXPORT_SYMBOL_GPL(tfrc_calc_x);
 
-/*
+/**
  *  tfrc_calc_x_reverse_lookup  -  try to find p given f(p)
  *
  *  @fvalue: function value to match, scaled by 1000000
@@ -676,11 +676,11 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
 
 	/* Error cases. */
 	if (fvalue < tfrc_calc_x_lookup[0][1]) {
-		DCCP_WARN("fvalue %d smaller than resolution\n", fvalue);
-		return tfrc_calc_x_lookup[0][1];
+		DCCP_WARN("fvalue %u smaller than resolution\n", fvalue);
+		return TFRC_SMALLEST_P;
 	}
 	if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
-		DCCP_WARN("fvalue %d exceeds bounds!\n", fvalue);
+		DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue);
 		return 1000000;
 	}
 
-- 
cgit v1.2.3


From 7deb0f851003287d7e259bf6b33548b144c0f2d5 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Wed, 11 Jun 2008 11:19:10 +0100
Subject: dccp ccid-3: X truncated due to type conversion

This fixes a bug in computing the inter-packet-interval t_ipi = s/X:

 scaled_div32(a, b) uses u32 for b, but in "scaled_div32(s, X)" the type of the
 sending rate `X' is u64. Since X is scaled by 2^6, this truncates rates greater
 than 2^26 Bps (~537 Mbps).

Using full 64-bit division now.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ccids/lib/tfrc.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index ddd8107b927f..ed9857527acf 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -15,7 +15,7 @@
  *  (at your option) any later version.
  */
 #include <linux/types.h>
-#include <asm/div64.h>
+#include <linux/math64.h>
 #include "../../dccp.h"
 /* internal includes that this module exports: */
 #include "loss_interval.h"
@@ -29,21 +29,19 @@ extern int tfrc_debug;
 #endif
 
 /* integer-arithmetic divisions of type (a * 1000000)/b */
-static inline u64 scaled_div(u64 a, u32 b)
+static inline u64 scaled_div(u64 a, u64 b)
 {
 	BUG_ON(b==0);
-	a *= 1000000;
-	do_div(a, b);
-	return a;
+	return div64_u64(a * 1000000, b);
 }
 
-static inline u32 scaled_div32(u64 a, u32 b)
+static inline u32 scaled_div32(u64 a, u64 b)
 {
 	u64 result = scaled_div(a, b);
 
 	if (result > UINT_MAX) {
-		DCCP_CRIT("Overflow: a(%llu)/b(%u) > ~0U",
-			  (unsigned long long)a, b);
+		DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX",
+			  (unsigned long long)a, (unsigned long long)b);
 		return UINT_MAX;
 	}
 	return result;
-- 
cgit v1.2.3


From be4c798a41bf626cdaacf96c382f116ed2f7dbe9 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Wed, 11 Jun 2008 11:19:10 +0100
Subject: dccp: Bug in initial acknowledgment number assignment

Step 8.5 in RFC 4340 says for the newly cloned socket

           Initialize S.GAR := S.ISS,

but what in fact the code (minisocks.c) does is

           Initialize S.GAR := S.ISR,

which is wrong (typo?) -- fixed by the patch.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/minisocks.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 33ad48321b08..66dca5bba858 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -165,12 +165,12 @@ out_free:
 		/* See dccp_v4_conn_request */
 		newdmsk->dccpms_sequence_window = req->rcv_wnd;
 
-		newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr;
-		dccp_update_gsr(newsk, dreq->dreq_isr);
-
-		newdp->dccps_iss = dreq->dreq_iss;
+		newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss;
 		dccp_update_gss(newsk, dreq->dreq_iss);
 
+		newdp->dccps_isr = dreq->dreq_isr;
+		dccp_update_gsr(newsk, dreq->dreq_isr);
+
 		/*
 		 * SWL and AWL are initially adjusted so that they are not less than
 		 * the initial Sequence Numbers received and sent, respectively:
-- 
cgit v1.2.3


From 20c61fbd8deb2ada0ac3acecf6156a986dbfff2d Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 28 Apr 2008 14:40:55 +0900
Subject: ipv6 mcast: Check address family of gf_group in
 getsockopt(MS_FILTER).

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/ipv6_sockglue.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 26b83e512a09..ce794d6acb70 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -874,6 +874,8 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0)))
 			return -EFAULT;
+		if (gsf.gf_group.ss_family != AF_INET6)
+			return -EADDRNOTAVAIL;
 		lock_sock(sk);
 		err = ip6_mc_msfget(sk, &gsf,
 			(struct group_filter __user *)optval, optlen);
-- 
cgit v1.2.3


From 36e3deae8ba84865fd9eb3f2f21bbc00d49b7544 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Tue, 13 May 2008 02:52:55 +0900
Subject: ipv6 route: Fix route lifetime in netlink message.

1) We may have route lifetime larger than INT_MAX.
In that case we had wired value in lifetime.
Use INT_MAX if lifetime does not fit in s32.

2) Lifetime is valid iif RTF_EXPIRES is set.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/route.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 220cffe9e63b..d1f3e19b06c7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2196,8 +2196,12 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
 
 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
 
-	expires = (rt->rt6i_flags & RTF_EXPIRES) ?
-			rt->rt6i_expires - jiffies : 0;
+	if (!(rt->rt6i_flags & RTF_EXPIRES))
+		expires = 0;
+	else if (rt->rt6i_expires - jiffies < INT_MAX)
+		expires = rt->rt6i_expires - jiffies;
+	else
+		expires = INT_MAX;
 
 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
 			       expires, rt->u.dst.error) < 0)
-- 
cgit v1.2.3


From e8766fc86b34d44a8c55a2f9d71da69e091b1ca4 Mon Sep 17 00:00:00 2001
From: Shan Wei <shanwei@cn.fujitsu.com>
Date: Tue, 10 Jun 2008 15:50:55 +0800
Subject: ipv6: Check the hop limit setting in ancillary data.

When specifing the outgoing hop limit as ancillary data for sendmsg(),
the kernel doesn't check the integer hop limit value as specified in
[RFC-3542] section 6.3.

Signed-off-by: Shan Wei <shanwei@cn.fujitsu.com>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/datagram.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index b9c2de84a8a2..0f0f94a40335 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -705,6 +705,11 @@ int datagram_send_ctl(struct net *net,
 			}
 
 			*hlimit = *(int *)CMSG_DATA(cmsg);
+			if (*hlimit < -1 || *hlimit > 0xff) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
 			break;
 
 		case IPV6_TCLASS:
-- 
cgit v1.2.3


From 28d4488216645cd71402925cffde9528b0cfdb7e Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Thu, 12 Jun 2008 03:14:51 +0900
Subject: ipv6: Check IPV6_MULTICAST_LOOP option value.

Only 0 and 1 are valid for IPV6_MULTICAST_LOOP socket option,
and we should return an error of EINVAL otherwise, per RFC3493.

Based on patch from Shan Wei <shanwei@cn.fujitsu.com>.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/ipv6_sockglue.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index ce794d6acb70..9a3697172d5e 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -458,6 +458,8 @@ done:
 	case IPV6_MULTICAST_LOOP:
 		if (optlen < sizeof(int))
 			goto e_inval;
+		if (val != valbool)
+			goto e_inval;
 		np->mc_loop = valbool;
 		retv = 0;
 		break;
-- 
cgit v1.2.3


From 1717699cd5130009b7cd6756e883d8582c1fe706 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Thu, 12 Jun 2008 03:27:26 +0900
Subject: ipv6: Fail with appropriate error code when setting not-applicable
 sockopt.

IPV6_MULTICAST_HOPS, for example, is not valid for stream sockets.
Since they are virtually unavailable for stream sockets,
we should return ENOPROTOOPT instead of EINVAL.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/ipv6_sockglue.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 9a3697172d5e..c042ce19bd14 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -67,7 +67,7 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *))
 
 	/* RA packet may be delivered ONLY to IPPROTO_RAW socket */
 	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num != IPPROTO_RAW)
-		return -EINVAL;
+		return -ENOPROTOOPT;
 
 	new_ra = (sel>=0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
 
@@ -446,7 +446,7 @@ done:
 
 	case IPV6_MULTICAST_HOPS:
 		if (sk->sk_type == SOCK_STREAM)
-			goto e_inval;
+			break;
 		if (optlen < sizeof(int))
 			goto e_inval;
 		if (val > 255 || val < -1)
@@ -466,7 +466,7 @@ done:
 
 	case IPV6_MULTICAST_IF:
 		if (sk->sk_type == SOCK_STREAM)
-			goto e_inval;
+			break;
 		if (optlen < sizeof(int))
 			goto e_inval;
 
@@ -862,7 +862,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		if (sk->sk_protocol != IPPROTO_UDP &&
 		    sk->sk_protocol != IPPROTO_UDPLITE &&
 		    sk->sk_protocol != IPPROTO_TCP)
-			return -EINVAL;
+			return -ENOPROTOOPT;
 		if (sk->sk_state != TCP_ESTABLISHED)
 			return -ENOTCONN;
 		val = sk->sk_family;
-- 
cgit v1.2.3


From b66985b11b8b00e1ec65b89a3112510ac9a9ec6e Mon Sep 17 00:00:00 2001
From: Eric Leblond <eric@inl.fr>
Date: Wed, 11 Jun 2008 17:50:27 -0700
Subject: netfilter: Make nflog quiet when no one listen in userspace.

The message "nf_log_packet: can't log since no backend logging module loaded
in! Please either load one, or disable logging explicitly" was displayed for
each logged packet when no userspace application is listening to nflog events.
The message seems to warn for a problem with a kernel module missing but as
said before this is not the case. I thus propose to suppress the message (I
don't see any reason to flood the log because a user application has crashed.)

Signed-off-by: Eric Leblond <eric@inl.fr>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_log.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index bc11d7092032..9fda6ee95a31 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -92,10 +92,6 @@ void nf_log_packet(int pf,
 		vsnprintf(prefix, sizeof(prefix), fmt, args);
 		va_end(args);
 		logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
-	} else if (net_ratelimit()) {
-		printk(KERN_WARNING "nf_log_packet: can\'t log since "
-		       "no backend logging module loaded in! Please either "
-		       "load one, or disable logging explicitly\n");
 	}
 	rcu_read_unlock();
 }
-- 
cgit v1.2.3


From ceeff7541e5a4ba8e8d97ffbae32b3f283cb7a3f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 11 Jun 2008 17:51:10 -0700
Subject: netfilter: nf_conntrack: fix ctnetlink related crash in
 nf_nat_setup_info()

When creation of a new conntrack entry in ctnetlink fails after having
set up the NAT mappings, the conntrack has an extension area allocated
that is not getting properly destroyed when freeing the conntrack again.
This means the NAT extension is still in the bysource hash, causing a
crash when walking over the hash chain the next time:

BUG: unable to handle kernel paging request at 00120fbd
IP: [<c03d394b>] nf_nat_setup_info+0x221/0x58a
*pde = 00000000
Oops: 0000 [#1] PREEMPT SMP

Pid: 2795, comm: conntrackd Not tainted (2.6.26-rc5 #1)
EIP: 0060:[<c03d394b>] EFLAGS: 00010206 CPU: 1
EIP is at nf_nat_setup_info+0x221/0x58a
EAX: 00120fbd EBX: 00120fbd ECX: 00000001 EDX: 00000000
ESI: 0000019e EDI: e853bbb4 EBP: e853bbc8 ESP: e853bb78
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process conntrackd (pid: 2795, ti=e853a000 task=f7de10f0 task.ti=e853a000)
Stack: 00000000 e853bc2c e85672ec 00000008 c0561084 63c1db4a 00000000 00000000
       00000000 0002e109 61d2b1c3 00000000 00000000 00000000 01114e22 61d2b1c3
       00000000 00000000 f7444674 e853bc04 00000008 c038e728 0000000a f7444674
Call Trace:
 [<c038e728>] nla_parse+0x5c/0xb0
 [<c0397c1b>] ctnetlink_change_status+0x190/0x1c6
 [<c0397eec>] ctnetlink_new_conntrack+0x189/0x61f
 [<c0119aee>] update_curr+0x3d/0x52
 [<c03902d1>] nfnetlink_rcv_msg+0xc1/0xd8
 [<c0390228>] nfnetlink_rcv_msg+0x18/0xd8
 [<c0390210>] nfnetlink_rcv_msg+0x0/0xd8
 [<c038d2ce>] netlink_rcv_skb+0x2d/0x71
 [<c0390205>] nfnetlink_rcv+0x19/0x24
 [<c038d0f5>] netlink_unicast+0x1b3/0x216
 ...

Move invocation of the extension destructors to nf_conntrack_free()
to fix this problem.

Fixes http://bugzilla.kernel.org/show_bug.cgi?id=10875

Reported-and-Tested-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index c4b1799da5d7..662c1ccfee26 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -196,8 +196,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	if (l4proto && l4proto->destroy)
 		l4proto->destroy(ct);
 
-	nf_ct_ext_destroy(ct);
-
 	rcu_read_unlock();
 
 	spin_lock_bh(&nf_conntrack_lock);
@@ -520,6 +518,7 @@ static void nf_conntrack_free_rcu(struct rcu_head *head)
 
 void nf_conntrack_free(struct nf_conn *ct)
 {
+	nf_ct_ext_destroy(ct);
 	call_rcu(&ct->rcu, nf_conntrack_free_rcu);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_free);
-- 
cgit v1.2.3


From f23d60de719e639690b2dc5c2d0e4243ff614b7a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 12 Jun 2008 14:47:58 -0700
Subject: ipv6: Fix duplicate initialization of rawv6_prot.destroy

In changeset 22dd485022f3d0b162ceb5e67d85de7c3806aa20
("raw: Raw socket leak.") code was added so that we
flush pending frames on raw sockets to avoid leaks.

The ipv4 part was fine, but the ipv6 part was not
done correctly.  Unlike the ipv4 side, the ipv6 code
already has a .destroy method for rawv6_prot.

So now there were two assignments to this member, and
what the compiler does is use the last one, effectively
making the ipv6 parts of that changeset a NOP.

Fix this by removing the:

	.destroy	   = inet6_destroy_sock,

line, and adding an inet6_destroy_sock() call to the
end of raw6_destroy().

Noticed by Al Viro.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/ipv6/raw.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 8fee9a15b2d3..3aee12310d94 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1169,7 +1169,8 @@ static int raw6_destroy(struct sock *sk)
 	lock_sock(sk);
 	ip6_flush_pending_frames(sk);
 	release_sock(sk);
-	return 0;
+
+	return inet6_destroy_sock(sk);
 }
 
 static int rawv6_init_sk(struct sock *sk)
@@ -1200,7 +1201,6 @@ struct proto rawv6_prot = {
 	.disconnect	   = udp_disconnect,
 	.ioctl		   = rawv6_ioctl,
 	.init		   = rawv6_init_sk,
-	.destroy	   = inet6_destroy_sock,
 	.setsockopt	   = rawv6_setsockopt,
 	.getsockopt	   = rawv6_getsockopt,
 	.sendmsg	   = rawv6_sendmsg,
-- 
cgit v1.2.3


From ec0a196626bd12e0ba108d7daa6d95a4fb25c2c5 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 12 Jun 2008 16:31:35 -0700
Subject: tcp: Revert 'process defer accept as established' changes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts two changesets, ec3c0982a2dd1e671bad8e9d26c28dcba0039d87
("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and
the follow-on bug fix 9ae27e0adbf471c7a6b80102e38e1d5a346b3b38
("tcp: Fix slab corruption with ipv6 and tcp6fuzz").

This change causes several problems, first reported by Ingo Molnar
as a distcc-over-loopback regression where connections were getting
stuck.

Ilpo Järvinen first spotted the locking problems.  The new function
added by this code, tcp_defer_accept_check(), only has the
child socket locked, yet it is modifying state of the parent
listening socket.

Fixing that is non-trivial at best, because we can't simply just grab
the parent listening socket lock at this point, because it would
create an ABBA deadlock.  The normal ordering is parent listening
socket --> child socket, but this code path would require the
reverse lock ordering.

Next is a problem noticed by Vitaliy Gusev, he noted:

----------------------------------------
>--- a/net/ipv4/tcp_timer.c
>+++ b/net/ipv4/tcp_timer.c
>@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data)
> 		goto death;
> 	}
>
>+	if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
>+		tcp_send_active_reset(sk, GFP_ATOMIC);
>+		goto death;

Here socket sk is not attached to listening socket's request queue. tcp_done()
will not call inet_csk_destroy_sock() (and tcp_v4_destroy_sock() which should
release this sk) as socket is not DEAD. Therefore socket sk will be lost for
freeing.
----------------------------------------

Finally, Alexey Kuznetsov argues that there might not even be any
real value or advantage to these new semantics even if we fix all
of the bugs:

----------------------------------------
Hiding from accept() sockets with only out-of-order data only
is the only thing which is impossible with old approach. Is this really
so valuable? My opinion: no, this is nothing but a new loophole
to consume memory without control.
----------------------------------------

So revert this thing for now.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 11 +++++++---
 net/ipv4/tcp.c                  | 18 ++++++++++-------
 net/ipv4/tcp_input.c            | 45 -----------------------------------------
 net/ipv4/tcp_ipv4.c             |  8 --------
 net/ipv4/tcp_minisocks.c        | 32 +++++++++++------------------
 net/ipv4/tcp_timer.c            |  5 -----
 6 files changed, 31 insertions(+), 88 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 828ea211ff21..045e799d3e1d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -419,7 +419,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 	struct inet_connection_sock *icsk = inet_csk(parent);
 	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct listen_sock *lopt = queue->listen_opt;
-	int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+	int thresh = max_retries;
 	unsigned long now = jiffies;
 	struct request_sock **reqp, *req;
 	int i, budget;
@@ -455,6 +456,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 		}
 	}
 
+	if (queue->rskq_defer_accept)
+		max_retries = queue->rskq_defer_accept;
+
 	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
 	i = lopt->clock_hand;
 
@@ -462,8 +466,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 		reqp=&lopt->syn_table[i];
 		while ((req = *reqp) != NULL) {
 			if (time_after_eq(now, req->expires)) {
-				if (req->retrans < thresh &&
-				    !req->rsk_ops->rtx_syn_ack(parent, req)) {
+				if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) &&
+				    (inet_rsk(req)->acked ||
+				     !req->rsk_ops->rtx_syn_ack(parent, req))) {
 					unsigned long timeo;
 
 					if (req->retrans++ == 0)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ab66683b8043..fc54a48fde1e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2112,12 +2112,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_DEFER_ACCEPT:
-		if (val < 0) {
-			err = -EINVAL;
-		} else {
-			if (val > MAX_TCP_ACCEPT_DEFERRED)
-				val = MAX_TCP_ACCEPT_DEFERRED;
-			icsk->icsk_accept_queue.rskq_defer_accept = val;
+		icsk->icsk_accept_queue.rskq_defer_accept = 0;
+		if (val > 0) {
+			/* Translate value in seconds to number of
+			 * retransmits */
+			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
+			       val > ((TCP_TIMEOUT_INIT / HZ) <<
+				       icsk->icsk_accept_queue.rskq_defer_accept))
+				icsk->icsk_accept_queue.rskq_defer_accept++;
+			icsk->icsk_accept_queue.rskq_defer_accept++;
 		}
 		break;
 
@@ -2299,7 +2302,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
 		break;
 	case TCP_DEFER_ACCEPT:
-		val = icsk->icsk_accept_queue.rskq_defer_accept;
+		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
+			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
 		break;
 	case TCP_WINDOW_CLAMP:
 		val = tp->window_clamp;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eba873e9b560..cad73b7dfef0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4541,49 +4541,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
 	}
 }
 
-static int tcp_defer_accept_check(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (tp->defer_tcp_accept.request) {
-		int queued_data =  tp->rcv_nxt - tp->copied_seq;
-		int hasfin =  !skb_queue_empty(&sk->sk_receive_queue) ?
-			tcp_hdr((struct sk_buff *)
-				sk->sk_receive_queue.prev)->fin : 0;
-
-		if (queued_data && hasfin)
-			queued_data--;
-
-		if (queued_data &&
-		    tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
-			if (sock_flag(sk, SOCK_KEEPOPEN)) {
-				inet_csk_reset_keepalive_timer(sk,
-							       keepalive_time_when(tp));
-			} else {
-				inet_csk_delete_keepalive_timer(sk);
-			}
-
-			inet_csk_reqsk_queue_add(
-				tp->defer_tcp_accept.listen_sk,
-				tp->defer_tcp_accept.request,
-				sk);
-
-			tp->defer_tcp_accept.listen_sk->sk_data_ready(
-				tp->defer_tcp_accept.listen_sk, 0);
-
-			sock_put(tp->defer_tcp_accept.listen_sk);
-			sock_put(sk);
-			tp->defer_tcp_accept.listen_sk = NULL;
-			tp->defer_tcp_accept.request = NULL;
-		} else if (hasfin ||
-			   tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) {
-			tcp_reset(sk);
-			return -1;
-		}
-	}
-	return 0;
-}
-
 static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4944,8 +4901,6 @@ step5:
 
 	tcp_data_snd_check(sk);
 	tcp_ack_snd_check(sk);
-
-	tcp_defer_accept_check(sk);
 	return 0;
 
 csum_error:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4f8485c67d1a..97a230026e13 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1918,14 +1918,6 @@ int tcp_v4_destroy_sock(struct sock *sk)
 		sk->sk_sndmsg_page = NULL;
 	}
 
-	if (tp->defer_tcp_accept.request) {
-		reqsk_free(tp->defer_tcp_accept.request);
-		sock_put(tp->defer_tcp_accept.listen_sk);
-		sock_put(sk);
-		tp->defer_tcp_accept.listen_sk = NULL;
-		tp->defer_tcp_accept.request = NULL;
-	}
-
 	atomic_dec(&tcp_sockets_allocated);
 
 	return 0;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 019c8c16e5cc..8245247a6ceb 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -571,8 +571,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 	   does sequence test, SYN is truncated, and thus we consider
 	   it a bare ACK.
 
-	   Both ends (listening sockets) accept the new incoming
-	   connection and try to talk to each other. 8-)
+	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+	   bare ACK.  Otherwise, we create an established connection.  Both
+	   ends (listening sockets) accept the new incoming connection and try
+	   to talk to each other. 8-)
 
 	   Note: This case is both harmless, and rare.  Possibility is about the
 	   same as us discovering intelligent life on another plant tomorrow.
@@ -640,6 +642,13 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 		if (!(flg & TCP_FLAG_ACK))
 			return NULL;
 
+		/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
+		if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+		    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+			inet_rsk(req)->acked = 1;
+			return NULL;
+		}
+
 		/* OK, ACK is valid, create big socket and
 		 * feed this segment to it. It will repeat all
 		 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
@@ -678,24 +687,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 		inet_csk_reqsk_queue_unlink(sk, req, prev);
 		inet_csk_reqsk_queue_removed(sk, req);
 
-		if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
-		    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
-
-			/* the accept queue handling is done is est recv slow
-			 * path so lets make sure to start there
-			 */
-			tcp_sk(child)->pred_flags = 0;
-			sock_hold(sk);
-			sock_hold(child);
-			tcp_sk(child)->defer_tcp_accept.listen_sk = sk;
-			tcp_sk(child)->defer_tcp_accept.request = req;
-
-			inet_csk_reset_keepalive_timer(child,
-						       inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ);
-		} else {
-			inet_csk_reqsk_queue_add(sk, req, child);
-		}
-
+		inet_csk_reqsk_queue_add(sk, req, child);
 		return child;
 
 	listen_overflow:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 4de68cf5f2aa..63ed9d6830e7 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -489,11 +489,6 @@ static void tcp_keepalive_timer (unsigned long data)
 		goto death;
 	}
 
-	if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
-		tcp_send_active_reset(sk, GFP_ATOMIC);
-		goto death;
-	}
-
 	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
 		goto out;
 
-- 
cgit v1.2.3


From 5c5f9664d5284d8542062fed39e1f19b80db7aa5 Mon Sep 17 00:00:00 2001
From: Abhijeet Kolekar <abhijeet.kolekar@intel.com>
Date: Thu, 12 Jun 2008 09:47:16 +0800
Subject: mac80211 : fix for iwconfig in ad-hoc mode

The patch checks interface status, if it is in IBSS_JOINED mode
show cell id it is associated with.

Signed-off-by: Abhijeet Kolekar <abhijeet.kolekar@intel.com>
Signed-off-by: Zhu Yi <yi.zhu@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/wext.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index a8bb8e31b1ec..6106cb79060c 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -496,7 +496,8 @@ static int ieee80211_ioctl_giwap(struct net_device *dev,
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
 	    sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
-		if (sdata->u.sta.state == IEEE80211_ASSOCIATED) {
+		if (sdata->u.sta.state == IEEE80211_ASSOCIATED ||
+		    sdata->u.sta.state == IEEE80211_IBSS_JOINED) {
 			ap_addr->sa_family = ARPHRD_ETHER;
 			memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN);
 			return 0;
-- 
cgit v1.2.3


From 995ad6c5a415c9389d094d246ca1b305c1e31813 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Thu, 12 Jun 2008 20:08:19 +0300
Subject: mac80211: add missing new line in debug print HT_DEBUG

This patch adds '\n' in debug printk (wme.c HT DEBUG)

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/wme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index dc1598b86004..635b996c8c35 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -673,7 +673,7 @@ int ieee80211_ht_agg_queue_add(struct ieee80211_local *local,
 #ifdef CONFIG_MAC80211_HT_DEBUG
 			if (net_ratelimit())
 				printk(KERN_DEBUG "allocated aggregation queue"
-					" %d tid %d addr %s pool=0x%lX",
+					" %d tid %d addr %s pool=0x%lX\n",
 					i, tid, print_mac(mac, sta->addr),
 					q->qdisc_pool[0]);
 #endif /* CONFIG_MAC80211_HT_DEBUG */
-- 
cgit v1.2.3


From f9ffcedddba5b2fc5ab16ef08bca55af8be2717e Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <hawk@comx.dk>
Date: Mon, 16 Jun 2008 16:38:33 -0700
Subject: pkt_sched: HTB scheduler, change default hysteresis mode to off.

The HTB hysteresis mode reduce the CPU load, but at the
cost of scheduling accuracy.

On ADSL links (512 kbit/s upstream), this inaccuracy introduce
significant jitter, enought to disturbe VoIP.  For details see my
masters thesis (http://www.adsl-optimizer.dk/thesis/), chapter 7,
section 7.3.1, pp 69-70.

Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
Acked-by: Martin Devera <devik@cdi.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_htb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 5bc1ed490180..9134f029ee0f 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -53,7 +53,7 @@
 */
 
 #define HTB_HSIZE 16		/* classid hash size */
-#define HTB_HYSTERESIS 1	/* whether to use mode hysteresis for speedup */
+#define HTB_HYSTERESIS 0	/* whether to use mode hysteresis for speedup */
 #define HTB_VER 0x30011		/* major must be matched with number suplied by TC as version */
 
 #if HTB_VER >> 16 != TC_HTB_PROTOVER
-- 
cgit v1.2.3


From 47083fc0735f5145b72fc31236d07339dc52b908 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <hawk@comx.dk>
Date: Mon, 16 Jun 2008 16:39:32 -0700
Subject: pkt_sched: Change HTB_HYSTERESIS to a runtime parameter
 htb_hysteresis.

Add a htb_hysteresis parameter to htb_sch.ko and by sysfs magic make
it runtime adjustable via
/sys/module/sch_htb/parameters/htb_hysteresis mode 640.

Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
Acked-by: Martin Devera <devik@cdi.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_htb.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 9134f029ee0f..6807c97985a5 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -28,6 +28,7 @@
  * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $
  */
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
@@ -53,13 +54,17 @@
 */
 
 #define HTB_HSIZE 16		/* classid hash size */
-#define HTB_HYSTERESIS 0	/* whether to use mode hysteresis for speedup */
+static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
 #define HTB_VER 0x30011		/* major must be matched with number suplied by TC as version */
 
 #if HTB_VER >> 16 != TC_HTB_PROTOVER
 #error "Mismatched sch_htb.c and pkt_sch.h"
 #endif
 
+/* Module parameter and sysfs export */
+module_param    (htb_hysteresis, int, 0640);
+MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
+
 /* used internaly to keep status of single class */
 enum htb_cmode {
 	HTB_CANT_SEND,		/* class can't send and can't borrow */
@@ -462,19 +467,21 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 		htb_remove_class_from_row(q, cl, mask);
 }
 
-#if HTB_HYSTERESIS
 static inline long htb_lowater(const struct htb_class *cl)
 {
-	return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
+	if (htb_hysteresis)
+		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
+	else
+		return 0;
 }
 static inline long htb_hiwater(const struct htb_class *cl)
 {
-	return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
+	if (htb_hysteresis)
+		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
+	else
+		return 0;
 }
-#else
-#define htb_lowater(cl)	(0)
-#define htb_hiwater(cl)	(0)
-#endif
+
 
 /**
  * htb_class_mode - computes and returns current class mode
-- 
cgit v1.2.3


From 2b4743bd6be9fedaa560f8c6dc3997e9ec21b99b Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 16 Jun 2008 16:48:20 -0700
Subject: ipv6 sit: Avoid extra need for compat layer in PRL management.

We've introduced extra need of compat layer for ip_tunnel_prl{}
for PRL (Potential Router List) management.  Though compat_ioctl
is still missing in ipv4/ipv6, let's make the interface more
straight-forward and eliminate extra need for nasty compat layer
anyway since the interface is new for 2.6.26.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/sit.c | 44 ++++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 3de6ffdaedf2..32e871a6c25a 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -222,15 +222,18 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr)
 
 }
 
-static int ipip6_tunnel_get_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
+static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
+				struct ip_tunnel_prl __user *a)
 {
-	struct ip_tunnel_prl *kp;
+	struct ip_tunnel_prl kprl, *kp;
 	struct ip_tunnel_prl_entry *prl;
 	unsigned int cmax, c = 0, ca, len;
 	int ret = 0;
 
-	cmax = a->datalen / sizeof(*a);
-	if (cmax > 1 && a->addr != htonl(INADDR_ANY))
+	if (copy_from_user(&kprl, a, sizeof(kprl)))
+		return -EFAULT;
+	cmax = kprl.datalen / sizeof(kprl);
+	if (cmax > 1 && kprl.addr != htonl(INADDR_ANY))
 		cmax = 1;
 
 	/* For simple GET or for root users,
@@ -261,26 +264,25 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
 	for (prl = t->prl; prl; prl = prl->next) {
 		if (c > cmax)
 			break;
-		if (a->addr != htonl(INADDR_ANY) && prl->addr != a->addr)
+		if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr)
 			continue;
 		kp[c].addr = prl->addr;
 		kp[c].flags = prl->flags;
 		c++;
-		if (a->addr != htonl(INADDR_ANY))
+		if (kprl.addr != htonl(INADDR_ANY))
 			break;
 	}
 out:
 	read_unlock(&ipip6_lock);
 
 	len = sizeof(*kp) * c;
-	ret = len ? copy_to_user(a->data, kp, len) : 0;
+	ret = 0;
+	if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen))
+		ret = -EFAULT;
 
 	kfree(kp);
-	if (ret)
-		return -EFAULT;
 
-	a->datalen = len;
-	return 0;
+	return ret;
 }
 
 static int
@@ -873,11 +875,20 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 		break;
 
 	case SIOCGETPRL:
+		err = -EINVAL;
+		if (dev == sitn->fb_tunnel_dev)
+			goto done;
+		err = -ENOENT;
+		if (!(t = netdev_priv(dev)))
+			goto done;
+		err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data);
+		break;
+
 	case SIOCADDPRL:
 	case SIOCDELPRL:
 	case SIOCCHGPRL:
 		err = -EPERM;
-		if (cmd != SIOCGETPRL && !capable(CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN))
 			goto done;
 		err = -EINVAL;
 		if (dev == sitn->fb_tunnel_dev)
@@ -890,12 +901,6 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 			goto done;
 
 		switch (cmd) {
-		case SIOCGETPRL:
-			err = ipip6_tunnel_get_prl(t, &prl);
-			if (!err && copy_to_user(ifr->ifr_ifru.ifru_data,
-						 &prl, sizeof(prl)))
-				err = -EFAULT;
-			break;
 		case SIOCDELPRL:
 			err = ipip6_tunnel_del_prl(t, &prl);
 			break;
@@ -904,8 +909,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 			err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
 			break;
 		}
-		if (cmd != SIOCGETPRL)
-			netdev_state_change(dev);
+		netdev_state_change(dev);
 		break;
 
 	default:
-- 
cgit v1.2.3


From 93653e0448196344d7699ccad395eaebd30359d1 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 16 Jun 2008 16:57:40 -0700
Subject: tcp: Revert reset of deferred accept changes in 2.6.26

Ingo's system is still seeing strange behavior, and he
reports that is goes away if the rest of the deferred
accept changes are reverted too.

Therefore this reverts e4c78840284f3f51b1896cf3936d60a6033c4d2c
("[TCP]: TCP_DEFER_ACCEPT updates - dont retxmt synack") and
539fae89bebd16ebeafd57a87169bc56eb530d76 ("[TCP]: TCP_DEFER_ACCEPT
updates - defer timeout conflicts with max_thresh").

Just like the other revert, these ideas can be revisited for
2.6.27

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 045e799d3e1d..ec834480abe7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -466,9 +466,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 		reqp=&lopt->syn_table[i];
 		while ((req = *reqp) != NULL) {
 			if (time_after_eq(now, req->expires)) {
-				if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) &&
-				    (inet_rsk(req)->acked ||
-				     !req->rsk_ops->rtx_syn_ack(parent, req))) {
+				if ((req->retrans < thresh ||
+				     (inet_rsk(req)->acked && req->retrans < max_retries))
+				    && !req->rsk_ops->rtx_syn_ack(parent, req)) {
 					unsigned long timeo;
 
 					if (req->retrans++ == 0)
-- 
cgit v1.2.3


From 80896a3584bbff9ff9ad4dde735517c4de68d736 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Mon, 16 Jun 2008 16:59:55 -0700
Subject: sctp: Correctly cleanup procfs entries upon failure.

This patch remove the proc fs entry which has been created if fail to
set up proc fs entry for the SCTP protocol.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/protocol.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index b435a193c5df..9258dfe784ae 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -108,14 +108,23 @@ static __init int sctp_proc_init(void)
 	}
 
 	if (sctp_snmp_proc_init())
-		goto out_nomem;
+		goto out_snmp_proc_init;
 	if (sctp_eps_proc_init())
-		goto out_nomem;
+		goto out_eps_proc_init;
 	if (sctp_assocs_proc_init())
-		goto out_nomem;
+		goto out_assocs_proc_init;
 
 	return 0;
 
+out_assocs_proc_init:
+	sctp_eps_proc_exit();
+out_eps_proc_init:
+	sctp_snmp_proc_exit();
+out_snmp_proc_init:
+	if (proc_net_sctp) {
+		proc_net_sctp = NULL;
+		remove_proc_entry("sctp", init_net.proc_net);
+	}
 out_nomem:
 	return -ENOMEM;
 }
-- 
cgit v1.2.3


From 319fa2a24f652dc35e613360c4532b8d2a771add Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Mon, 16 Jun 2008 17:00:29 -0700
Subject: sctp: Correclty set changeover_active for SFR-CACC

Right now, any time we set a primary transport we set
the changeover_active flag.  As a result, we invoke SFR-CACC
even when there has been no changeover events.

Only set changeover_active, when there is a true changeover
event, i.e. we had a primary path and we are changing to
another transport.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/associola.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 532634861db1..024c3ebd9661 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -474,6 +474,15 @@ static void sctp_association_destroy(struct sctp_association *asoc)
 void sctp_assoc_set_primary(struct sctp_association *asoc,
 			    struct sctp_transport *transport)
 {
+	int changeover = 0;
+
+	/* it's a changeover only if we already have a primary path
+	 * that we are changing
+	 */
+	if (asoc->peer.primary_path != NULL &&
+	    asoc->peer.primary_path != transport)
+		changeover = 1 ;
+
 	asoc->peer.primary_path = transport;
 
 	/* Set a default msg_name for events. */
@@ -499,12 +508,12 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
 	 * double switch to the same destination address.
 	 */
 	if (transport->cacc.changeover_active)
-		transport->cacc.cycling_changeover = 1;
+		transport->cacc.cycling_changeover = changeover;
 
 	/* 2) The sender MUST set CHANGEOVER_ACTIVE to indicate that
 	 * a changeover has occurred.
 	 */
-	transport->cacc.changeover_active = 1;
+	transport->cacc.changeover_active = changeover;
 
 	/* 3) The sender MUST store the next TSN to be sent in
 	 * next_tsn_at_change.
-- 
cgit v1.2.3


From 6de329e26caed7bbbf51229c80f3948549d3c010 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 16 Jun 2008 17:02:28 -0700
Subject: net: Fix test for VLAN TX checksum offload capability

Selected device feature bits can be propagated to VLAN devices, so we
can make use of TX checksum offload and TSO on VLAN-tagged packets.
However, if the physical device does not do VLAN tag insertion or
generic checksum offload then the test for TX checksum offload in
dev_queue_xmit() will see a protocol of htons(ETH_P_8021Q) and yield
false.

This splits the checksum offload test into two functions:

- can_checksum_protocol() tests a given protocol against a feature bitmask

- dev_can_checksum() first tests the skb protocol against the device
  features; if that fails and the protocol is htons(ETH_P_8021Q) then
  it tests the encapsulated protocol against the effective device
  features for VLANs

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 582963077877..68d8df0992ab 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -119,6 +119,7 @@
 #include <linux/err.h>
 #include <linux/ctype.h>
 #include <linux/if_arp.h>
+#include <linux/if_vlan.h>
 
 #include "net-sysfs.h"
 
@@ -1362,6 +1363,29 @@ void netif_device_attach(struct net_device *dev)
 }
 EXPORT_SYMBOL(netif_device_attach);
 
+static bool can_checksum_protocol(unsigned long features, __be16 protocol)
+{
+	return ((features & NETIF_F_GEN_CSUM) ||
+		((features & NETIF_F_IP_CSUM) &&
+		 protocol == htons(ETH_P_IP)) ||
+		((features & NETIF_F_IPV6_CSUM) &&
+		 protocol == htons(ETH_P_IPV6)));
+}
+
+static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
+{
+	if (can_checksum_protocol(dev->features, skb->protocol))
+		return true;
+
+	if (skb->protocol == htons(ETH_P_8021Q)) {
+		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+		if (can_checksum_protocol(dev->features & dev->vlan_features,
+					  veh->h_vlan_encapsulated_proto))
+			return true;
+	}
+
+	return false;
+}
 
 /*
  * Invalidate hardware checksum when packet is to be mangled, and
@@ -1640,14 +1664,8 @@ int dev_queue_xmit(struct sk_buff *skb)
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		skb_set_transport_header(skb, skb->csum_start -
 					      skb_headroom(skb));
-
-		if (!(dev->features & NETIF_F_GEN_CSUM) &&
-		    !((dev->features & NETIF_F_IP_CSUM) &&
-		      skb->protocol == htons(ETH_P_IP)) &&
-		    !((dev->features & NETIF_F_IPV6_CSUM) &&
-		      skb->protocol == htons(ETH_P_IPV6)))
-			if (skb_checksum_help(skb))
-				goto out_kfree_skb;
+		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
+			goto out_kfree_skb;
 	}
 
 gso:
-- 
cgit v1.2.3


From 68be802cd5ad040fe8cfa33ce3031405df2d9117 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 16 Jun 2008 17:03:32 -0700
Subject: raw: Restore /proc/net/raw correct behavior

I just noticed "cat /proc/net/raw" was buggy, missing '\n' separators.

I believe this was introduced by commit 8cd850efa4948d57a2ed836911cfd1ab299e89c6
([RAW]: Cleanup IPv4 raw_seq_show.)

This trivial patch restores correct behavior, and applies to current
Linus tree (should also be applied to stable tree as well.)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index e7e091d365ff..37a1ecd9d600 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -934,7 +934,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 	      srcp  = inet->num;
 
 	seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d",
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
 		i, src, srcp, dest, destp, sp->sk_state,
 		atomic_read(&sp->sk_wmem_alloc),
 		atomic_read(&sp->sk_rmem_alloc),
-- 
cgit v1.2.3


From a9d246dbb07cf0bd32bbfc5d184ed738bf2af4f8 Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Mon, 16 Jun 2008 17:07:16 -0700
Subject: ipv4: Remove unused definitions in net/ipv4/tcp_ipv4.c.

1) Remove ICMP_MIN_LENGTH, as it is unused.

2) Remove unneeded tcp_v4_send_check() declaration.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 97a230026e13..12695be2c255 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -85,10 +85,6 @@
 int sysctl_tcp_tw_reuse __read_mostly;
 int sysctl_tcp_low_latency __read_mostly;
 
-/* Check TCP sequence numbers in ICMP packets. */
-#define ICMP_MIN_LENGTH 8
-
-void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
 
 #ifdef CONFIG_TCP_MD5SIG
 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
-- 
cgit v1.2.3


From 27141666b69f535a4d63d7bc6d9e84ee5032f82a Mon Sep 17 00:00:00 2001
From: "Jorge Boncompte [DTI2]" <jorge@dti2.net>
Date: Mon, 16 Jun 2008 17:15:33 -0700
Subject: atm: [br2684] Fix oops due to skb->dev being NULL

It happens that if a packet arrives in a VC between the call to open it on
the hardware and the call to change the backend to br2684, br2684_regvcc
processes the packet and oopses dereferencing skb->dev because it is
NULL before the call to br2684_push().

Signed-off-by: Jorge Boncompte [DTI2] <jorge@dti2.net>
Signed-off-by: Chas Williams <chas@cmf.nrl.navy.mil>
---
 net/atm/br2684.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 9d52ebfc1962..ac6035046adc 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -518,9 +518,9 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg)
 		struct sk_buff *next = skb->next;
 
 		skb->next = skb->prev = NULL;
+		br2684_push(atmvcc, skb);
 		BRPRIV(skb->dev)->stats.rx_bytes -= skb->len;
 		BRPRIV(skb->dev)->stats.rx_packets--;
-		br2684_push(atmvcc, skb);
 
 		skb = next;
 	}
-- 
cgit v1.2.3


From 7e903c2ae36efb526eacab3b25d00e90424bd8a8 Mon Sep 17 00:00:00 2001
From: Eric Kinzie <ekinzie@cmf.nrl.navy.mil>
Date: Mon, 16 Jun 2008 17:18:18 -0700
Subject: atm: [br2864] fix routed vcmux support

From: Eric Kinzie <ekinzie@cmf.nrl.navy.mil>
Signed-off-by: Chas Williams <chas@cmf.nrl.navy.mil>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/atm/br2684.c | 76 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 44 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index ac6035046adc..05fafdc2eea3 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -188,10 +188,13 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev,
 				return 0;
 			}
 		}
-	} else {
-		skb_push(skb, 2);
-		if (brdev->payload == p_bridged)
+	} else { /* e_vc */
+		if (brdev->payload == p_bridged) {
+			skb_push(skb, 2);
 			memset(skb->data, 0, 2);
+		} else { /* p_routed */
+			skb_pull(skb, ETH_HLEN);
+		}
 	}
 	skb_debug(skb);
 
@@ -377,11 +380,8 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
 				 (skb->data + 6, ethertype_ipv4,
 				  sizeof(ethertype_ipv4)) == 0)
 				skb->protocol = __constant_htons(ETH_P_IP);
-			else {
-				brdev->stats.rx_errors++;
-				dev_kfree_skb(skb);
-				return;
-			}
+			else
+				goto error;
 			skb_pull(skb, sizeof(llc_oui_ipv4));
 			skb_reset_network_header(skb);
 			skb->pkt_type = PACKET_HOST;
@@ -394,44 +394,56 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
 			   (memcmp(skb->data, llc_oui_pid_pad, 7) == 0)) {
 			skb_pull(skb, sizeof(llc_oui_pid_pad));
 			skb->protocol = eth_type_trans(skb, net_dev);
-		} else {
-			brdev->stats.rx_errors++;
-			dev_kfree_skb(skb);
-			return;
-		}
+		} else
+			goto error;
 
-	} else {
-		/* first 2 chars should be 0 */
-		if (*((u16 *) (skb->data)) != 0) {
-			brdev->stats.rx_errors++;
-			dev_kfree_skb(skb);
-			return;
+	} else { /* e_vc */
+		if (brdev->payload == p_routed) {
+			struct iphdr *iph;
+
+			skb_reset_network_header(skb);
+			iph = ip_hdr(skb);
+			if (iph->version == 4)
+				skb->protocol = __constant_htons(ETH_P_IP);
+			else if (iph->version == 6)
+				skb->protocol = __constant_htons(ETH_P_IPV6);
+			else
+				goto error;
+			skb->pkt_type = PACKET_HOST;
+		} else { /* p_bridged */
+			/* first 2 chars should be 0 */
+			if (*((u16 *) (skb->data)) != 0)
+				goto error;
+			skb_pull(skb, BR2684_PAD_LEN);
+			skb->protocol = eth_type_trans(skb, net_dev);
 		}
-		skb_pull(skb, BR2684_PAD_LEN + ETH_HLEN);	/* pad, dstmac, srcmac, ethtype */
-		skb->protocol = eth_type_trans(skb, net_dev);
 	}
 
 #ifdef CONFIG_ATM_BR2684_IPFILTER
-	if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb))) {
-		brdev->stats.rx_dropped++;
-		dev_kfree_skb(skb);
-		return;
-	}
+	if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb)))
+		goto dropped;
 #endif /* CONFIG_ATM_BR2684_IPFILTER */
 	skb->dev = net_dev;
 	ATM_SKB(skb)->vcc = atmvcc;	/* needed ? */
 	pr_debug("received packet's protocol: %x\n", ntohs(skb->protocol));
 	skb_debug(skb);
-	if (unlikely(!(net_dev->flags & IFF_UP))) {
-		/* sigh, interface is down */
-		brdev->stats.rx_dropped++;
-		dev_kfree_skb(skb);
-		return;
-	}
+	/* sigh, interface is down? */
+	if (unlikely(!(net_dev->flags & IFF_UP)))
+		goto dropped;
 	brdev->stats.rx_packets++;
 	brdev->stats.rx_bytes += skb->len;
 	memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
 	netif_rx(skb);
+	return;
+
+dropped:
+	brdev->stats.rx_dropped++;
+	goto free_skb;
+error:
+	brdev->stats.rx_errors++;
+free_skb:
+	dev_kfree_skb(skb);
+	return;
 }
 
 /*
-- 
cgit v1.2.3


From 68b80f11380889996aa7eadba29dbbb5c29a5864 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 17 Jun 2008 15:51:47 -0700
Subject: netfilter: nf_nat: fix RCU races

Fix three ct_extend/NAT extension related races:

- When cleaning up the extension area and removing it from the bysource hash,
  the nat->ct pointer must not be set to NULL since it may still be used in
  a RCU read side

- When replacing a NAT extension area in the bysource hash, the nat->ct
  pointer must be assigned before performing the replacement

- When reallocating extension storage in ct_extend, the old memory must
  not be freed immediately since it may still be used by a RCU read side

Possibly fixes https://bugzilla.redhat.com/show_bug.cgi?id=449315
and/or http://bugzilla.kernel.org/show_bug.cgi?id=10875

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/nf_nat_core.c    | 3 +--
 net/netfilter/nf_conntrack_extend.c | 9 ++++++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 04578593e100..d2a887fc8d9b 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -556,7 +556,6 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 
 	spin_lock_bh(&nf_nat_lock);
 	hlist_del_rcu(&nat->bysource);
-	nat->ct = NULL;
 	spin_unlock_bh(&nf_nat_lock);
 }
 
@@ -570,8 +569,8 @@ static void nf_nat_move_storage(void *new, void *old)
 		return;
 
 	spin_lock_bh(&nf_nat_lock);
-	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
 	new_nat->ct = ct;
+	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
 	spin_unlock_bh(&nf_nat_lock);
 }
 
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index bcc19fa4ed1e..8a3f8b34e466 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -59,12 +59,19 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
 	if (!*ext)
 		return NULL;
 
+	INIT_RCU_HEAD(&(*ext)->rcu);
 	(*ext)->offset[id] = off;
 	(*ext)->len = len;
 
 	return (void *)(*ext) + off;
 }
 
+static void __nf_ct_ext_free_rcu(struct rcu_head *head)
+{
+	struct nf_ct_ext *ext = container_of(head, struct nf_ct_ext, rcu);
+	kfree(ext);
+}
+
 void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
 {
 	struct nf_ct_ext *new;
@@ -106,7 +113,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
 					(void *)ct->ext + ct->ext->offset[i]);
 			rcu_read_unlock();
 		}
-		kfree(ct->ext);
+		call_rcu(&ct->ext->rcu, __nf_ct_ext_free_rcu);
 		ct->ext = new;
 	}
 
-- 
cgit v1.2.3


From 8a548868db62422113104ebc658065e3fe976951 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 17 Jun 2008 15:52:07 -0700
Subject: netfilter: nf_conntrack_h323: fix memory leak in module
 initialization error path

Properly free h323_buffer when helper registration fails.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_h323_main.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 95da1a24aab7..99e385d5b707 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -1799,6 +1799,7 @@ err3:
 err2:
 	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
 err1:
+	kfree(h323_buffer);
 	return ret;
 }
 
-- 
cgit v1.2.3


From a56b8f81580761c65e4d8d0c04ac1cb7a788bdf1 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 17 Jun 2008 15:52:32 -0700
Subject: netfilter: nf_conntrack_h323: fix module unload crash

The H.245 helper is not registered/unregistered, but assigned to
connections manually from the Q.931 helper. This means on unload
existing expectations and connections using the helper are not
cleaned up, leading to the following oops on module unload:

CPU 0 Unable to handle kernel paging request at virtual address c00a6828, epc == 802224dc, ra == 801d4e7c
Oops[#1]:
Cpu 0
$ 0   : 00000000 00000000 00000004 c00a67f0
$ 4   : 802a5ad0 81657e00 00000000 00000000
$ 8   : 00000008 801461c8 00000000 80570050
$12   : 819b0280 819b04b0 00000006 00000000
$16   : 802a5a60 80000000 80b46000 80321010
$20   : 00000000 00000004 802a5ad0 00000001
$24   : 00000000 802257a8
$28   : 802a4000 802a59e8 00000004 801d4e7c
Hi    : 0000000b
Lo    : 00506320
epc   : 802224dc ip_conntrack_help+0x38/0x74     Tainted: P
ra    : 801d4e7c nf_iterate+0xbc/0x130
Status: 1000f403    KERNEL EXL IE
Cause : 00800008
BadVA : c00a6828
PrId  : 00019374
Modules linked in: ip_nat_pptp ip_conntrack_pptp ath_pktlog wlan_acl wlan_wep wlan_tkip wlan_ccmp wlan_xauth ath_pci ath_dev ath_dfs ath_rate_atheros wlan ath_hal ip_nat_tftp ip_conntrack_tftp ip_nat_ftp ip_conntrack_ftp pppoe ppp_async ppp_deflate ppp_mppe pppox ppp_generic slhc
Process swapper (pid: 0, threadinfo=802a4000, task=802a6000)
Stack : 801e7d98 00000004 802a5a60 80000000 801d4e7c 801d4e7c 802a5ad0 00000004
        00000000 00000000 801e7d98 00000000 00000004 802a5ad0 00000000 00000010
        801e7d98 80b46000 802a5a60 80320000 80000000 801d4f8c 802a5b00 00000002
        80063834 00000000 80b46000 802a5a60 801e7d98 80000000 802ba854 00000000
        81a02180 80b7e260 81a021b0 819b0000 819b0000 80570056 00000000 00000001
        ...
Call Trace:
 [<801e7d98>] ip_finish_output+0x0/0x23c
 [<801d4e7c>] nf_iterate+0xbc/0x130
 [<801d4e7c>] nf_iterate+0xbc/0x130
 [<801e7d98>] ip_finish_output+0x0/0x23c
 [<801e7d98>] ip_finish_output+0x0/0x23c
 [<801d4f8c>] nf_hook_slow+0x9c/0x1a4

One way to fix this would be to split helper cleanup from the unregistration
function and invoke it for the H.245 helper, but since ctnetlink needs to be
able to find the helper for synchonization purposes, a better fix is to
register it normally and make sure its not assigned to connections during
helper lookup. The missing l3num initialization is enough for this, this
patch changes it to use AF_UNSPEC to make it more explicit though.

Reported-by: liannan <liannan@twsz.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_h323_main.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 99e385d5b707..2f83c158934d 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -619,6 +619,7 @@ static const struct nf_conntrack_expect_policy h245_exp_policy = {
 static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = {
 	.name			= "H.245",
 	.me			= THIS_MODULE,
+	.tuple.src.l3num	= AF_UNSPEC,
 	.tuple.dst.protonum	= IPPROTO_UDP,
 	.help			= h245_help,
 	.expect_policy		= &h245_exp_policy,
@@ -1765,6 +1766,7 @@ static void __exit nf_conntrack_h323_fini(void)
 	nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]);
 	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]);
 	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
 	kfree(h323_buffer);
 	pr_debug("nf_ct_h323: fini\n");
 }
@@ -1777,27 +1779,32 @@ static int __init nf_conntrack_h323_init(void)
 	h323_buffer = kmalloc(65536, GFP_KERNEL);
 	if (!h323_buffer)
 		return -ENOMEM;
-	ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]);
+	ret = nf_conntrack_helper_register(&nf_conntrack_helper_h245);
 	if (ret < 0)
 		goto err1;
-	ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]);
+	ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]);
 	if (ret < 0)
 		goto err2;
-	ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]);
+	ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]);
 	if (ret < 0)
 		goto err3;
-	ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]);
+	ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]);
 	if (ret < 0)
 		goto err4;
+	ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]);
+	if (ret < 0)
+		goto err5;
 	pr_debug("nf_ct_h323: init success\n");
 	return 0;
 
-err4:
+err5:
 	nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]);
-err3:
+err4:
 	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]);
-err2:
+err3:
 	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
+err2:
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
 err1:
 	kfree(h323_buffer);
 	return ret;
-- 
cgit v1.2.3


From fe833fca2eac6b3d3ad5e35f44ad4638362f1da8 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 17 Jun 2008 16:37:13 -0700
Subject: xfrm: fix fragmentation for ipv4 xfrm tunnel

When generating the ip header for the transformed packet we just copy
the frag_off field of the ip header from the original packet to the ip
header of the new generated packet. If we receive a packet as a chain
of fragments, all but the last of the new generated packets have the
IP_MF flag set. We have to mask the frag_off field to only keep the
IP_DF flag from the original packet. This got lost with git commit
36cf9acf93e8561d9faec24849e57688a81eb9c5 ("[IPSEC]: Separate
inner/outer mode processing on output")

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/xfrm4_mode_tunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 584e6d74e3a9..7135279f3f84 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -52,7 +52,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 		IP_ECN_clear(top_iph);
 
 	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
-			    0 : XFRM_MODE_SKB_CB(skb)->frag_off;
+		0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
 	ip_select_ident(top_iph, dst->child, NULL);
 
 	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
-- 
cgit v1.2.3


From 3c73419c09a5ef73d56472dbfdade9e311496e9b Mon Sep 17 00:00:00 2001
From: Rainer Weikusat <rweikusat@mssgmbh.com>
Date: Tue, 17 Jun 2008 22:28:05 -0700
Subject: af_unix: fix 'poll for write'/ connected DGRAM sockets

The unix_dgram_sendmsg routine implements a (somewhat crude)
form of receiver-imposed flow control by comparing the length of the
receive queue of the 'peer socket' with the max_ack_backlog value
stored in the corresponding sock structure, either blocking
the thread which caused the send-routine to be called or returning
EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET
sockets. The poll-implementation for these socket types is
datagram_poll from core/datagram.c. A socket is deemed to be writeable
by this routine when the memory presently consumed by datagrams
owned by it is less than the configured socket send buffer size. This
is always wrong for connected PF_UNIX non-stream sockets when the
abovementioned receive queue is currently considered to be full.
'poll' will then return, indicating that the socket is writeable, but
a subsequent write result in EAGAIN, effectively causing an
(usual) application to 'poll for writeability by repeated send request
with O_NONBLOCK set' until it has consumed its time quantum.

The change below uses a suitably modified variant of the datagram_poll
routines for both type of PF_UNIX sockets, which tests if the
recv-queue of the peer a socket is connected to is presently
considered to be 'full' as part of the 'is this socket
writeable'-checking code. The socket being polled is additionally
put onto the peer_wait wait queue associated with its peer, because the
unix_dgram_sendmsg routine does a wake up on this queue after a
datagram was received and the 'other wakeup call' is done implicitly
as part of skb destruction, meaning, a process blocked in poll
because of a full peer receive queue could otherwise sleep forever
if no datagram owned by its socket was already sitting on this queue.
Among this change is a small (inline) helper routine named
'unix_recvq_full', which consolidates the actual testing code (in three
different places) into a single location.

Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 70 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e18cd3628db4..657835f227d3 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -169,6 +169,11 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk)
 	return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
 }
 
+static inline int unix_recvq_full(struct sock const *sk)
+{
+	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
+}
+
 static struct sock *unix_peer_get(struct sock *s)
 {
 	struct sock *peer;
@@ -482,6 +487,8 @@ static int unix_socketpair(struct socket *, struct socket *);
 static int unix_accept(struct socket *, struct socket *, int);
 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
+static unsigned int unix_datagram_poll(struct file *, struct socket *,
+				       poll_table *);
 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 static int unix_shutdown(struct socket *, int);
 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
@@ -527,7 +534,7 @@ static const struct proto_ops unix_dgram_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	unix_getname,
-	.poll =		datagram_poll,
+	.poll =		unix_datagram_poll,
 	.ioctl =	unix_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	unix_shutdown,
@@ -548,7 +555,7 @@ static const struct proto_ops unix_seqpacket_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	unix_accept,
 	.getname =	unix_getname,
-	.poll =		datagram_poll,
+	.poll =		unix_datagram_poll,
 	.ioctl =	unix_ioctl,
 	.listen =	unix_listen,
 	.shutdown =	unix_shutdown,
@@ -983,8 +990,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo)
 
 	sched = !sock_flag(other, SOCK_DEAD) &&
 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
-		(skb_queue_len(&other->sk_receive_queue) >
-		 other->sk_max_ack_backlog);
+		unix_recvq_full(other);
 
 	unix_state_unlock(other);
 
@@ -1058,8 +1064,7 @@ restart:
 	if (other->sk_state != TCP_LISTEN)
 		goto out_unlock;
 
-	if (skb_queue_len(&other->sk_receive_queue) >
-	    other->sk_max_ack_backlog) {
+	if (unix_recvq_full(other)) {
 		err = -EAGAIN;
 		if (!timeo)
 			goto out_unlock;
@@ -1428,9 +1433,7 @@ restart:
 			goto out_unlock;
 	}
 
-	if (unix_peer(other) != sk &&
-	    (skb_queue_len(&other->sk_receive_queue) >
-	     other->sk_max_ack_backlog)) {
+	if (unix_peer(other) != sk && unix_recvq_full(other)) {
 		if (!timeo) {
 			err = -EAGAIN;
 			goto out_unlock;
@@ -1991,6 +1994,64 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
 	return mask;
 }
 
+static unsigned int unix_datagram_poll(struct file *file, struct socket *sock,
+				       poll_table *wait)
+{
+	struct sock *sk = sock->sk, *peer;
+	unsigned int mask;
+
+	poll_wait(file, sk->sk_sleep, wait);
+
+	peer = unix_peer_get(sk);
+	if (peer) {
+		if (peer != sk) {
+			/*
+			 * Writability of a connected socket additionally
+			 * depends on the state of the receive queue of the
+			 * peer.
+			 */
+			poll_wait(file, &unix_sk(peer)->peer_wait, wait);
+		} else {
+			sock_put(peer);
+			peer = NULL;
+		}
+	}
+
+	mask = 0;
+
+	/* exceptional events? */
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		mask |= POLLERR;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		mask |= POLLRDHUP;
+	if (sk->sk_shutdown == SHUTDOWN_MASK)
+		mask |= POLLHUP;
+
+	/* readable? */
+	if (!skb_queue_empty(&sk->sk_receive_queue) ||
+	    (sk->sk_shutdown & RCV_SHUTDOWN))
+		mask |= POLLIN | POLLRDNORM;
+
+	/* Connection-based need to check for termination and startup */
+	if (sk->sk_type == SOCK_SEQPACKET) {
+		if (sk->sk_state == TCP_CLOSE)
+			mask |= POLLHUP;
+		/* connection hasn't started yet? */
+		if (sk->sk_state == TCP_SYN_SENT)
+			return mask;
+	}
+
+	/* writable? */
+	if (unix_writable(sk) && !(peer && unix_recvq_full(peer)))
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+	else
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	if (peer)
+		sock_put(peer);
+
+	return mask;
+}
 
 #ifdef CONFIG_PROC_FS
 static struct sock *first_unix_socket(int *i)
-- 
cgit v1.2.3


From 3a5be7d4b079f3a9ce1e8ce4a93ba15ae6d00111 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 18 Jun 2008 01:19:51 -0700
Subject: Revert "mac80211: Use skb_header_cloned() on TX path."

This reverts commit 608961a5eca8d3c6bd07172febc27b5559408c5d.

The problem is that the mac80211 stack not only needs to be able to
muck with the link-level headers, it also might need to mangle all of
the packet data if doing sw wireless encryption.

This fixes kernel bugzilla #10903.  Thanks to Didier Raboud (for the
bugzilla report), Andrew Prince (for bisecting), Johannes Berg (for
bringing this bisection analysis to my attention), and Ilpo (for
trying to analyze this purely from the TCP side).

In 2.6.27 we can take another stab at this, by using something like
skb_cow_data() when the TX path of mac80211 ends up with a non-NULL
tx->key.  The ESP protocol code in the IPSEC stack can be used as a
model for implementation.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mac80211/tx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d7dd54aacef..28d8bd53bd3a 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1562,13 +1562,13 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
 	 * be cloned. This could happen, e.g., with Linux bridge code passing
 	 * us broadcast frames. */
 
-	if (head_need > 0 || skb_header_cloned(skb)) {
+	if (head_need > 0 || skb_cloned(skb)) {
 #if 0
 		printk(KERN_DEBUG "%s: need to reallocate buffer for %d bytes "
 		       "of headroom\n", dev->name, head_need);
 #endif
 
-		if (skb_header_cloned(skb))
+		if (skb_cloned(skb))
 			I802_DEBUG_INC(local->tx_expand_skb_head_cloned);
 		else
 			I802_DEBUG_INC(local->tx_expand_skb_head);
-- 
cgit v1.2.3


From 6d1a3fb567a728d31474636e167c324702a0c38b Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 18 Jun 2008 02:07:07 -0700
Subject: netlink: genl: fix circular locking

genetlink has a circular locking dependency when dumping the registered
families:

- dump start:
genl_rcv()            : take genl_mutex
genl_rcv_msg()        : call netlink_dump_start() while holding genl_mutex
netlink_dump_start(),
netlink_dump()        : take nlk->cb_mutex
ctrl_dumpfamily()     : try to detect this case and not take genl_mutex a
                        second time

- dump continuance:
netlink_rcv()         : call netlink_dump
netlink_dump          : take nlk->cb_mutex
ctrl_dumpfamily()     : take genl_mutex

Register genl_lock as callback mutex with netlink to fix this. This slightly
widens an already existing module unload race, the genl ops used during the
dump might go away when the module is unloaded. Thomas Graf is working on a
seperate fix for this.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/genetlink.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f5aa23c3e886..3e1191cecaf0 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -444,8 +444,11 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (ops->dumpit == NULL)
 			return -EOPNOTSUPP;
 
-		return netlink_dump_start(genl_sock, skb, nlh,
-					  ops->dumpit, ops->done);
+		genl_unlock();
+		err = netlink_dump_start(genl_sock, skb, nlh,
+					 ops->dumpit, ops->done);
+		genl_lock();
+		return err;
 	}
 
 	if (ops->doit == NULL)
@@ -603,9 +606,6 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
 	int chains_to_skip = cb->args[0];
 	int fams_to_skip = cb->args[1];
 
-	if (chains_to_skip != 0)
-		genl_lock();
-
 	for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
 		if (i < chains_to_skip)
 			continue;
@@ -623,9 +623,6 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 errout:
-	if (chains_to_skip != 0)
-		genl_unlock();
-
 	cb->args[0] = i;
 	cb->args[1] = n;
 
@@ -770,7 +767,7 @@ static int __init genl_init(void)
 
 	/* we'll bump the group number right afterwards */
 	genl_sock = netlink_kernel_create(&init_net, NETLINK_GENERIC, 0,
-					  genl_rcv, NULL, THIS_MODULE);
+					  genl_rcv, &genl_mutex, THIS_MODULE);
 	if (genl_sock == NULL)
 		panic("GENL: Cannot initialize generic netlink\n");
 
-- 
cgit v1.2.3


From ef3a62d272f033989e83eb1f26505f93f93e3e69 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 18 Jun 2008 15:39:48 -0700
Subject: mac80211: detect driver tx bugs

When a driver rejects a frame in it's ->tx() callback, it must also
stop queues, otherwise mac80211 can go into a loop here. Detect this
situation and abort the loop after five retries, warning about the
driver bug.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mac80211/tx.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 28d8bd53bd3a..c80d5899f279 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1132,7 +1132,7 @@ static int ieee80211_tx(struct net_device *dev, struct sk_buff *skb,
 	ieee80211_tx_handler *handler;
 	struct ieee80211_tx_data tx;
 	ieee80211_tx_result res = TX_DROP, res_prepare;
-	int ret, i;
+	int ret, i, retries = 0;
 
 	WARN_ON(__ieee80211_queue_pending(local, control->queue));
 
@@ -1216,6 +1216,13 @@ retry:
 		if (!__ieee80211_queue_stopped(local, control->queue)) {
 			clear_bit(IEEE80211_LINK_STATE_PENDING,
 				  &local->state[control->queue]);
+			retries++;
+			/*
+			 * Driver bug, it's rejecting packets but
+			 * not stopping queues.
+			 */
+			if (WARN_ON_ONCE(retries > 5))
+				goto drop;
 			goto retry;
 		}
 		memcpy(&store->control, control,
-- 
cgit v1.2.3


From aea7427f70cce5fa8f99ce447b213e9e3b49f24c Mon Sep 17 00:00:00 2001
From: Shan Wei <shanwei@cn.fujitsu.com>
Date: Thu, 19 Jun 2008 16:29:39 -0700
Subject: ipv6: Remove options header when setsockopt's optlen is 0

Remove the sticky Hop-by-Hop options header by calling setsockopt()
for IPV6_HOPOPTS with a zero option length, per RFC3542.

Routing header and Destination options header does the same as
Hop-by-Hop options header.

Signed-off-by: Shan Wei <shanwei@cn.fujitsu.com>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ipv6_sockglue.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c042ce19bd14..86e28a75267f 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -345,18 +345,21 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 	case IPV6_DSTOPTS:
 	{
 		struct ipv6_txoptions *opt;
+
+		/* remove any sticky options header with a zero option
+		 * length, per RFC3542.
+		 */
 		if (optlen == 0)
 			optval = NULL;
+		else if (optlen < sizeof(struct ipv6_opt_hdr) ||
+			 optlen & 0x7 || optlen > 8 * 255)
+			goto e_inval;
 
 		/* hop-by-hop / destination options are privileged option */
 		retv = -EPERM;
 		if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW))
 			break;
 
-		if (optlen < sizeof(struct ipv6_opt_hdr) ||
-		    optlen & 0x7 || optlen > 8 * 255)
-			goto e_inval;
-
 		opt = ipv6_renew_options(sk, np->opt, optname,
 					 (struct ipv6_opt_hdr __user *)optval,
 					 optlen);
-- 
cgit v1.2.3


From f630e43a215a3129d0c1173cae0bce6ea4855cf7 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Thu, 19 Jun 2008 16:33:57 -0700
Subject: ipv6: Drop packets for loopback address from outside of the box.

[ Based upon original report and patch by Karsten Keil.  Karsten
  has verified that this fixes the TAHI test case "ICMPv6 test
  v6LC.5.1.2 Part F". -DaveM ]

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_input.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 4e5c8615832c..17eb48b8e329 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -102,6 +102,15 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	if (hdr->version != 6)
 		goto err;
 
+	/*
+	 * RFC4291 2.5.3
+	 * A packet received on an interface with a destination address
+	 * of loopback must be dropped.
+	 */
+	if (!(dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_loopback(&hdr->daddr))
+		goto err;
+
 	skb->transport_header = skb->network_header + sizeof(*hdr);
 	IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
 
-- 
cgit v1.2.3


From 735ce972fbc8a65fb17788debd7bbe7b4383cc62 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 20 Jun 2008 22:04:34 -0700
Subject: sctp: Make sure N * sizeof(union sctp_addr) does not overflow.

As noticed by Gabriel Campana, the kmalloc() length arg
passed in by sctp_getsockopt_local_addrs_old() can overflow
if ->addr_num is large enough.

Therefore, enforce an appropriate limit.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e7e3baf7009e..0dbcde6758ea 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4401,7 +4401,9 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
 	if (copy_from_user(&getaddrs, optval, len))
 		return -EFAULT;
 
-	if (getaddrs.addr_num <= 0) return -EINVAL;
+	if (getaddrs.addr_num <= 0 ||
+	    getaddrs.addr_num >= (INT_MAX / sizeof(union sctp_addr)))
+		return -EINVAL;
 	/*
 	 *  For UDP-style sockets, id specifies the association to query.
 	 *  If the id field is set to the value '0' then the locally bound
-- 
cgit v1.2.3


From b9f75f45a6b46a0ab4eb0857d437a0845871f314 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 20 Jun 2008 22:16:51 -0700
Subject: netns: Don't receive new packets in a dead network namespace.

Alexey Dobriyan <adobriyan@gmail.com> writes:
> Subject: ICMP sockets destruction vs ICMP packets oops

> After icmp_sk_exit() nuked ICMP sockets, we get an interrupt.
> icmp_reply() wants ICMP socket.
>
> Steps to reproduce:
>
> 	launch shell in new netns
> 	move real NIC to netns
> 	setup routing
> 	ping -i 0
> 	exit from shell
>
> BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
> IP: [<ffffffff803fce17>] icmp_sk+0x17/0x30
> PGD 17f3cd067 PUD 17f3ce067 PMD 0
> Oops: 0000 [1] PREEMPT SMP DEBUG_PAGEALLOC
> CPU 0
> Modules linked in: usblp usbcore
> Pid: 0, comm: swapper Not tainted 2.6.26-rc6-netns-ct #4
> RIP: 0010:[<ffffffff803fce17>]  [<ffffffff803fce17>] icmp_sk+0x17/0x30
> RSP: 0018:ffffffff8057fc30  EFLAGS: 00010286
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff81017c7db900
> RDX: 0000000000000034 RSI: ffff81017c7db900 RDI: ffff81017dc41800
> RBP: ffffffff8057fc40 R08: 0000000000000001 R09: 000000000000a815
> R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff8057fd28
> R13: ffffffff8057fd00 R14: ffff81017c7db938 R15: ffff81017dc41800
> FS:  0000000000000000(0000) GS:ffffffff80525000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
> CR2: 0000000000000000 CR3: 000000017fcda000 CR4: 00000000000006e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> Process swapper (pid: 0, threadinfo ffffffff8053a000, task ffffffff804fa4a0)
> Stack:  0000000000000000 ffff81017c7db900 ffffffff8057fcf0 ffffffff803fcfe4
>  ffffffff804faa38 0000000000000246 0000000000005a40 0000000000000246
>  000000000001ffff ffff81017dd68dc0 0000000000005a40 0000000055342436
> Call Trace:
>  <IRQ>  [<ffffffff803fcfe4>] icmp_reply+0x44/0x1e0
>  [<ffffffff803d3a0a>] ? ip_route_input+0x23a/0x1360
>  [<ffffffff803fd645>] icmp_echo+0x65/0x70
>  [<ffffffff803fd300>] icmp_rcv+0x180/0x1b0
>  [<ffffffff803d6d84>] ip_local_deliver+0xf4/0x1f0
>  [<ffffffff803d71bb>] ip_rcv+0x33b/0x650
>  [<ffffffff803bb16a>] netif_receive_skb+0x27a/0x340
>  [<ffffffff803be57d>] process_backlog+0x9d/0x100
>  [<ffffffff803bdd4d>] net_rx_action+0x18d/0x250
>  [<ffffffff80237be5>] __do_softirq+0x75/0x100
>  [<ffffffff8020c97c>] call_softirq+0x1c/0x30
>  [<ffffffff8020f085>] do_softirq+0x65/0xa0
>  [<ffffffff80237af7>] irq_exit+0x97/0xa0
>  [<ffffffff8020f198>] do_IRQ+0xa8/0x130
>  [<ffffffff80212ee0>] ? mwait_idle+0x0/0x60
>  [<ffffffff8020bc46>] ret_from_intr+0x0/0xf
>  <EOI>  [<ffffffff80212f2c>] ? mwait_idle+0x4c/0x60
>  [<ffffffff80212f23>] ? mwait_idle+0x43/0x60
>  [<ffffffff8020a217>] ? cpu_idle+0x57/0xa0
>  [<ffffffff8040f380>] ? rest_init+0x70/0x80
> Code: 10 5b 41 5c 41 5d 41 5e c9 c3 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 53
> 48 83 ec 08 48 8b 9f 78 01 00 00 e8 2b c7 f1 ff 89 c0 <48> 8b 04 c3 48 83 c4 08
> 5b c9 c3 66 66 66 66 66 2e 0f 1f 84 00
> RIP  [<ffffffff803fce17>] icmp_sk+0x17/0x30
>  RSP <ffffffff8057fc30>
> CR2: 0000000000000000
> ---[ end trace ea161157b76b33e8 ]---
> Kernel panic - not syncing: Aiee, killing interrupt handler!

Receiving packets while we are cleaning up a network namespace is a
racy proposition. It is possible when the packet arrives that we have
removed some but not all of the state we need to fully process it.  We
have the choice of either playing wack-a-mole with the cleanup routines
or simply dropping packets when we don't have a network namespace to
handle them.

Since the check looks inexpensive in netif_receive_skb let's just
drop the incoming packets.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c           | 4 ++++
 net/core/net_namespace.c | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 68d8df0992ab..c421a1f8f0b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2077,6 +2077,10 @@ int netif_receive_skb(struct sk_buff *skb)
 
 	rcu_read_lock();
 
+	/* Don't receive packets in an exiting network namespace */
+	if (!net_alive(dev_net(skb->dev)))
+		goto out;
+
 #ifdef CONFIG_NET_CLS_ACT
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 72b4c184dd84..7c52fe277b62 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -140,6 +140,9 @@ static void cleanup_net(struct work_struct *work)
 	struct pernet_operations *ops;
 	struct net *net;
 
+	/* Be very certain incoming network packets will not find us */
+	rcu_barrier();
+
 	net = container_of(work, struct net, work);
 
 	mutex_lock(&net_mutex);
-- 
cgit v1.2.3