From 56f16c74ca25649691fcfe4eedfb8ecadec830cb Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Fri, 13 Jun 2014 14:11:14 -0700 Subject: MAINTAINERS: update cxgb4 maintainer Hari's been doing the patch submissions for a while now and he'll be taking over as maintainer. Signed-off-by: Dimitris Michailidis Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 055f95238d88..134483f206e4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2594,7 +2594,7 @@ S: Supported F: drivers/infiniband/hw/cxgb3/ CXGB4 ETHERNET DRIVER (CXGB4) -M: Dimitris Michailidis +M: Hariprasad S L: netdev@vger.kernel.org W: http://www.chelsio.com S: Supported -- cgit v1.2.3 From 2853af6a2ea1a8ed09b09dd4fb578e7f435e8d34 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Jun 2014 11:53:10 -0700 Subject: vxlan: use dev->needed_headroom instead of dev->hard_header_len When we mirror packets from a vxlan tunnel to other device, the mirror device should see the same packets (that is, without outer header). Because vxlan tunnel sets dev->hard_header_len, tcf_mirred() resets mac header back to outer mac, the mirror device actually sees packets with outer headers Vxlan tunnel should set dev->needed_headroom instead of dev->hard_header_len, like what other ip tunnels do. This fixes the above problem. Cc: "David S. Miller" Cc: stephen hemminger Cc: Pravin B Shelar Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 1610d51dbb5c..e0995ffd046d 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2247,9 +2247,9 @@ static void vxlan_setup(struct net_device *dev) eth_hw_addr_random(dev); ether_setup(dev); if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6) - dev->hard_header_len = ETH_HLEN + VXLAN6_HEADROOM; + dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM; else - dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM; + dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM; dev->netdev_ops = &vxlan_netdev_ops; dev->destructor = free_netdev; @@ -2646,8 +2646,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, if (!tb[IFLA_MTU]) dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); - /* update header length based on lower device */ - dev->hard_header_len = lowerdev->hard_header_len + + dev->needed_headroom = lowerdev->hard_header_len + (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); } else if (use_ipv6) vxlan->flags |= VXLAN_F_IPV6; -- cgit v1.2.3 From 63c6f81cdde58c41da62a8d8a209592e42a0203e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Jun 2014 16:13:06 -0700 Subject: udp: ipv4: do not waste time in __udp4_lib_mcast_demux_lookup Its too easy to add thousand of UDP sockets on a particular bucket, and slow down an innocent multicast receiver. Early demux is supposed to be an optimization, we should avoid spending too much time in it. It is interesting to note __udp4_lib_demux_lookup() only tries to match first socket in the chain. 10 is the threshold we already have in __udp4_lib_lookup() to switch to secondary hash. Fixes: 421b3885bf6d5 ("udp: ipv4: Add udp early demux") Signed-off-by: Eric Dumazet Reported-by: David Held Cc: Shawn Bohrer Signed-off-by: David S. Miller --- net/ipv4/udp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 185ed3e59802..d92f94b7e402 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1861,6 +1861,10 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); struct udp_hslot *hslot = &udp_table.hash[slot]; + /* Do not bother scanning a too big list */ + if (hslot->count > 10) + return NULL; + rcu_read_lock(); begin: count = 0; -- cgit v1.2.3 From 4b28252cada3d0521ab59751f4240ecdfb9bba18 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 14 Jun 2014 23:23:52 -0700 Subject: net: Fix GSO constants to match NETIF flags Joseph Gasparakis reported that VXLAN GSO offload stopped working with i40e device after recent UDP changes. The problem is that the SKB_GSO_* bits are out of sync with the corresponding NETIF flags. This patch fixes that. Also, we add BUILD_BUG_ONs in net_gso_ok for several GSO constants that were missing to avoid the problem in the future. Reported-by: Joseph Gasparakis Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 1 + include/linux/netdevice.h | 7 +++++++ include/linux/skbuff.h | 11 ++++++----- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index e5a589435e2b..d99800cbdcf3 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -117,6 +117,7 @@ enum { #define NETIF_F_GSO_IPIP __NETIF_F(GSO_IPIP) #define NETIF_F_GSO_SIT __NETIF_F(GSO_SIT) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) +#define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM) #define NETIF_F_GSO_MPLS __NETIF_F(GSO_MPLS) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index abe3de1db932..66f9a04ec270 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3305,6 +3305,13 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_IPIP != (NETIF_F_GSO_IPIP >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_MPLS != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5b5cd3189c98..e13ed90be7c2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -338,17 +338,18 @@ enum { SKB_GSO_GRE = 1 << 6, - SKB_GSO_IPIP = 1 << 7, + SKB_GSO_GRE_CSUM = 1 << 7, - SKB_GSO_SIT = 1 << 8, + SKB_GSO_IPIP = 1 << 8, - SKB_GSO_UDP_TUNNEL = 1 << 9, + SKB_GSO_SIT = 1 << 9, - SKB_GSO_MPLS = 1 << 10, + SKB_GSO_UDP_TUNNEL = 1 << 10, SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, - SKB_GSO_GRE_CSUM = 1 << 12, + SKB_GSO_MPLS = 1 << 12, + }; #if BITS_PER_LONG > 32 -- cgit v1.2.3 From 46fb51eb96cafb2c148b7b5119adb5e31a2bf3c4 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 14 Jun 2014 23:24:03 -0700 Subject: net: Fix save software checksum complete Geert reported issues regarding checksum complete and UDP. The logic introduced in commit 7e3cead5172927732f51fde ("net: Save software checksum complete") is not correct. This patch: 1) Restores code in __skb_checksum_complete_header except for setting CHECKSUM_UNNECESSARY. This function may be calculating checksum on something less than skb->len. 2) Adds saving checksum to __skb_checksum_complete. The full packet checksum 0..skb->len is calculated without adding in pseudo header. This value is saved in skb->csum and then the pseudo header is added to that to derive the checksum for validation. 3) In both __skb_checksum_complete_header and __skb_checksum_complete, set skb->csum_valid to whether checksum of zero was computed. This allows skb_csum_unnecessary to return true without changing to CHECKSUM_UNNECESSARY which was done previously. 4) Copy new csum related bits in __copy_skb_header. Reported-by: Geert Uytterhoeven Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/datagram.c | 36 ++++++++++++++++++++++++++---------- net/core/skbuff.c | 3 +++ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index 6b1c04ca1d50..488dd1a825c0 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -739,22 +739,38 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) __sum16 sum; sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !sum && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - - /* Save checksum complete for later use */ - skb->csum = sum; - skb->ip_summed = CHECKSUM_COMPLETE; - skb->csum_complete_sw = 1; - + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + skb->csum_valid = !sum; return sum; } EXPORT_SYMBOL(__skb_checksum_complete_head); __sum16 __skb_checksum_complete(struct sk_buff *skb) { - return __skb_checksum_complete_head(skb, skb->len); + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + /* skb->csum holds pseudo checksum */ + sum = csum_fold(csum_add(skb->csum, csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + + return sum; } EXPORT_SYMBOL(__skb_checksum_complete); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bf92824af3f7..9cd5344fad73 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -689,6 +689,9 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->ooo_okay = old->ooo_okay; new->no_fcs = old->no_fcs; new->encapsulation = old->encapsulation; + new->encap_hdr_csum = old->encap_hdr_csum; + new->csum_valid = old->csum_valid; + new->csum_complete_sw = old->csum_complete_sw; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif -- cgit v1.2.3 From bbdff225ede6527f91184b2a7903df8aad803ace Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 14 Jun 2014 23:24:20 -0700 Subject: udp: call __skb_checksum_complete when doing full checksum In __udp_lib_checksum_complete check if checksum is being done over all the data (len is equal to skb->len) and if it is call __skb_checksum_complete instead of __skb_checksum_complete_head. This allows checksum to be saved in checksum complete. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/udp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/net/udp.h b/include/net/udp.h index 2ecfc6e15609..68a1fefe3dfe 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -111,7 +111,9 @@ struct sk_buff; */ static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb) { - return __skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov); + return (UDP_SKB_CB(skb)->cscov == skb->len ? + __skb_checksum_complete(skb) : + __skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov)); } static inline int udp_lib_checksum_complete(struct sk_buff *skb) -- cgit v1.2.3 From e5eb4e30a51236079fb22bb9f75fcd31915b03c6 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 14 Jun 2014 23:24:28 -0700 Subject: net: add skb_pop_rcv_encapsulation This function is used by UDP encapsulation protocols in RX when crossing encapsulation boundary. If ip_summed is set to CHECKSUM_UNNECESSARY and encapsulation is not set, change to CHECKSUM_NONE since the checksum has not been validated within the encapsulation. Clears csum_valid by the same rationale. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e13ed90be7c2..ec89301ada41 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1854,6 +1854,18 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) return pskb_may_pull(skb, skb_network_offset(skb) + len); } +static inline void skb_pop_rcv_encapsulation(struct sk_buff *skb) +{ + /* Only continue with checksum unnecessary if device indicated + * it is valid across encapsulation (skb->encapsulation was set). + */ + if (skb->ip_summed == CHECKSUM_UNNECESSARY && !skb->encapsulation) + skb->ip_summed = CHECKSUM_NONE; + + skb->encapsulation = 0; + skb->csum_valid = 0; +} + /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the -- cgit v1.2.3 From f79b064c15068176e3f6f67715aafd7fe183120c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 14 Jun 2014 23:24:36 -0700 Subject: vxlan: Checksum fixes Call skb_pop_rcv_encapsulation and postpull_rcsum for the Ethernet header to work properly with checksum complete. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e0995ffd046d..ade33ef82823 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1156,15 +1156,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!vs) goto drop; - /* If the NIC driver gave us an encapsulated packet - * with the encapsulation mark, the device checksummed it - * for us. Otherwise force the upper layers to verify it. - */ - if ((skb->ip_summed != CHECKSUM_UNNECESSARY && skb->ip_summed != CHECKSUM_PARTIAL) || - !skb->encapsulation) - skb->ip_summed = CHECKSUM_NONE; - - skb->encapsulation = 0; + skb_pop_rcv_encapsulation(skb); vs->rcv(vs, skb, vxh->vx_vni); return 0; @@ -1201,6 +1193,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, skb_reset_mac_header(skb); skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); skb->protocol = eth_type_trans(skb, vxlan->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) -- cgit v1.2.3 From b58537a1f5629bdc98a8b9dc2051ce0e952f6b4b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 15 Jun 2014 00:59:14 +0200 Subject: net: sctp: fix permissions for rto_alpha and rto_beta knobs Commit 3fd091e73b81 ("[SCTP]: Remove multiple levels of msecs to jiffies conversions.") has silently changed permissions for rto_alpha and rto_beta knobs from 0644 to 0444. The purpose of this was to discourage users from tweaking rto_alpha and rto_beta knobs in production environments since they are key to correctly compute rtt/srtt. RFC4960 under section 6.3.1. RTO Calculation says regarding rto_alpha and rto_beta under rule C3 and C4: [...] C3) When a new RTT measurement R' is made, set RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| and SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' Note: The value of SRTT used in the update to RTTVAR is its value before updating SRTT itself using the second assignment. After the computation, update RTO <- SRTT + 4 * RTTVAR. C4) When data is in flight and when allowed by rule C5 below, a new RTT measurement MUST be made each round trip. Furthermore, new RTT measurements SHOULD be made no more than once per round trip for a given destination transport address. There are two reasons for this recommendation: First, it appears that measuring more frequently often does not in practice yield any significant benefit [ALLMAN99]; second, if measurements are made more often, then the values of RTO.Alpha and RTO.Beta in rule C3 above should be adjusted so that SRTT and RTTVAR still adjust to changes at roughly the same rate (in terms of how many round trips it takes them to reflect new values) as they would if making only one measurement per round-trip and using RTO.Alpha and RTO.Beta as given in rule C3. However, the exact nature of these adjustments remains a research issue. [...] While it is discouraged to adjust rto_alpha and rto_beta and not further specified how to adjust them, the RFC also doesn't explicitly forbid it, but rather gives a RECOMMENDED default value (rto_alpha=3, rto_beta=2). We have a couple of users relying on the old permissions before they got changed. That said, if someone really has the urge to adjust them, we could allow it with a warning in the log. Fixes: 3fd091e73b81 ("[SCTP]: Remove multiple levels of msecs to jiffies conversions.") Signed-off-by: Daniel Borkmann Cc: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/sysctl.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 7e5eb7554990..dcb19592761e 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -34,6 +34,8 @@ * Sridhar Samudrala */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -46,6 +48,11 @@ static int sack_timer_min = 1; static int sack_timer_max = 500; static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */ static int rwnd_scale_max = 16; +static int rto_alpha_min = 0; +static int rto_beta_min = 0; +static int rto_alpha_max = 1000; +static int rto_beta_max = 1000; + static unsigned long max_autoclose_min = 0; static unsigned long max_autoclose_max = (MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX) @@ -64,6 +71,9 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -126,15 +136,19 @@ static struct ctl_table sctp_net_table[] = { .procname = "rto_alpha_exp_divisor", .data = &init_net.sctp.rto_alpha, .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, + .mode = 0644, + .proc_handler = proc_sctp_do_alpha_beta, + .extra1 = &rto_alpha_min, + .extra2 = &rto_alpha_max, }, { .procname = "rto_beta_exp_divisor", .data = &init_net.sctp.rto_beta, .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, + .mode = 0644, + .proc_handler = proc_sctp_do_alpha_beta, + .extra1 = &rto_beta_min, + .extra2 = &rto_beta_max, }, { .procname = "max_burst", @@ -403,6 +417,16 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, return ret; } +static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + pr_warn_once("Changing rto_alpha or rto_beta may lead to " + "suboptimal rtt/srtt estimations!\n"); + + return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); +} + static int proc_sctp_do_auth(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3