diff options
author | Marcel Ziswiler <marcel.ziswiler@toradex.com> | 2019-12-18 22:52:20 +0100 |
---|---|---|
committer | Marcel Ziswiler <marcel.ziswiler@toradex.com> | 2019-12-18 22:52:20 +0100 |
commit | 1ddf624b0b268fdc0b80b1de618b98f8d117afea (patch) | |
tree | 3d3218332bcb34cb0afa01d6ad996058a3dbcb77 /net/ipv4 | |
parent | 6b774eec1f9d3064e9b33634dfa99d5666d0a73a (diff) | |
parent | bfb9e5c03076a446b1f4f6a523ddc8d723c907a6 (diff) |
Merge tag 'v4.14.159' into 4.14-2.0.x-imx
This is the 4.14.159 stable release
Conflicts:
arch/arm/Kconfig.debug
arch/arm/boot/dts/imx7s.dtsi
arch/arm/mach-imx/cpuidle-imx6sx.c
drivers/crypto/caam/caamalg.c
drivers/crypto/mxs-dcp.c
drivers/dma/imx-sdma.c
drivers/input/keyboard/imx_keypad.c
drivers/net/can/flexcan.c
drivers/net/can/rx-offload.c
drivers/net/wireless/ath/ath10k/pci.c
drivers/pci/dwc/pci-imx6.c
drivers/spi/spi-fsl-lpspi.c
drivers/usb/dwc3/gadget.c
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/datagram.c | 2 | ||||
-rw-r--r-- | net/ipv4/devinet.c | 8 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 2 | ||||
-rw-r--r-- | net/ipv4/gre_demux.c | 7 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 8 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 13 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel.c | 8 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 9 | ||||
-rw-r--r-- | net/ipv4/netfilter/ipt_rpfilter.c | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 22 | ||||
-rw-r--r-- | net/ipv4/proc.c | 1 | ||||
-rw-r--r-- | net/ipv4/raw.c | 2 | ||||
-rw-r--r-- | net/ipv4/raw_diag.c | 3 | ||||
-rw-r--r-- | net/ipv4/route.c | 14 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 11 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 40 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 6 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 34 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 5 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 24 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 21 | ||||
-rw-r--r-- | net/ipv4/udp.c | 35 |
23 files changed, 194 insertions, 84 deletions
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index f915abff1350..d3eddfd13875 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -75,7 +75,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); - inet->inet_id = jiffies; + inet->inet_id = prandom_u32(); sk_dst_set(sk, &rt->dst); err = 0; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index bffa88ecc534..5264510c9983 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -66,6 +66,11 @@ #include <net/net_namespace.h> #include <net/addrconf.h> +#define IPV6ONLY_FLAGS \ + (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \ + IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \ + IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY) + static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, @@ -455,6 +460,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; + /* Don't set IPv6 only flags to IPv4 addresses */ + ifa->ifa_flags &= ~IPV6ONLY_FLAGS; + for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; ifap = &ifa1->ifa_next) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e76b8a7bb891..eff703cb13b6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1471,8 +1471,8 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; + int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; struct net *net = dev_net(dev); - int tb_id = l3mdev_fib_table(dev); struct fib_info *fi; if (!fib_info_laddrhash || local == 0) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index b798862b6be5..7efe740c06eb 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -86,13 +86,14 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { - if (skb_checksum_simple_validate(skb)) { + if (!skb_checksum_simple_validate(skb)) { + skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + } else if (csum_err) { *csum_err = true; return -EINVAL; } - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, - null_compute_pseudo); options++; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 94604f9bd0d7..b6f0ee01f2e0 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1220,12 +1220,8 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) im->interface = pmc->interface; im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; if (im->sfmode == MCAST_INCLUDE) { - im->tomb = pmc->tomb; - pmc->tomb = NULL; - - im->sources = pmc->sources; - pmc->sources = NULL; - + swap(im->tomb, pmc->tomb); + swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) psf->sf_crcount = im->crcount; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 24b066c32e06..1f26627c7fad 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -193,7 +193,7 @@ static inline int compute_score(struct sock *sk, struct net *net, if (sk->sk_bound_dev_if) score += 4; } - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0fc499db6da2..9940a59306b5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -230,13 +230,10 @@ static void gre_err(struct sk_buff *skb, u32 info) const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct tnl_ptk_info tpi; - bool csum_err = false; - if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), - iph->ihl * 4) < 0) { - if (!csum_err) /* ignore csum errors. */ - return; - } + if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP), + iph->ihl * 4) < 0) + return; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, @@ -592,6 +589,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, truncate = true; } + if (tun_info->options_len < sizeof(*md)) + goto err_free_rt; + md = ip_tunnel_info_opts(tun_info); if (!md) goto err_free_rt; @@ -1424,6 +1424,7 @@ nla_put_failure: static void erspan_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &erspan_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index fabc299cb875..7a31287ff123 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -661,13 +661,19 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ + struct ip_tunnel_info *tun_info; if (!skb_dst(skb)) { dev->stats.tx_fifo_errors++; goto tx_error; } - if (skb->protocol == htons(ETH_P_IP)) { + tun_info = skb_tunnel_info(skb); + if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) && + ip_tunnel_info_af(tun_info) == AF_INET && + tun_info->key.u.ipv4.dst) + dst = tun_info->key.u.ipv4.dst; + else if (skb->protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); dst = rt_nexthop(rt, inner_iph->daddr); } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 423091727e15..2aaf7f8a3a96 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -89,9 +89,12 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); err = ip_local_out(net, sk, skb); - if (unlikely(net_xmit_eval(err))) - pkt_len = 0; - iptunnel_xmit_stats(dev, pkt_len); + + if (dev) { + if (unlikely(net_xmit_eval(err))) + pkt_len = 0; + iptunnel_xmit_stats(dev, pkt_len); + } } EXPORT_SYMBOL_GPL(iptunnel_xmit); diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 37fb9552e858..341d1bd637af 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -96,6 +96,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; flow.flowi4_tos = RT_TOS(iph->tos); flow.flowi4_scope = RT_SCOPE_UNIVERSE; + flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par)); return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert; } diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index 0c366aad89cb..b531fe204323 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -105,12 +105,26 @@ static int masq_device_event(struct notifier_block *this, return NOTIFY_DONE; } +static int inet_cmp(struct nf_conn *ct, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net_device *dev = ifa->ifa_dev->dev; + struct nf_conntrack_tuple *tuple; + + if (!device_cmp(ct, (void *)(long)dev->ifindex)) + return 0; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + return ifa->ifa_address == tuple->dst.u3.ip; +} + static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; - struct netdev_notifier_info info; + struct net *net = dev_net(idev->dev); /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have @@ -120,8 +134,10 @@ static int masq_inet_event(struct notifier_block *this, if (idev->dead) return NOTIFY_DONE; - netdev_notifier_info_init(&info, idev->dev); - return masq_device_event(this, event, &info); + if (event == NETDEV_DOWN) + nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); + + return NOTIFY_DONE; } static struct notifier_block masq_dev_notifier = { diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3fbf688a1943..88aaf14983e8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -299,6 +299,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), + SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 115d9fd413e2..53a11894f9e4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -202,7 +202,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) } sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, iph->saddr, iph->daddr, - skb->dev->ifindex, sdif); + dif, sdif); } out: read_unlock(&raw_v4_hashinfo.lock); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index c200065ef9a5..6367ecdf76c4 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -23,9 +23,6 @@ raw_get_hashinfo(const struct inet_diag_req_v2 *r) return &raw_v6_hashinfo; #endif } else { - pr_warn_once("Unexpected inet family %d\n", - r->sdiag_family); - WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3b72990a8bb9..de7f955ffd0a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -925,16 +925,15 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (peer->rate_tokens == 0 || time_after(jiffies, (peer->rate_last + - (ip_rt_redirect_load << peer->rate_tokens)))) { + (ip_rt_redirect_load << peer->n_redirects)))) { __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; - ++peer->rate_tokens; ++peer->n_redirects; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - peer->rate_tokens == ip_rt_redirect_number) + peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); @@ -2352,14 +2351,17 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; - int err = -ENETUNREACH; + int err; if (fl4->saddr) { - rth = ERR_PTR(-EINVAL); if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr) || - ipv4_is_zeronet(fl4->saddr)) + ipv4_is_zeronet(fl4->saddr)) { + rth = ERR_PTR(-EINVAL); goto out; + } + + rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e8caab8e2f5c..78771272f613 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -37,6 +37,8 @@ static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; +static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; +static int tcp_min_snd_mss_max = 65535; static int ip_privileged_port_min; static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; @@ -944,6 +946,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "tcp_min_snd_mss", + .data = &init_net.ipv4.sysctl_tcp_min_snd_mss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_min_snd_mss_min, + .extra2 = &tcp_min_snd_mss_max, + }, + { .procname = "tcp_probe_threshold", .data = &init_net.ipv4.sysctl_tcp_probe_threshold, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 00ae9a1d44ed..8f07655718f3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -574,7 +574,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= POLLERR; return mask; @@ -914,6 +914,22 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } +/* In some cases, both sendpage() and sendmsg() could have added + * an skb to the write queue, but failed adding payload on it. + * We need to remove it to consume less memory, but more + * importantly be able to generate EPOLLOUT for Edge Trigger epoll() + * users. + */ +static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) +{ + if (skb && !skb->len && + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { + tcp_unlink_write_queue(skb, sk); + tcp_check_send_head(sk, skb); + sk_wmem_free_skb(sk, skb); + } +} + ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -1034,6 +1050,7 @@ out: return copied; do_error: + tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk)); if (copied) goto out; out_err: @@ -1412,17 +1429,11 @@ out_nopush: sock_zerocopy_put(uarg); return copied + copied_syn; +do_error: + skb = tcp_write_queue_tail(sk); do_fault: - if (!skb->len) { - tcp_unlink_write_queue(skb, sk); - /* It is the one place in all of TCP, except connection - * reset, where we can be unlinking the send_head. - */ - tcp_check_send_head(sk, skb); - sk_wmem_free_skb(sk, skb); - } + tcp_remove_empty_skb(sk, skb); -do_error: if (copied + copied_syn) goto out; out_err: @@ -1776,7 +1787,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); - if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) sk_busy_loop(sk, nonblock); @@ -2366,6 +2377,8 @@ int tcp_disconnect(struct sock *sk, int flags) dst_release(sk->sk_rx_dst); sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); + tp->bytes_acked = 0; + tp->bytes_received = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -2500,7 +2513,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true); + err = tcp_set_congestion_control(sk, name, true, true, + ns_capable(sock_net(sk)->user_ns, + CAP_NET_ADMIN)); release_sock(sk); return err; } @@ -3480,6 +3495,7 @@ void __init tcp_init(void) unsigned long limit; unsigned int i; + BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2f26124fd160..494e3c3a21a1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -338,7 +338,8 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -372,8 +373,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo } else { err = -EBUSY; } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { + } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { err = -EPERM; } else if (!try_module_get(ca->owner)) { err = -EBUSY; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 657d33e2ff6a..55253ba0681f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -247,7 +247,7 @@ static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) { - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) @@ -1329,7 +1329,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->seq += shifted; tcp_skb_pcount_add(prev, pcount); - BUG_ON(tcp_skb_pcount(skb) < pcount); + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, @@ -1396,6 +1396,21 @@ static int skb_can_shift(const struct sk_buff *skb) return !skb_headlen(skb) && skb_is_nonlinear(skb); } +int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, + int pcount, int shiftlen) +{ + /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE) + * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need + * to make sure not storing more than 65535 * 8 bytes per skb, + * even if current MSS is bigger. + */ + if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)) + return 0; + if (unlikely(tcp_skb_pcount(to) + pcount > 65535)) + return 0; + return skb_shift(to, from, shiftlen); +} + /* Try collapsing SACK blocks spanning across multiple skbs to a single * skb. */ @@ -1407,6 +1422,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev; int mss; + int next_pcount; int pcount = 0; int len; int in_sack; @@ -1504,7 +1520,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) goto fallback; - if (!skb_shift(prev, skb, len)) + if (!tcp_skb_shift(prev, skb, pcount, len)) goto fallback; if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) goto out; @@ -1523,11 +1539,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto out; len = skb->len; - if (skb_shift(prev, skb, len)) { - pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + next_pcount = tcp_skb_pcount(skb); + if (tcp_skb_shift(prev, skb, next_pcount, len)) { + pcount += next_pcount; + tcp_shifted_skb(sk, skb, state, next_pcount, len, mss, 0); } - out: state->fack_count += pcount; return prev; @@ -2810,9 +2826,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - if (WARN_ON(!tp->packets_out && tp->sacked_out)) + if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) + if (!tp->sacked_out && tp->fackets_out) tp->fackets_out = 0; /* Now state machine starts. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 97a414dbdaf4..44a41ac2b0ca 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -245,7 +245,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr); } - inet->inet_id = tp->write_seq ^ jiffies; + inet->inet_id = prandom_u32(); if (tcp_fastopen_defer_connect(sk, &err)) return err; @@ -1368,7 +1368,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - newinet->inet_id = newtp->write_seq ^ jiffies; + newinet->inet_id = prandom_u32(); if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); @@ -2477,6 +2477,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; + net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 24bad638c2ec..6025cc509d97 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1264,6 +1264,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int nsize, old_factor; + long limit; int nlen; u8 flags; @@ -1274,6 +1275,19 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; + /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb. + * We need some allowance to not penalize applications setting small + * SO_SNDBUF values. + * Also allow first and last skb in retransmit queue to be split. + */ + limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE); + if (unlikely((sk->sk_wmem_queued >> 1) > limit && + skb != tcp_rtx_queue_head(sk) && + skb != tcp_rtx_queue_tail(sk))) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); + return -ENOMEM; + } + if (skb_unclone(skb, gfp)) return -ENOMEM; @@ -1442,8 +1456,7 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ - if (mss_now < 48) - mss_now = 48; + mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss); return mss_now; } @@ -2012,7 +2025,7 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor)) + if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) return false; len -= skb->len; @@ -2135,6 +2148,7 @@ static int tcp_mtu_probe(struct sock *sk) * we need to propagate it to the new skb. */ TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; + tcp_skb_collapse_tstamp(nskb, skb); tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); } else { @@ -2724,7 +2738,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) if (next_skb_size <= skb_availroom(skb)) skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), next_skb_size); - else if (!skb_shift(skb, next_skb, next_skb_size)) + else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size)) return false; } tcp_highest_sack_replace(sk, next_skb, skb); @@ -2918,7 +2932,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; } else if (err != -EBUSY) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } return err; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a845b7692c1b..895129b0928c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -141,6 +141,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(net->ipv4.sysctl_tcp_base_mss, mss); mss = max(mss, 68 - tp->tcp_header_len); + mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } @@ -357,7 +358,7 @@ static void tcp_probe_timer(struct sock *sk) return; } - if (icsk->icsk_probes_out > max_probes) { + if (icsk->icsk_probes_out >= max_probes) { abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ @@ -412,6 +413,7 @@ void tcp_retransmit_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb; if (tp->fastopen_rsk) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && @@ -422,10 +424,13 @@ void tcp_retransmit_timer(struct sock *sk) */ return; } + if (!tp->packets_out) - goto out; + return; - WARN_ON(tcp_write_queue_empty(sk)); + skb = tcp_rtx_queue_head(sk); + if (WARN_ON_ONCE(!skb)) + return; tp->tlp_high_seq = 0; @@ -458,16 +463,17 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); + tcp_retransmit_skb(sk, skb, 1); __sk_dst_reset(sk); goto out_reset_timer; } + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); if (tcp_write_timeout(sk)) goto out; if (icsk->icsk_retransmits == 0) { - int mib_idx; + int mib_idx = 0; if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) @@ -482,10 +488,9 @@ void tcp_retransmit_timer(struct sock *sk) mib_idx = LINUX_MIB_TCPSACKFAILURES; else mib_idx = LINUX_MIB_TCPRENOFAILURES; - } else { - mib_idx = LINUX_MIB_TCPTIMEOUTS; } - __NET_INC_STATS(sock_net(sk), mib_idx); + if (mib_idx) + __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b89920c0f226..ab3f272a0884 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -419,7 +419,7 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; } @@ -563,7 +563,11 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { - return __udp4_lib_lookup_skb(skb, sport, dport, &udp_table); + const struct iphdr *iph = ip_hdr(skb); + + return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + inet_sdif(skb), &udp_table, NULL); } EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); @@ -1191,6 +1195,20 @@ static void udp_set_dev_scratch(struct sk_buff *skb) scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } +static void udp_skb_csum_unnecessary_set(struct sk_buff *skb) +{ + /* We come here after udp_lib_checksum_complete() returned 0. + * This means that __skb_checksum_complete() might have + * set skb->csum_valid to 1. + * On 64bit platforms, we can set csum_unnecessary + * to true, but only if the skb is not shared. + */ +#if BITS_PER_LONG == 64 + if (!skb_shared(skb)) + udp_skb_scratch(skb)->csum_unnecessary = true; +#endif +} + static int udp_skb_truesize(struct sk_buff *skb) { return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; @@ -1426,10 +1444,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, *total += skb->truesize; kfree_skb(skb); } else { - /* the csum related bits could be changed, refresh - * the scratch area - */ - udp_set_dev_scratch(skb); + udp_skb_csum_unnecessary_set(skb); break; } } @@ -1453,7 +1468,7 @@ static int first_packet_length(struct sock *sk) spin_lock_bh(&rcvq->lock); skb = __first_packet_length(sk, rcvq, &total); - if (!skb && !skb_queue_empty(sk_queue)) { + if (!skb && !skb_queue_empty_lockless(sk_queue)) { spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, rcvq); spin_unlock(&sk_queue->lock); @@ -1528,7 +1543,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, return skb; } - if (skb_queue_empty(sk_queue)) { + if (skb_queue_empty_lockless(sk_queue)) { spin_unlock_bh(&queue->lock); goto busy_check; } @@ -1555,7 +1570,7 @@ busy_check: break; sk_busy_loop(sk, flags & MSG_DONTWAIT); - } while (!skb_queue_empty(sk_queue)); + } while (!skb_queue_empty_lockless(sk_queue)); /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && @@ -2535,7 +2550,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; - if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) + if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) mask |= POLLIN | POLLRDNORM; sock_rps_record_flow(sk); |