summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2012-09-23 23:04:42 +0000
committerDavid S. Miller <davem@davemloft.net>2012-09-24 16:31:37 -0400
commit5640f7685831e088fe6c2e1f863a6805962f8e81 (patch)
treefb7660173338a45c27d610eb59ba20cf5c2b91b8 /net/ipv4/tcp.c
parentb98b8babd6e3370fadb7c6eaacb00eb2f6344a6c (diff)
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs. Its done to increase probability of coalescing small write() into single segments in skbs still in write queue (not yet sent) But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page Its also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit page allocator more than wanted. This patch adds a per task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure. (up to 32768 bytes per frag, thats order-3 pages on x86) This increases TCP stream performance by 20% on loopback device, but also benefits on other network devices, since 8x less frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled. Its possible some SG enabled hardware cant cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment in sub fragments, since some arches have PAGE_SIZE=65536 Successfully tested on various ethernet devices. (ixgbe, igb, bnx2x, tg3, mellanox mlx4) Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Ben Hutchings <bhutchings@solarflare.com> Cc: Vijay Subramanian <subramanian.vijay@gmail.com> Cc: Alexander Duyck <alexander.h.duyck@intel.com> Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--net/ipv4/tcp.c79
1 files changed, 22 insertions, 57 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7b1e940393cf..72ea4752f21b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1150,78 +1150,43 @@ new_segment:
if (err)
goto do_fault;
} else {
- bool merge = false;
+ bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
- struct page *page = sk->sk_sndmsg_page;
- int off;
-
- if (page && page_count(page) == 1)
- sk->sk_sndmsg_off = 0;
-
- off = sk->sk_sndmsg_off;
-
- if (skb_can_coalesce(skb, i, page, off) &&
- off != PAGE_SIZE) {
- /* We can extend the last page
- * fragment. */
- merge = true;
- } else if (i == MAX_SKB_FRAGS || !sg) {
- /* Need to add new fragment and cannot
- * do this because interface is non-SG,
- * or because all the page slots are
- * busy. */
- tcp_mark_push(tp, skb);
- goto new_segment;
- } else if (page) {
- if (off == PAGE_SIZE) {
- put_page(page);
- sk->sk_sndmsg_page = page = NULL;
- off = 0;
+ struct page_frag *pfrag = sk_page_frag(sk);
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ goto wait_for_memory;
+
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ if (i == MAX_SKB_FRAGS || !sg) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
}
- } else
- off = 0;
+ merge = false;
+ }
- if (copy > PAGE_SIZE - off)
- copy = PAGE_SIZE - off;
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
- if (!page) {
- /* Allocate new cache page. */
- if (!(page = sk_stream_alloc_page(sk)))
- goto wait_for_memory;
- }
-
- /* Time to copy data. We are close to
- * the end! */
err = skb_copy_to_page_nocache(sk, from, skb,
- page, off, copy);
- if (err) {
- /* If this page was new, give it to the
- * socket so it does not get leaked.
- */
- if (!sk->sk_sndmsg_page) {
- sk->sk_sndmsg_page = page;
- sk->sk_sndmsg_off = 0;
- }
+ pfrag->page,
+ pfrag->offset,
+ copy);
+ if (err)
goto do_error;
- }
/* Update the skb. */
if (merge) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
- skb_fill_page_desc(skb, i, page, off, copy);
- if (sk->sk_sndmsg_page) {
- get_page(page);
- } else if (off + copy < PAGE_SIZE) {
- get_page(page);
- sk->sk_sndmsg_page = page;
- }
+ skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, copy);
+ get_page(pfrag->page);
}
-
- sk->sk_sndmsg_off = off + copy;
+ pfrag->offset += copy;
}
if (!copied)