
[net-next,2/2] udp: implement and use per cpu rx skbs cache

Message ID dacbc7a3626bb170629e02159ed2f90120f06382.1524045911.git.pabeni@redhat.com
State Deferred, archived
Delegated to: David Miller
Series UDP: introduce RX skb cache

Commit Message

Paolo Abeni April 18, 2018, 10:22 a.m. UTC
This changeset extends the idea behind commit c8c8b127091b ("udp:
under rx pressure, try to condense skbs"), trading more BH cpu
time and memory bandwidth to decrease the load on the user space
receiver.

At boot time we allocate a limited number of skbs with a small
data buffer, storing them in per cpu arrays. Such skbs are never
freed.

At run time, under rx pressure, the BH tries to copy the ingress
skb contents into the cache skb - if the current cache skb is
available, the ingress skb is small enough, and it carries no head
states.

When using the cache skb, the ingress skb is dropped by the BH
- while still hot in the cache - and the cache skb is inserted into
the rx queue after its usage count has been increased. The cache
array index then moves to the next entry.

The receive side is unmodified: in udp_recvmsg() the skb usage
count is decreased and the skb is _not_ freed, since the cache
keeps the usage count above zero. Because skb->users is hot in the
receiver's cache at consume time - the receiver has just read
skb->data, which lies in the same cacheline - the whole
skb_consume_udp() becomes really cheap.

UDP receive performance under flood improves as follows:

NR RX queues	Kpps	Kpps	Delta (%)
		Before	After

1		2252	2305	2
2		2151	2569	19
4		2033	2396	17
8		1969	2329	18

Overall performance of the knotd DNS server under a real traffic
flood improves as follows:

		Kpps	Kpps	Delta (%)
		Before	After

		3777	3981	5

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
--
Performance figures are with both PAGE_TABLE_ISOLATION and
RETPOLINES enabled, which explains the baseline numbers.
---
 net/ipv4/udp.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 1 deletion(-)

Comments

Eric Dumazet April 18, 2018, 4:56 p.m. UTC | #1
On 04/18/2018 03:22 AM, Paolo Abeni wrote:
> This changeset extends the idea behind commit c8c8b127091b ("udp:
> under rx pressure, try to condense skbs"), trading more BH cpu
> time and memory bandwidth to decrease the load on the user space
> receiver.
> 
> At boot time we allocate a limited amount of skbs with small
> data buffer, storing them in per cpu arrays. Such skbs are never
> freed.
> 
> At run time, under rx pressure, the BH tries to copy the current
> skb contents into the cache - if the current cache skb is available,
> and the ingress skb is small enough and without any head states.
> 
> When using the cache skb, the ingress skb is dropped by the BH
> - while still hot on cache - and the cache skb is inserted into
> the rx queue, after increasing its usage count. Also, the cache
> array index is moved to the next entry.
> 
> The receive side is unmodified: in udp_rcvmsg() the usage skb
> usage count is decreased and the skb is _not_ freed - since the
> cache keeps usage > 0. Since skb->usage is hot in the cache of the
> receiver at consume time - the receiver has just read skb->data,
> which lies in the same cacheline - the whole skb_consume_udp() becomes
> really cheap.
> 
> UDP receive performances under flood improve as follow:
> 
> NR RX queues	Kpps	Kpps	Delta (%)
> 		Before	After
> 
> 1		2252	2305	2
> 2		2151	2569	19
> 4		2033	2396	17
> 8		1969	2329	18
> 
> Overall performances of knotd DNS server under real traffic flood
> improves as follow:
> 
> 		Kpps	Kpps	Delta (%)
> 		Before	After
> 
> 		3777	3981	5


It might be time for the knotd DNS server to finally use SO_REUSEPORT instead of
adding this bloat to the kernel?

Sorry, a 5% improvement when you can easily get a 300% improvement with no kernel change
is not appealing to me :/
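
For context, SO_REUSEPORT load balancing only requires that each worker
process or thread create its own UDP socket, set the option before
bind() and bind to the same address/port; the kernel then spreads
incoming packets across the sockets. A minimal userspace sketch of the
kind of setup being suggested (the port number and the lack of error
checking on setsockopt() are illustrative assumptions, not taken from
knotd or from this patch):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* one such socket per worker, all bound to the same port */
static int open_reuseport_socket(void)
{
	struct sockaddr_in addr;
	int one = 1;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	/* must be set on every socket, before bind() */
	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(53);	/* assumed DNS port, for illustration */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

Each worker then calls recvmsg()/sendmsg() on its own descriptor, which
is what lets the receive load scale with the number of workers.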
Paolo Abeni April 18, 2018, 5:15 p.m. UTC | #2
Hi,

On Wed, 2018-04-18 at 09:56 -0700, Eric Dumazet wrote:
> 
> On 04/18/2018 03:22 AM, Paolo Abeni wrote:
> > This changeset extends the idea behind commit c8c8b127091b ("udp:
> > under rx pressure, try to condense skbs"), trading more BH cpu
> > time and memory bandwidth to decrease the load on the user space
> > receiver.
> > 
> > At boot time we allocate a limited amount of skbs with small
> > data buffer, storing them in per cpu arrays. Such skbs are never
> > freed.
> > 
> > At run time, under rx pressure, the BH tries to copy the current
> > skb contents into the cache - if the current cache skb is available,
> > and the ingress skb is small enough and without any head states.
> > 
> > When using the cache skb, the ingress skb is dropped by the BH
> > - while still hot on cache - and the cache skb is inserted into
> > the rx queue, after increasing its usage count. Also, the cache
> > array index is moved to the next entry.
> > 
> > The receive side is unmodified: in udp_rcvmsg() the usage skb
> > usage count is decreased and the skb is _not_ freed - since the
> > cache keeps usage > 0. Since skb->usage is hot in the cache of the
> > receiver at consume time - the receiver has just read skb->data,
> > which lies in the same cacheline - the whole skb_consume_udp() becomes
> > really cheap.
> > 
> > UDP receive performances under flood improve as follow:
> > 
> > NR RX queues	Kpps	Kpps	Delta (%)
> > 		Before	After
> > 
> > 1		2252	2305	2
> > 2		2151	2569	19
> > 4		2033	2396	17
> > 8		1969	2329	18
> > 
> > Overall performances of knotd DNS server under real traffic flood
> > improves as follow:
> > 
> > 		Kpps	Kpps	Delta (%)
> > 		Before	After
> > 
> > 		3777	3981	5
> 
> 
> It might be time for knotd DNS server to finally use SO_REUSEPORT instead of
> adding this bloat to the kernel ?
> 
> Sorry, 5% improvement while you easily can get 300% improvement with no kernel change
> is not appealing to me :/

Thank you for the feedback.
Sorry for not being clear about it, but knotd is using SO_REUSEPORT and
the above tests are leveraging it.

That 5% is on top of that 300%.

Cheers,

Paolo
Eric Dumazet April 18, 2018, 7:21 p.m. UTC | #3
On 04/18/2018 10:15 AM, Paolo Abeni wrote:
is not appealing to me :/
> 
> Thank you for the feedback.
> Sorry for not being clear about it, but knotd is using SO_REUSEPORT and
> the above tests are leveraging it.
> 
> That 5% is on top of that 300%.

Then there is something wrong.

Adding copies should not increase performance.

If it does, there is certainly another way, reaching 10% instead of 5%
Paolo Abeni April 19, 2018, 7:40 a.m. UTC | #4
Hi,

On Wed, 2018-04-18 at 12:21 -0700, Eric Dumazet wrote:
> 
> On 04/18/2018 10:15 AM, Paolo Abeni wrote:
> is not appealing to me :/
> > 
> > Thank you for the feedback.
> > Sorry for not being clear about it, but knotd is using SO_REUSEPORT and
> > the above tests are leveraging it.
> > 
> > That 5% is on top of that 300%.
> 
> Then there is something wrong.
> 
> Adding copies should not increase performance.

The skb and data are copied into the UDP skb cache only if the socket
is under memory pressure, and that happens if and only if the receiver
is slower than the BH/IP receive path.

The copy slows down the RX path - which was dropping packets - and
makes udp_recvmsg() considerably faster, as consuming the skb becomes
almost a no-op.

AFAICS, this is similar to the strategy you used in:

commit c8c8b127091b758f5768f906bcdeeb88bc9951ca
Author: Eric Dumazet <edumazet@google.com>
Date:   Wed Dec 7 09:19:33 2016 -0800

    udp: under rx pressure, try to condense skbs

with the difference that with the UDP skb cache there is a hard limit
to the amount of memory the BH is allowed to copy.

> If it does, there is certainly another way, reaching 10% instead of 5%

I benchmarked against a DNS server to verify that we get measurable
benefits in a real-life scenario. The measured performance gain for the
RX path with reasonable configurations is ~20%.

Any suggestions for better results are more than welcome!

Cheers,

Paolo
Eric Dumazet April 19, 2018, 1:47 p.m. UTC | #5
On 04/19/2018 12:40 AM, Paolo Abeni wrote:
> Hi,
> 
> On Wed, 2018-04-18 at 12:21 -0700, Eric Dumazet wrote:
>>
>> On 04/18/2018 10:15 AM, Paolo Abeni wrote:
>> is not appealing to me :/
>>>
>>> Thank you for the feedback.
>>> Sorry for not being clear about it, but knotd is using SO_REUSEPORT and
>>> the above tests are leveraging it.
>>>
>>> That 5% is on top of that 300%.
>>
>> Then there is something wrong.
>>
>> Adding copies should not increase performance.
> 
> The skb and data are copied into the UDP skb cache only if the socket
> is under memory pressure, and that happens if and only if the receiver
> is slower than the BH/IP receive path.

Which is going to happen under attack.

Bimodal behavior is dangerous for system stability.

> 
> The copy slows down the RX path - which was dropping packets - and
> makes the udp_recvmsg() considerably faster, as consuming skb becomes
> almost a no-op.
> 
> AFAICS, this is similar to the strategy you used in:
> 
> ommit c8c8b127091b758f5768f906bcdeeb88bc9951ca
> Author: Eric Dumazet <edumazet@google.com>
> Date:   Wed Dec 7 09:19:33 2016 -0800
> 
>     udp: under rx pressure, try to condense skbs
> 
> with the difference that with the UDP skb cache there is an hard limit
> to the amount of memory the BH is allowed to copy.
>

Very different strategy really.

We do not copy 500 bytes per skb :/

and the total amount of memory is tunable (socket rcvbuf)
instead of hard coded in the kernel :/

 
>> If it does, there is certainly another way, reaching 10% instead of 5%
> 
> I benchmarked vs a DNS server to test and verify that we get measurable
> benefits in real life scenario. The measured performance gain for the
> RX path with reasonable configurations is ~20%.

Then we probably can make +40% without copies.



> 
> Any suggestions for better results are more than welcome!


Yes, remote skb freeing. I mentioned this idea to Jesper and Tariq in Seoul (netdev conference)

Not tied to UDP, but a generic solution.

You are adding more and more code that only helps in some benchmarks, really.

The UDP stack is becoming a very complex beast, while heavy-duty UDP servers have alternatives
and cannot cope with arbitrary floods anyway.
Jesper Dangaard Brouer April 20, 2018, 1:48 p.m. UTC | #6
On Thu, 19 Apr 2018 06:47:10 -0700 Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On 04/19/2018 12:40 AM, Paolo Abeni wrote:
> > On Wed, 2018-04-18 at 12:21 -0700, Eric Dumazet wrote:  
> >> On 04/18/2018 10:15 AM, Paolo Abeni wrote:
[...]
> > 
> > Any suggestions for better results are more than welcome!  
> 
> Yes, remote skb freeing. I mentioned this idea to Jesper and Tariq in
> Seoul (netdev conference). Not tied to UDP, but a generic solution.

Yes, I remember.  I think... was it the idea where you basically
wanted to queue SKBs back to the CPU that allocated them, right?

Freeing an SKB on the same CPU that allocated it has multiple
advantages: (1) the SLUB allocator can use a non-atomic
"cpu-local" (double)cmpxchg; (2) the 4 cache lines of the SKB cleared
by memset stay local; (3) the atomic SKB refcnt/users stays local.

We just have to make sure that the queue-back-SKBs mechanism doesn't
cost more than the operations we expect to save.  Bulk transfer is an
obvious approach.  For storing SKBs until they are returned, we already
have a fast mechanism: see napi_consume_skb() calling _kfree_skb_defer(),
which uses SLUB/SLAB bulk freeing to amortize cost (1).

I guess the missing information is that we don't know which CPU the
SKB was created on...

Where to store this CPU info?

(a) In struct sk_buff, in a cache line that is already read on the
remote CPU in UDP code?

(b) In struct page: as SLUB hands out objects/SKBs on a per-page
basis, we could have SLUB store a hint about the CPU it was allocated
on, and bet on returning to that CPU? (might be bad to read the
struct-page cache line)
Willem de Bruijn April 21, 2018, 3:54 p.m. UTC | #7
On Fri, Apr 20, 2018 at 9:48 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> On Thu, 19 Apr 2018 06:47:10 -0700 Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On 04/19/2018 12:40 AM, Paolo Abeni wrote:
>> > On Wed, 2018-04-18 at 12:21 -0700, Eric Dumazet wrote:
>> >> On 04/18/2018 10:15 AM, Paolo Abeni wrote:
> [...]
>> >
>> > Any suggestions for better results are more than welcome!
>>
>> Yes, remote skb freeing. I mentioned this idea to Jesper and Tariq in
>> Seoul (netdev conference). Not tied to UDP, but a generic solution.
>
> Yes, I remember.  I think... was it the idea, where you basically
> wanted to queue back SKBs to the CPU that allocated them, right?
>
> Freeing an SKB on the same CPU that allocated it, have multiple
> advantages. (1) the SLUB allocator can use a non-atomic
> "cpu-local" (double)cmpxchg. (2) the 4 cache-lines memset cleared of
> the SKB stay local.  (3) the atomic SKB refcnt/users stay local.
>
> We just have to avoid that queue back SKB's mechanism, doesn't cost
> more than the operations we expect to save.  Bulk transfer is an
> obvious approach.  For storing SKBs until they are returned, we already
> have a fast mechanism see napi_consume_skb calling _kfree_skb_defer,
> which SLUB/SLAB-bulk free to amortize cost (1).
>
> I guess, the missing information is that we don't know what CPU the SKB
> were created on...

For connected sockets, sk->sk_incoming_cpu has this data. It
records the BH CPU on enqueue to the UDP socket, so one caveat is
that it may be wrong with RPS/RFS.

Another option is to associate the skb not with the source CPU but
with the napi struct, and have the device driver free it in the
context of its napi processing. This has the additional benefit that
skb->napi_id is already stored per skb, so this also works for
unconnected sockets.

Third, the skb->napi_id field is unused after setting sk->sk_napi_id
on sk enqueue, so the BH CPU could be stored there after that,
essentially extending sk_incoming_cpu to unconnected sockets.
Eric Dumazet April 21, 2018, 4:45 p.m. UTC | #8
On 04/21/2018 08:54 AM, Willem de Bruijn wrote:
> On Fri, Apr 20, 2018 at 9:48 AM, Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
>>
>> On Thu, 19 Apr 2018 06:47:10 -0700 Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>> On 04/19/2018 12:40 AM, Paolo Abeni wrote:
>>>> On Wed, 2018-04-18 at 12:21 -0700, Eric Dumazet wrote:
>>>>> On 04/18/2018 10:15 AM, Paolo Abeni wrote:
>> [...]
>>>>
>>>> Any suggestions for better results are more than welcome!
>>>
>>> Yes, remote skb freeing. I mentioned this idea to Jesper and Tariq in
>>> Seoul (netdev conference). Not tied to UDP, but a generic solution.
>>
>> Yes, I remember.  I think... was it the idea, where you basically
>> wanted to queue back SKBs to the CPU that allocated them, right?
>>
>> Freeing an SKB on the same CPU that allocated it, have multiple
>> advantages. (1) the SLUB allocator can use a non-atomic
>> "cpu-local" (double)cmpxchg. (2) the 4 cache-lines memset cleared of
>> the SKB stay local.  (3) the atomic SKB refcnt/users stay local.
>>
>> We just have to avoid that queue back SKB's mechanism, doesn't cost
>> more than the operations we expect to save.  Bulk transfer is an
>> obvious approach.  For storing SKBs until they are returned, we already
>> have a fast mechanism see napi_consume_skb calling _kfree_skb_defer,
>> which SLUB/SLAB-bulk free to amortize cost (1).
>>
>> I guess, the missing information is that we don't know what CPU the SKB
>> were created on...
> 
> For connected sockets, sk->sk_incoming_cpu has this data. It
> records BH cpu on enqueue to udp socket, so one caveat is that
> it may be wrong with rps/rfs.
> 
> Another option is to associate not with source cpu but napi struct
> and have the device driver free in the context of its napi processing.
> This has the additional benefit that skb->napi_id is already stored
> per skb, so this also works for unconnected sockets.
> 
> Third, the skb->napi_id field is unused after setting sk->sk_napi_id
> on sk enqueue, so the BH cpu could be stored here after that,
> essentially extending sk_incoming_cpu to unconnected sockets.

At Google we use something named TXCS, which is what I mentioned to Jesper and Tariq.

(In our case, we wanted to perform the skb destructor/freeing not on the CPU handling the TX queue,
but on the CPUs that originally cooked the skb (running the TCP stack).)

To accommodate generic needs (both RX and TX), I do not believe we can union any existing fields
without a lot of pain/bugs.
Paolo Abeni April 22, 2018, 11:22 a.m. UTC | #9
On Fri, 2018-04-20 at 15:48 +0200, Jesper Dangaard Brouer wrote:
> On Thu, 19 Apr 2018 06:47:10 -0700 Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On 04/19/2018 12:40 AM, Paolo Abeni wrote:
> > > On Wed, 2018-04-18 at 12:21 -0700, Eric Dumazet wrote:  
> > > > On 04/18/2018 10:15 AM, Paolo Abeni wrote:
> 
> [...]
> > > 
> > > Any suggestions for better results are more than welcome!  
> > 
> > Yes, remote skb freeing. I mentioned this idea to Jesper and Tariq in
> > Seoul (netdev conference). Not tied to UDP, but a generic solution.
> 
> Yes, I remember.  I think... was it the idea, where you basically
> wanted to queue back SKBs to the CPU that allocated them, right?
> 
> Freeing an SKB on the same CPU that allocated it, have multiple
> advantages. (1) the SLUB allocator can use a non-atomic
> "cpu-local" (double)cmpxchg. (2) the 4 cache-lines memset cleared of
> the SKB stay local.  (3) the atomic SKB refcnt/users stay local.

By the time the skb is returned to the ingress cpu, isn't that skb most
probably out of the cache?

> We just have to avoid that queue back SKB's mechanism, doesn't cost
> more than the operations we expect to save.  Bulk transfer is an
> obvious approach.  For storing SKBs until they are returned, we already
> have a fast mechanism see napi_consume_skb calling _kfree_skb_defer,
> which SLUB/SLAB-bulk free to amortize cost (1).
> 
> I guess, the missing information is that we don't know what CPU the SKB
> were created on...
> 
> Where to store this CPU info?
> 
> (a) In struct sk_buff, in a cache-line that is already read on remote
> CPU in UDP code?
> 
> (b) In struct page, as SLUB alloc hand-out objects/SKBs on a per page
> basis, we could have SLUB store a hint about the CPU it was allocated
> on, and bet on returning to that CPU ? (might be bad to read the
> struct-page cache-line)

Bulking would be doable only for connected sockets; elsewhere it would
be difficult to assemble a burst long enough to amortize the handshake
with the remote CPU (spinlock + IPI needed?!?).

Would it be good enough for unconnected sockets to send a whole skb
burst back to one of the (several) ingress CPUs? E.g. peeking at the
CPU associated with the first skb inside the burst, we would somewhat
balance the load between the ingress CPUs.

Cheers,

Paolo
Tariq Toukan April 23, 2018, 8:13 a.m. UTC | #10
On 20/04/2018 4:48 PM, Jesper Dangaard Brouer wrote:
> 
> On Thu, 19 Apr 2018 06:47:10 -0700 Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On 04/19/2018 12:40 AM, Paolo Abeni wrote:
>>> On Wed, 2018-04-18 at 12:21 -0700, Eric Dumazet wrote:
>>>> On 04/18/2018 10:15 AM, Paolo Abeni wrote:
> [...]
>>>
>>> Any suggestions for better results are more than welcome!
>>
>> Yes, remote skb freeing. I mentioned this idea to Jesper and Tariq in
>> Seoul (netdev conference). Not tied to UDP, but a generic solution.
> 
> Yes, I remember.  I think... was it the idea, where you basically
> wanted to queue back SKBs to the CPU that allocated them, right?
> 
> Freeing an SKB on the same CPU that allocated it, have multiple
> advantages. (1) the SLUB allocator can use a non-atomic
> "cpu-local" (double)cmpxchg. (2) the 4 cache-lines memset cleared of
> the SKB stay local.  (3) the atomic SKB refcnt/users stay local.
> 
> We just have to avoid that queue back SKB's mechanism, doesn't cost
> more than the operations we expect to save.  Bulk transfer is an
> obvious approach.  For storing SKBs until they are returned, we already
> have a fast mechanism see napi_consume_skb calling _kfree_skb_defer,
> which SLUB/SLAB-bulk free to amortize cost (1).
> 
> I guess, the missing information is that we don't know what CPU the SKB
> were created on...
> 
> Where to store this CPU info?
> 
> (a) In struct sk_buff, in a cache-line that is already read on remote
> CPU in UDP code?
> 
> (b) In struct page, as SLUB alloc hand-out objects/SKBs on a per page
> basis, we could have SLUB store a hint about the CPU it was allocated
> on, and bet on returning to that CPU ? (might be bad to read the
> struct-page cache-line)
> 

I'm in favor of (a).

The pages of an SKB originate on the same CPU (guaranteed by NAPI),
so a single field in the SKB covers all of its fragments and there is
no need to read this info from every single page. This also keeps the
change local to the networking subsystem.

Best if we find a hole in struct sk_buff (for a u16?), or union it
with a mutually exclusive field.

Regards,
Tariq
Jesper Dangaard Brouer April 23, 2018, 8:52 a.m. UTC | #11
On Sun, 22 Apr 2018 13:22:58 +0200
Paolo Abeni <pabeni@redhat.com> wrote:

> On Fri, 2018-04-20 at 15:48 +0200, Jesper Dangaard Brouer wrote:
> > On Thu, 19 Apr 2018 06:47:10 -0700 Eric Dumazet <eric.dumazet@gmail.com> wrote:  
> > > On 04/19/2018 12:40 AM, Paolo Abeni wrote:  
> > > > On Wed, 2018-04-18 at 12:21 -0700, Eric Dumazet wrote:    
> > > > > On 04/18/2018 10:15 AM, Paolo Abeni wrote:  
> > 
> > [...]  
> > > > 
> > > > Any suggestions for better results are more than welcome!    
> > > 
> > > Yes, remote skb freeing. I mentioned this idea to Jesper and Tariq in
> > > Seoul (netdev conference). Not tied to UDP, but a generic solution.  
> > 
> > Yes, I remember.  I think... was it the idea, where you basically
> > wanted to queue back SKBs to the CPU that allocated them, right?
> > 
> > Freeing an SKB on the same CPU that allocated it, have multiple
> > advantages. (1) the SLUB allocator can use a non-atomic
> > "cpu-local" (double)cmpxchg. (2) the 4 cache-lines memset cleared of
> > the SKB stay local.  (3) the atomic SKB refcnt/users stay local.  
> 
> By the time the skb is returned to the ingress cpu, isn't that skb most
> probably out of the cache?

This is too simplistic a view.  You have to look at the cache
coherence state [1] of the individual cache lines (an SKB consists of 4
cache lines), and newer Intel CPUs [2] can "Forward (F)" cache lines
between caches.  The SKB cache line that holds the atomic refcnt/users
is the important one to analyze (the Read For Ownership (RFO) case).
Analyzing the other cache lines is actually more complicated due to
techniques like store buffers and invalidate queues.

[1] https://en.wikipedia.org/wiki/MESI_protocol
[2] https://en.wikipedia.org/wiki/MESIF_protocol

There is also a lot of detail in point (1) about how the SLUB
allocator works internally, and how it avoids bouncing the struct-page
cache line.  Some of the performance benefit from your current patch
also comes from this...


> > We just have to avoid that queue back SKB's mechanism, doesn't cost
> > more than the operations we expect to save.  Bulk transfer is an
> > obvious approach.  For storing SKBs until they are returned, we already
> > have a fast mechanism see napi_consume_skb calling _kfree_skb_defer,
> > which SLUB/SLAB-bulk free to amortize cost (1).
> > 
> > I guess, the missing information is that we don't know what CPU the SKB
> > were created on...
> > 
> > Where to store this CPU info?
> > 
> > (a) In struct sk_buff, in a cache-line that is already read on remote
> > CPU in UDP code?
> > 
> > (b) In struct page, as SLUB alloc hand-out objects/SKBs on a per page
> > basis, we could have SLUB store a hint about the CPU it was allocated
> > on, and bet on returning to that CPU ? (might be bad to read the
> > struct-page cache-line)  
> 
> Bulking would be doable only for connected sockets, elsewhere would be
> difficult to assemble a burst long enough to amortize the handshake
> with the remote CPU (spinlock + ipi needed ?!?)

We obviously need some level of bulking.

I would likely try to avoid any explicit IPI calls, and instead use a
queue like the ptr_ring queue, because it has good separation between
the cache lines used by the consumer and the producer (but it might be
overkill for this use case).

 
> Would be good enough for unconnected sockets sending a whole skb burst
> back to one of the (several) ingress CPU? e.g. peeking the CPU
> associated with the first skb inside the burst, we would somewhat
> balance the load between the ingress CPUs.

See Willem de Bruijn's suggestions...
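
To make the remote-freeing direction discussed in this thread more
concrete, here is a rough, untested sketch. It assumes a hypothetical
u16 skb->alloc_cpu hint (Tariq's option (a); no such field exists in
this patch or in the kernel tree under discussion) and uses a per-cpu
ptr_ring as the return queue, as suggested above; the names and the
drain hook are illustrative only:

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/ptr_ring.h>
#include <linux/skbuff.h>
#include <linux/smp.h>

#define SKB_RETURN_RING_SIZE	1024

/* one return queue per CPU: remote CPUs produce, the owner consumes */
static DEFINE_PER_CPU(struct ptr_ring, skb_return_ring);

static int __init skb_return_ring_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		if (ptr_ring_init(per_cpu_ptr(&skb_return_ring, cpu),
				  SKB_RETURN_RING_SIZE, GFP_KERNEL))
			return -ENOMEM;
	return 0;
}

/* run by the allocating CPU, e.g. right after the skb is built */
static inline void skb_record_alloc_cpu(struct sk_buff *skb)
{
	skb->alloc_cpu = raw_smp_processor_id();	/* hypothetical field */
}

/* run by the consumer instead of freeing the skb directly */
static void skb_return_to_alloc_cpu(struct sk_buff *skb)
{
	struct ptr_ring *ring = per_cpu_ptr(&skb_return_ring, skb->alloc_cpu);

	/* ptr_ring_produce() takes the producer spinlock; if the ring is
	 * full, fall back to an immediate (remote) free
	 */
	if (ptr_ring_produce(ring, skb))
		kfree_skb(skb);
}

/* drained by the owning CPU, e.g. from its NAPI poll / softirq, so that
 * the refcount and slab operations stay CPU-local
 */
static void skb_return_ring_drain(void)
{
	struct ptr_ring *ring = this_cpu_ptr(&skb_return_ring);
	struct sk_buff *skb;

	while ((skb = ptr_ring_consume(ring)) != NULL)
		kfree_skb(skb);
}

Whether the drain runs from NAPI poll, a dedicated softirq, or via the
bulk transfers discussed above is exactly the open question in this
thread; the sketch only shows where the per-CPU ownership information
would be recorded and used.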

Patch

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3fb0fbf4977d..bb1879cd51b4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -125,6 +125,26 @@  EXPORT_SYMBOL(sysctl_udp_mem);
 atomic_long_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
+struct skb_cache_entry {
+	int size;
+	int head;
+	struct sk_buff *skbs[0];
+};
+
+static struct skb_cache_entry __percpu *skb_cache;
+
+/* Under socket memory pressure, small packets are copied to a percpu cache
+ * before enqueuing them, to decrease the load on the receiver process.
+ * To avoid excessive copy overhead we use a small skb size threshold.
+ * Each percpu cache should be able to cope with at least a socket under
+ * memory pressure. It doesn't need to handle many of them: if there are
+ * more than a few sockets under memory pressure, the user-space is most
+ * probably too lazy and there is no gain using the cache
+ */
+#define UDP_CACHE_MAX_SKB_LEN		512
+#define UDP_CACHE_MIN_SIZE		_SK_MEM_PACKETS
+#define UDP_CACHE_MAX_SIZE		(_SK_MEM_PACKETS * 3)
+
 #define MAX_UDP_PORTS 65536
 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
@@ -1246,6 +1266,82 @@  static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
 	udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
 }
 
+static inline struct sk_buff *udp_cache_get_skb(void)
+{
+	struct skb_cache_entry *cache;
+	struct sk_buff *skb;
+
+	if (unlikely(!skb_cache))
+		return NULL;
+
+	cache = this_cpu_ptr(skb_cache);
+	skb = cache->skbs[cache->head];
+	if (refcount_read(&skb->users) != 1)
+		return NULL;
+
+	/* peeking with offset clones the queued skbs, we must check that all
+	 * the cloned references are gone.
+	 * This barrier is paired with the implicit one in skb_unref(), while
+	 * decrementing skb->users.
+	 */
+	rmb();
+	if (unlikely(skb->cloned)) {
+		if (atomic_read(&skb_shinfo(skb)->dataref) != 1)
+			return NULL;
+		skb->cloned = 0;
+	}
+
+	cache->head++;
+	if (cache->head == cache->size)
+		cache->head = 0;
+	refcount_inc(&skb->users);
+	return skb;
+}
+
+static bool udp_copy_to_cache(struct sk_buff **s)
+{
+	struct sk_buff *skb2, *skb = *s;
+	int hlen;
+
+	/* check if we can copy the specified skb into the cache: data + l3 +
+	 * l4 must be below the cached skb size and no head states must
+	 * be attached.
+	 */
+	hlen = skb_network_header_len(skb) + sizeof(struct udphdr);
+	if ((hlen + skb->len) >= UDP_CACHE_MAX_SKB_LEN || skb_sec_path(skb))
+		return false;
+
+	skb2 = udp_cache_get_skb();
+	if (!skb2)
+		return false;
+
+	/* copy the relevant header: we skip the head states - we know no state
+	 * is attached to 'skb' - the irrelevant part of the CB, and
+	 * skb->dev - will be overwritten later by udp_set_dev_scratch()
+	 */
+	skb2->tstamp	    = skb->tstamp;
+	*UDP_SKB_CB(skb2)   = *UDP_SKB_CB(skb);
+	skb2->queue_mapping = skb->queue_mapping;
+	memcpy(&skb2->headers_start, &skb->headers_start,
+	       offsetof(struct sk_buff, headers_end) -
+	       offsetof(struct sk_buff, headers_start));
+
+	/* skip the mac header, we don't need it */
+	skb_copy_bits(skb, -hlen, skb2->head, skb->len + hlen);
+
+	/* override the relevant offsets: skb2 starts from the network hdr */
+	skb2->transport_header = hlen - sizeof(struct udphdr);
+	skb2->network_header  = 0;
+	skb2->mac_header = 0;
+	skb2->data = skb2->head + hlen;
+	skb_set_tail_pointer(skb2, skb->len);
+	skb2->len = skb->len;
+	consume_skb(skb);
+
+	*s = skb2;
+	return true;
+}
+
 /* Idea of busylocks is to let producers grab an extra spinlock
  * to relieve pressure on the receive_queue spinlock shared by consumer.
  * Under flood, this means that only one producer can be in line
@@ -1290,9 +1386,12 @@  int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	 * - Reduce memory overhead and thus increase receive queue capacity
 	 * - Less cache line misses at copyout() time
 	 * - Less work at consume_skb() (less alien page frag freeing)
+	 * Additionally, processing skbs from the cache allows udp_recvmsg()
+	 * to 'free' them with a single atomic operation on a hot cacheline
 	 */
 	if (rmem > (sk->sk_rcvbuf >> 1)) {
-		skb_condense(skb);
+		if (!udp_copy_to_cache(&skb))
+			skb_condense(skb);
 
 		busy = busylock_acquire(sk);
 	}
@@ -2858,6 +2957,64 @@  static struct pernet_operations __net_initdata udp_sysctl_ops = {
 	.init	= udp_sysctl_init,
 };
 
+static void udp_free_cache(int nr)
+{
+	int i, cpu;
+
+	for_each_possible_cpu(cpu)
+		for (i = 0; i < nr; ++i)
+			kfree_skb(per_cpu_ptr(skb_cache, cpu)->skbs[i]);
+
+	free_percpu(skb_cache);
+	skb_cache = NULL;
+}
+
+static void udp_init_cache(unsigned long max_size)
+{
+	size_t skb_guessed_size, per_cpu_size;
+	unsigned long total_size = 0;
+	struct sk_buff *skb;
+	int i, nr, cpu = 0;
+
+	/* try to fill the cache only if we can allocate a reasonable number
+	 * of skbs
+	 */
+	skb_guessed_size = SKB_TRUESIZE(UDP_CACHE_MAX_SKB_LEN);
+	nr = min_t(unsigned long, UDP_CACHE_MAX_SIZE,
+		   max_size / (nr_cpu_ids * skb_guessed_size));
+	if (nr < UDP_CACHE_MIN_SIZE) {
+		pr_info("low memory, UDP skbs cache will not be allocated\n");
+		return;
+	}
+
+	per_cpu_size = nr * sizeof(void *) + sizeof(struct skb_cache_entry);
+	skb_cache = __alloc_percpu_gfp(per_cpu_size, L1_CACHE_BYTES,
+				       GFP_KERNEL | __GFP_ZERO);
+	if (!skb_cache) {
+		pr_warn("Can't allocate UDP skb cache\n");
+		return;
+	}
+
+	pr_info("allocating %d skbs on %d CPUs for rx cache\n", nr, nr_cpu_ids);
+	for (i = 0; i < nr && total_size < max_size; ++i) {
+		for_each_possible_cpu(cpu) {
+			skb = __alloc_skb(UDP_CACHE_MAX_SKB_LEN, GFP_KERNEL,
+					  0, cpu_to_node(cpu));
+			if (!skb) {
+				pr_warn("allocation failure, cache disabled");
+				udp_free_cache(nr);
+				return;
+			}
+
+			total_size += skb->truesize;
+			per_cpu_ptr(skb_cache, cpu)->skbs[i] = skb;
+		}
+	}
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(skb_cache, cpu)->size = nr;
+}
+
 void __init udp_init(void)
 {
 	unsigned long limit;
@@ -2871,6 +3028,7 @@  void __init udp_init(void)
 	sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
 
 	__udp_sysctl_init(&init_net);
+	udp_init_cache(sysctl_udp_mem[0] / 100 * PAGE_SIZE);
 
 	/* 16 spinlocks per cpu */
 	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;