
[bpf-next,7/9] bpf: Sample NRM BPF program to limit egress bw

Message ID 20190219053837.2086945-1-brakmo@fb.com
State Changes Requested
Delegated to: BPF Maintainers

Commit Message

Lawrence Brakmo Feb. 19, 2019, 5:38 a.m. UTC
A cgroup skb BPF program to limit cgroup output bandwidth.
It uses a modified virtual token bucket queue to limit average
egress bandwidth. The implementation uses credits instead of tokens.
Negative credits imply that queueing would have happened (this is
a virtual queue, so no queueing is done by it; however, queueing may
occur at the actual qdisc, which is not used for rate limiting).
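
For reference, the unit bookkeeping behind the credits (a back-of-the-envelope
derivation from the constants in nrm.h/nrm_kern.h below, nothing new): the
vqueue rate is stored as bytes per nanosecond scaled by 2^20, while the rate
in the shared map is in Mbps, hence the "* 128" conversion:

  1 Mbps = 125,000 bytes/s = 0.000125 bytes/ns
  0.000125 * 2^20 ~= 131, approximated as 128 in this sample
  credit added over delta ns = (delta * rate) >> 20   (i.e. CREDIT_PER_NS())

With that approximation a map value of 1024 gives exactly 0.125 bytes/ns,
i.e. the default 1Gbps.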

This implementation uses 3 thresholds, one to start marking packets and
the other two to drop packets:
                                 CREDIT
       - <--------------------------|------------------------> +
             |    |          |      0
             |  Large pkt    |
             |  drop thresh  |
  Small pkt drop             Mark threshold
      thresh

The effect of marking depends on the type of packet:
a) If the packet is ECN enabled and it is a TCP packet, then the packet
   is ECN marked. The current mark threshold is tuned for DCTCP.
b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
   to reduce the congestion window. The current implementation uses a linear
   distribution (0% probability at marking threshold, 100% probability
   at drop threshold); a short sketch of this follows the list.
c) If the packet is not a TCP packet, then it is dropped.
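
The single random draw that realizes the linear distribution in (b) is
sketched below (an illustrative helper mirroring the check in nrm_out_kern.c;
the name should_cwr is just for exposition):

  /* Returns true with probability (-credit - MARK_THRESH) / MARK_REGION_SIZE,
   * clamped to [0%, 100%]: 0% at the mark threshold, 100% at the
   * large-packet drop threshold.
   */
  static __always_inline bool should_cwr(int credit)
  {
          unsigned int rand = bpf_get_prandom_u32();

          return -credit >= MARK_THRESH + (rand % MARK_REGION_SIZE);
  }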

If the credit is below the drop threshold, the packet is dropped. If it
is a TCP packet, then it also calls tcp_cwr, since packets dropped by
a cgroup skb BPF program do not automatically trigger a call to
tcp_cwr in the current kernel code.

This BPF program actually uses 2 drop thresholds, one threshold
for larger packets (>= 120 bytes) and another for smaller packets. This
protects smaller packets such as SYNs, ACKs, etc.
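
For concreteness, with the defaults in nrm_kern.h (MAX_BYTES_PER_PACKET = 1500)
the thresholds evaluate to:

  MARK_THRESH           = 80 * 1500               = 120,000 bytes
  LARGE_PKT_DROP_THRESH = DROP_THRESH - 15 * 1500 = 577,500 bytes
  DROP_THRESH           = 80 * 5 * 1500           = 600,000 bytes
  LARGE_PKT_THRESH      = 120 bytes (packets longer than this are dropped
                          at the large-packet threshold)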

The default bandwidth limit is set at 1Gbps but this can be changed by
a user program through a shared BPF map. In addition, by default this BPF
program does not limit connections using loopback. This behavior can be
overwritten by the user program. There is also an option to calculate
some statistics, such as percent of packets marked or dropped, which
the user program can access.

A later patch provides such a program (nrm.c).
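
Since nrm.c is not part of this patch, the following is only a rough sketch
(assuming libbpf; names, headers and error handling are illustrative) of how
such a user program could set the rate and enable statistics through the
shared queue_stats array map:

  #include <bpf/bpf.h>
  #include <bpf/libbpf.h>
  #include "nrm.h"

  /* Illustrative only: write the rate (in Mbps) and the stats flag into
   * element 0 of the queue_stats array map.  Note this overwrites any
   * counters already accumulated; a real program would read-modify-write.
   */
  static int nrm_set_rate(struct bpf_object *obj, unsigned long mbps)
  {
          struct queue_stats qs = {};
          __u32 key = 0;
          int fd = bpf_object__find_map_fd_by_name(obj, "queue_stats");

          if (fd < 0)
                  return fd;
          qs.rate = mbps;         /* e.g. 1024 for the default 1Gbps */
          qs.stats = 1;           /* ask the BPF program to collect stats */
          return bpf_map_update_elem(fd, &key, &qs, BPF_ANY);
  }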

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 samples/bpf/Makefile       |   2 +
 samples/bpf/nrm.h          |  31 ++++++
 samples/bpf/nrm_kern.h     | 109 +++++++++++++++++++
 samples/bpf/nrm_out_kern.c | 213 +++++++++++++++++++++++++++++++++++++
 4 files changed, 355 insertions(+)
 create mode 100644 samples/bpf/nrm.h
 create mode 100644 samples/bpf/nrm_kern.h
 create mode 100644 samples/bpf/nrm_out_kern.c

Comments

Eric Dumazet Feb. 19, 2019, 6:29 p.m. UTC | #1
On 02/18/2019 09:38 PM, brakmo wrote:

> +
> +static __always_inline void get_nrm_pkt_info(struct bpf_sock *sk,
> +					     struct nrm_pkt_info *pkti)
> +{
> +	if (sk->family == AF_INET6 || sk->family == AF_INET) {
> +		pkti->is_ip = true;
> +		pkti->is_tcp = (sk->protocol == IPPROTO_TCP);
> +		if (pkti->is_tcp) {
> +			struct bpf_tcp_sock *tp;
> +
> +			tp = bpf_tcp_sock(sk);
> +			if (tp)
> +				pkti->ecn = tp->ecn_flags & TCP_ECN_OK;
> +			else
> +				pkti->ecn = 0;
> +		} else {
> +			pkti->ecn = 0;
> +		}
> +	} else {
> +		pkti->is_ip = false;
> +		pkti->is_tcp = false;
> +		pkti->ecn = 0;
> +	}
> +}
> 

This looks very strange.

ECN capability is per packet, and does not need access to the original
TCP socket really.

We definitely can use ECN with UDP packets.

IMO this sample looks like a work in progress.

EDT model allows to implement full shaping (not only virtual one)
by tweaking/advancing skb->tstamp
(see commit f11216b24219a "bpf: add skb->tstamp r/w access from tc clsact and cg skb progs").

Implementing shaping with the need of accessing TCP sockets seems a layering violation,
a lot of things can go wrong with this model.
For instance, you won't be able to offload this.
Lawrence Brakmo Feb. 19, 2019, 10:31 p.m. UTC | #2
On 2/19/19, 10:29 AM, "netdev-owner@vger.kernel.org on behalf of Eric Dumazet" <netdev-owner@vger.kernel.org on behalf of eric.dumazet@gmail.com> wrote:
    
    
    On 02/18/2019 09:38 PM, brakmo wrote:
    
    > +
    > +static __always_inline void get_nrm_pkt_info(struct bpf_sock *sk,
    > +					     struct nrm_pkt_info *pkti)
    > +{
    > +	if (sk->family == AF_INET6 || sk->family == AF_INET) {
    > +		pkti->is_ip = true;
    > +		pkti->is_tcp = (sk->protocol == IPPROTO_TCP);
    > +		if (pkti->is_tcp) {
    > +			struct bpf_tcp_sock *tp;
    > +
    > +			tp = bpf_tcp_sock(sk);
    > +			if (tp)
    > +				pkti->ecn = tp->ecn_flags & TCP_ECN_OK;
    > +			else
    > +				pkti->ecn = 0;
    > +		} else {
    > +			pkti->ecn = 0;
    > +		}
    > +	} else {
    > +		pkti->is_ip = false;
    > +		pkti->is_tcp = false;
    > +		pkti->ecn = 0;
    > +	}
    > +}
    > 
    
    This looks very strange.
    
    ECN capability is per packet, and does not need access to the original
    TCP socket really.

    We definitely can use ECN with UDP packets.
    
    IMO this sample looks like a work in progress.

This sample NRM program focuses on TCP; I should have made that more explicit. I originally checked the ECN bits on the packet, but someone pointed out that since I was focusing on TCP, it would be more efficient to just look at the TCP socket state (saving a read of the packet header). However, you are right that this could mark packets that should not be marked, such as pure ACKs. I will go back to the previous version that looks at the ECN bits in the header and no longer limit ECN marking to TCP.
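
Roughly, for the IPv4 case the check would look something like this inside
get_nrm_pkt_info() (illustrative only, not the final code; the offset assumes
the cgroup skb program sees the packet starting at the IP header):

  __u8 tos = 0;

  /* ECN is the low two bits of the IPv4 TOS byte:
   * 0 = Not-ECT, 1 = ECT(1), 2 = ECT(0), 3 = CE.
   */
  if (bpf_skb_load_bytes(skb, offsetof(struct iphdr, tos), &tos, 1) == 0)
          pkti->ecn = tos & 3;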
    
    EDT model allows to implement full shaping (not only virtual one)
    by tweaking/advancing skb->tstamp 
    (see commit f11216b24219a bpf: add skb->tstamp r/w access from tc clsact and cg skb progs)
    
I have a version of an NRM BPF program that uses EDT, so I know what you mean. However, I decided to send it as a separate patch (including an ingress sample program).
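
For context, the EDT variant boils down to computing an earliest departure
time per cgroup and writing it into skb->tstamp instead of marking or
dropping. A very rough sketch (heavily simplified, not the code I will post;
locking and bounds checks omitted, names illustrative):

  #define RATE_BPS (1000ULL * 1000 * 1000)        /* illustrative: 1Gbps */

  struct edt_state {
          __u64 next_tstamp;
  };

  /* per-cgroup state, e.g. cgroup storage as in this patch */
  struct bpf_map_def SEC("maps") edt_states = {
          .type = BPF_MAP_TYPE_CGROUP_STORAGE,
          .key_size = sizeof(struct bpf_cgroup_storage_key),
          .value_size = sizeof(struct edt_state),
  };

  SEC("cgroup/skb")
  int edt_egress(struct __sk_buff *skb)
  {
          struct edt_state *st = bpf_get_local_storage(&edt_states, 0);
          __u64 now = bpf_ktime_get_ns();
          /* wire time of this packet at RATE_BPS */
          __u64 len_ns = (__u64)skb->len * 8ULL * 1000000000ULL / RATE_BPS;
          __u64 next = st->next_tstamp;

          if (next < now)
                  next = now;
          skb->tstamp = next;       /* sch_fq releases it no earlier than this */
          st->next_tstamp = next + len_ns;
          return 1;                 /* allow */
  }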

    Implementing shaping with the need of accessing TCP sockets seems a layering violation,
    a lot of things can go wrong with this model.
    For instance, you won't be able to offload this.

On the other hand, there are also advantages to having access to the TCP socket. For example, one has more tools to affect the TCP rate (like the proposed helper bpf_tcp_enter_cwr() which does not require dropping the packet). We will need more experience to fully understand the tradeoffs between the various implementations of rate limiting and/or shaping.

Thank you for your feedback.

Patch

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 0cf3347c7443..86633af239d4 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -169,6 +169,7 @@  always += xdpsock_kern.o
 always += xdp_fwd_kern.o
 always += task_fd_query_kern.o
 always += xdp_sample_pkts_kern.o
+always += nrm_out_kern.o
 
 KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
 KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -268,6 +269,7 @@  $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
 $(src)/*.c: verify_target_bpf $(LIBBPF)
 
 $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
+$(obj)/nrm_out_kern.o: $(src)/nrm.h $(src)/nrm_kern.h
 
 # asm/sysreg.h - inline assembly used by it is incompatible with llvm.
 # But, there is no easy way to fix it, so just exclude it since it is
diff --git a/samples/bpf/nrm.h b/samples/bpf/nrm.h
new file mode 100644
index 000000000000..7aa113f8b9df
--- /dev/null
+++ b/samples/bpf/nrm.h
@@ -0,0 +1,31 @@ 
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Include file for NRM programs
+ */
+struct bpf_vqueue {
+	struct bpf_spin_lock lock;
+	/* 4 byte hole */
+	unsigned long long lasttime;	/* In ns */
+	int credit;			/* In bytes */
+	unsigned int rate;		/* In bytes per NS << 20 */
+};
+
+struct queue_stats {
+	unsigned long rate;		/* in Mbps*/
+	unsigned long stats:1,		/* get NRM stats (marked, dropped,..) */
+		loopback:1;		/* also limit flows using loopback */
+	unsigned long long pkts_marked;
+	unsigned long long bytes_marked;
+	unsigned long long pkts_dropped;
+	unsigned long long bytes_dropped;
+	unsigned long long pkts_total;
+	unsigned long long bytes_total;
+	unsigned long long firstPacketTime;
+	unsigned long long lastPacketTime;
+};
diff --git a/samples/bpf/nrm_kern.h b/samples/bpf/nrm_kern.h
new file mode 100644
index 000000000000..438a14e2ddba
--- /dev/null
+++ b/samples/bpf/nrm_kern.h
@@ -0,0 +1,109 @@ 
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Include file for sample NRM BPF programs
+ */
+#define KBUILD_MODNAME "foo"
+#include <stddef.h>
+#include <stdbool.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/filter.h>
+#include <uapi/linux/pkt_cls.h>
+#include <net/ipv6.h>
+#include "bpf_endian.h"
+#include "bpf_helpers.h"
+#include "nrm.h"
+
+#define DROP_PKT	0
+#define ALLOW_PKT	1
+#define TCP_ECN_OK	1
+
+#define NRM_DEBUG 0  // Set to 1 to enable debugging
+#if NRM_DEBUG
+#define bpf_printk(fmt, ...)					\
+({								\
+	char ____fmt[] = fmt;					\
+	bpf_trace_printk(____fmt, sizeof(____fmt),		\
+			 ##__VA_ARGS__);			\
+})
+#else
+#define bpf_printk(fmt, ...)
+#endif
+
+#define INITIAL_CREDIT_PACKETS	100
+#define MAX_BYTES_PER_PACKET	1500
+#define MARK_THRESH		(80 * MAX_BYTES_PER_PACKET)
+#define DROP_THRESH		(80 * 5 * MAX_BYTES_PER_PACKET)
+#define LARGE_PKT_DROP_THRESH	(DROP_THRESH - (15 * MAX_BYTES_PER_PACKET))
+#define MARK_REGION_SIZE	(LARGE_PKT_DROP_THRESH - MARK_THRESH)
+#define LARGE_PKT_THRESH	120
+#define MAX_CREDIT		(100 * MAX_BYTES_PER_PACKET)
+#define INIT_CREDIT		(INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET)
+
+// rate in bytes per ns << 20
+#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
+
+struct bpf_map_def SEC("maps") queue_state = {
+	.type = BPF_MAP_TYPE_CGROUP_STORAGE,
+	.key_size = sizeof(struct bpf_cgroup_storage_key),
+	.value_size = sizeof(struct bpf_vqueue),
+};
+BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key,
+		     struct bpf_vqueue);
+
+struct bpf_map_def SEC("maps") queue_stats = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(struct queue_stats),
+	.max_entries = 1,
+};
+BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct queue_stats);
+
+struct nrm_pkt_info {
+	bool	is_ip;
+	bool	is_tcp;
+	short	ecn;
+};
+
+static __always_inline void get_nrm_pkt_info(struct bpf_sock *sk,
+					     struct nrm_pkt_info *pkti)
+{
+	if (sk->family == AF_INET6 || sk->family == AF_INET) {
+		pkti->is_ip = true;
+		pkti->is_tcp = (sk->protocol == IPPROTO_TCP);
+		if (pkti->is_tcp) {
+			struct bpf_tcp_sock *tp;
+
+			tp = bpf_tcp_sock(sk);
+			if (tp)
+				pkti->ecn = tp->ecn_flags & TCP_ECN_OK;
+			else
+				pkti->ecn = 0;
+		} else {
+			pkti->ecn = 0;
+		}
+	} else {
+		pkti->is_ip = false;
+		pkti->is_tcp = false;
+		pkti->ecn = 0;
+	}
+}
+
+static __always_inline void init_bpf_vqueue(struct bpf_vqueue *qdp, int rate)
+{
+	bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
+	qdp->lasttime = bpf_ktime_get_ns();
+	qdp->credit = INIT_CREDIT;
+	qdp->rate = rate * 128;
+}
diff --git a/samples/bpf/nrm_out_kern.c b/samples/bpf/nrm_out_kern.c
new file mode 100644
index 000000000000..591a30a18a22
--- /dev/null
+++ b/samples/bpf/nrm_out_kern.c
@@ -0,0 +1,213 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample Network Resource Manager (NRM) BPF program.
+ *
+ * A cgroup skb BPF egress program to limit cgroup output bandwidth.
+ * It uses a modified virtual token bucket queue to limit average
+ * egress bandwidth. The implementation uses credits instead of tokens.
+ * Negative credits imply that queueing would have happened (this is
+ * a virtual queue, so no queueing is done by it; however, queueing may
+ * occur at the actual qdisc, which is not used for rate limiting).
+ *
+ * This implementation uses 3 thresholds, one to start marking packets and
+ * the other two to drop packets:
+ *                                  CREDIT
+ *        - <--------------------------|------------------------> +
+ *              |    |          |      0
+ *              |  Large pkt    |
+ *              |  drop thresh  |
+ *   Small pkt drop             Mark threshold
+ *       thresh
+ *
+ * The effect of marking depends on the type of packet:
+ * a) If the packet is ECN enabled and it is a TCP packet, then the packet
+ *    is ECN marked.
+ * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
+ *    to reduce the congestion window. The current implementation uses a linear
+ *    distribution (0% probability at marking threshold, 100% probability
+ *    at drop threshold).
+ * c) If the packet is not a TCP packet, then it is dropped.
+ *
+ * If the credit is below the drop threshold, the packet is dropped. If it
+ * is a TCP packet, then it also calls tcp_cwr, since packets dropped by
+ * a cgroup skb BPF program do not automatically trigger a call to
+ * tcp_cwr in the current kernel code.
+ *
+ * This BPF program actually uses 2 drop thresholds, one threshold
+ * for larger packets (>= 120 bytes) and another for smaller packets. This
+ * protects smaller packets such as SYNs, ACKs, etc.
+ *
+ * The default bandwidth limit is set at 1Gbps but this can be changed by
+ * a user program through a shared BPF map. In addition, by default this BPF
+ * program does not limit connections using loopback. This behavior can be
+ * overwritten by the user program. There is also an option to calculate
+ * some statistics, such as percent of packets marked or dropped, which
+ * the user program can access.
+ *
+ * A later patch provides such a program (nrm.c).
+ */
+
+#include "nrm_kern.h"
+
+SEC("cgroup/skb")
+int _nrm_out_cg(struct __sk_buff *skb)
+{
+	struct bpf_sock *sk;
+	struct nrm_pkt_info pkti;
+	int len = skb->len;
+	unsigned int queue_index = 0;
+	unsigned long long curtime;
+	int credit;
+	signed long long delta = 0, zero = 0;
+	int max_credit = MAX_CREDIT;
+	bool mark_flag = false;
+	bool drop_flag = false;
+	bool cwr_flag = false;
+	struct bpf_vqueue *qdp;
+	struct queue_stats *qsp = NULL;
+	int rv = ALLOW_PKT;
+	unsigned int payload;
+	unsigned int pkts;
+
+	qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
+	if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
+		return ALLOW_PKT;
+
+	sk = skb->sk;
+	if (!sk)
+		return ALLOW_PKT;
+	sk = bpf_sk_fullsock(sk);
+	if (!sk)
+		return ALLOW_PKT;
+
+	get_nrm_pkt_info(sk, &pkti);
+
+	// We may want to account for the length of headers in the len
+	// calculation, like ETH header + overhead, especially if it
+	// is a GSO packet. But I am not doing it right now.
+
+	qdp = bpf_get_local_storage(&queue_state, 0);
+	if (!qdp)
+		return ALLOW_PKT;
+	else if (qdp->lasttime == 0)
+		init_bpf_vqueue(qdp, 1024);
+
+	curtime = bpf_ktime_get_ns();
+
+	// Begin critical section
+	bpf_spin_lock(&qdp->lock);
+	credit = qdp->credit;
+	delta = curtime - qdp->lasttime;
+	/* delta < 0 implies that another process with a curtime greater
+	 * than ours beat us to the critical section and already added
+	 * the new credit, so we should not add it ourselves
+	 */
+	if (delta > 0) {
+		qdp->lasttime = curtime;
+		credit += CREDIT_PER_NS(delta, qdp->rate);
+		if (credit > MAX_CREDIT)
+			credit = MAX_CREDIT;
+	}
+	credit -= len;
+	qdp->credit = credit;
+	bpf_spin_unlock(&qdp->lock);
+	// End critical section
+
+	// Check if we should update rate
+	if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {
+		qdp->rate = qsp->rate * 128;
+		bpf_printk("Updating rate: %d (1sec:%llu bits)\n",
+			   (int)qdp->rate,
+			   CREDIT_PER_NS(1000000000, qdp->rate) * 8);
+	}
+
+	// Set flags (drop, mark, cwr)
+	if (pkti.is_ip) {
+		if (credit < -DROP_THRESH ||
+		    (len > LARGE_PKT_THRESH &&
+		     credit < -LARGE_PKT_DROP_THRESH)) {
+			drop_flag = true;
+			mark_flag = false;
+			if (pkti.is_tcp) {
+				if (pkti.ecn == 0)
+					cwr_flag = true;
+			}
+		} else if (credit < 0) {
+			if (pkti.is_tcp) {
+				if (credit < -MARK_THRESH)
+					mark_flag = true;
+				else
+					mark_flag = false;
+			} else {
+				mark_flag = true;
+			}
+		} else {
+			mark_flag = false;
+		}
+
+		if (mark_flag) {
+			if (pkti.is_tcp && pkti.ecn > 0) {
+				bpf_skb_set_ecn(skb, 3);
+			} else if (pkti.is_tcp) {
+				unsigned int rand = bpf_get_prandom_u32();
+
+				if (-credit >= MARK_THRESH +
+				    (rand % MARK_REGION_SIZE)) {
+					// Do cong avoidance
+					cwr_flag = true;
+				}
+			} else if (len > LARGE_PKT_THRESH) {
+				// Problem if too many small packets?
+				drop_flag = true;
+				mark_flag = false;
+			}
+		}
+
+		if (pkti.is_tcp) {
+			struct bpf_tcp_sock *tp;
+
+			tp = bpf_tcp_sock(sk);
+			if (tp && drop_flag)
+				bpf_tcp_check_probe_timer(tp, 20000);
+			if (tp && cwr_flag)
+				bpf_tcp_enter_cwr(tp);
+		}
+		if (drop_flag)
+			rv = DROP_PKT;
+	} else if (credit < -MARK_THRESH) {
+		drop_flag = true;
+		rv =  DROP_PKT;
+	}
+
+	if (qsp != NULL) {
+		// Following is needed for work conserving
+		__sync_add_and_fetch(&(qsp->bytes_total), len);
+		if (qsp->stats) {
+			// Optionally update statistics
+			if (qsp->firstPacketTime == 0)
+				qsp->firstPacketTime = curtime;
+			qsp->lastPacketTime = curtime;
+			__sync_add_and_fetch(&(qsp->pkts_total), 1);
+			if (mark_flag) {
+				__sync_add_and_fetch(&(qsp->pkts_marked), 1);
+				__sync_add_and_fetch(&(qsp->bytes_marked), len);
+			}
+			if (drop_flag) {
+				__sync_add_and_fetch(&(qsp->pkts_dropped), 1);
+				__sync_add_and_fetch(&(qsp->bytes_dropped),
+						     len);
+			}
+		}
+	}
+
+	if (rv == DROP_PKT)
+		__sync_add_and_fetch(&(qdp->credit), len);
+
+	return rv;
+}
+char _license[] SEC("license") = "GPL";