[RFC,bpf-next,2/7] bpf: extend bpf_pcap support to tracing programs
diff mbox series

Message ID 1567892444-16344-3-git-send-email-alan.maguire@oracle.com
State RFC
Delegated to: BPF Maintainers
Headers show
Series
  • bpf: packet capture helpers, bpftool support
Related show

Commit Message

Alan Maguire Sept. 7, 2019, 9:40 p.m. UTC
packet capture is especially valuable in tracing contexts, so
extend bpf_pcap helper to take a tracing-derived skb pointer
as an argument.

In the case of tracing programs, the starting protocol
(corresponding to libpcap DLT_* values; 1 for Ethernet, 12 for
IP, etc) needs to be specified and should reflect the protocol
type which is pointed to by the skb->data pointer; i.e. the
start of the packet.  This can derived in a limited set of cases,
but should be specified where possible.  For skb and xdp programs
this protocol will nearly always be 1 (BPF_PCAP_TYPE_ETH).

Example usage for a tracing program, where we use a
struct bpf_pcap_hdr array map to pass in preferences for
protocol and max len:

struct bpf_map_def SEC("maps") pcap_conf_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(struct bpf_pcap_hdr),
	.max_entries = 1,
};

struct bpf_map_def SEC("maps") pcap_map = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(int),
	.max_entries = 1024,
};

SEC("kprobe/kfree_skb")
int probe_kfree_skb(struct pt_regs *ctx)
{
	struct bpf_pcap_hdr *conf;
	int key = 0;

	conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
	if (!conf)
		return 0;

	bpf_pcap((void *)PT_REGS_PARM1(ctx), conf->cap_len, &pcap_map,
		 conf->protocol, BPF_F_CURRENT_CPU);
	return 0;
}

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 include/uapi/linux/bpf.h |  21 ++++-
 kernel/trace/bpf_trace.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 233 insertions(+), 2 deletions(-)

Comments

Yonghong Song Sept. 8, 2019, 10:18 p.m. UTC | #1
On 9/7/19 2:40 PM, Alan Maguire wrote:
> packet capture is especially valuable in tracing contexts, so
> extend bpf_pcap helper to take a tracing-derived skb pointer
> as an argument.
> 
> In the case of tracing programs, the starting protocol
> (corresponding to libpcap DLT_* values; 1 for Ethernet, 12 for
> IP, etc) needs to be specified and should reflect the protocol
> type which is pointed to by the skb->data pointer; i.e. the
> start of the packet.  This can derived in a limited set of cases,
> but should be specified where possible.  For skb and xdp programs
> this protocol will nearly always be 1 (BPF_PCAP_TYPE_ETH).
> 
> Example usage for a tracing program, where we use a
> struct bpf_pcap_hdr array map to pass in preferences for
> protocol and max len:
> 
> struct bpf_map_def SEC("maps") pcap_conf_map = {
> 	.type = BPF_MAP_TYPE_ARRAY,
> 	.key_size = sizeof(int),
> 	.value_size = sizeof(struct bpf_pcap_hdr),
> 	.max_entries = 1,
> };
> 
> struct bpf_map_def SEC("maps") pcap_map = {
> 	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> 	.key_size = sizeof(int),
> 	.value_size = sizeof(int),
> 	.max_entries = 1024,
> };
> 
> SEC("kprobe/kfree_skb")
> int probe_kfree_skb(struct pt_regs *ctx)
> {
> 	struct bpf_pcap_hdr *conf;
> 	int key = 0;
> 
> 	conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
> 	if (!conf)
> 		return 0;
> 
> 	bpf_pcap((void *)PT_REGS_PARM1(ctx), conf->cap_len, &pcap_map,
> 		 conf->protocol, BPF_F_CURRENT_CPU);
> 	return 0;
> }
> 
> Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
> ---
[...]
> @@ -2977,6 +2992,8 @@ enum bpf_func_id {
>   
>   /* BPF_FUNC_pcap flags */
>   #define	BPF_F_PCAP_ID_MASK		0xffffffffffff
> +#define BPF_F_PCAP_ID_IIFINDEX		(1ULL << 48)
> +#define BPF_F_PCAP_STRICT_TYPE         (1ULL << 56)
>   
>   /* Mode for BPF_FUNC_skb_adjust_room helper. */
>   enum bpf_adj_room_mode {
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ca1255d..311883b 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -13,6 +13,8 @@
>   #include <linux/kprobes.h>
>   #include <linux/syscalls.h>
>   #include <linux/error-injection.h>
> +#include <linux/skbuff.h>
> +#include <linux/ip.h>
>   
>   #include <asm/tlb.h>
>   
> @@ -530,6 +532,216 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
>   	return __bpf_perf_event_output(regs, map, flags, sd);
>   }
>   
> +/* Essentially just skb_copy_bits() using probe_kernel_read() where needed. */
> +static unsigned long bpf_trace_skb_copy(void *tobuf, const void *from,
> +					unsigned long offset,
> +					unsigned long len)
> +{
> +	const struct sk_buff *frag_iterp, *skb = from;
> +	struct skb_shared_info *shinfop, shinfo;
> +	struct sk_buff frag_iter;
> +	unsigned long copy, start;
> +	void *to = tobuf;
> +	int i, ret;
> +
> +	start = skb_headlen(skb);
> +
> +	copy = start - offset;
> +	if (copy > 0) {
> +		if (copy > len)
> +			copy = len;
> +		ret = probe_kernel_read(to, skb->data, copy);
> +		if (unlikely(ret < 0))
> +			goto out;
> +		len -= copy;
> +		if (len == 0)
> +			return 0;
> +		offset += copy;
> +		to += copy;
> +	}
> +
> +	if (skb->data_len == 0)
> +		goto out;
> +
> +	shinfop = skb_shinfo(skb);
> +
> +	ret = probe_kernel_read(&shinfo, shinfop, sizeof(shinfo));
> +	if (unlikely(ret < 0))
> +		goto out;
> +
> +	if (shinfo.nr_frags > MAX_SKB_FRAGS) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +	for (i = 0; i < shinfo.nr_frags; i++) {
> +		skb_frag_t *f = &shinfo.frags[i];
> +		int end;
> +
> +		if (start > offset + len) {
> +			ret = -E2BIG;
> +			goto out;
> +		}
> +
> +		end = start + skb_frag_size(f);
> +		copy = end - offset;
> +		if (copy > 0) {
> +			u32 poff, p_len, copied;
> +			struct page *p;
> +			u8 *vaddr;
> +
> +			if (copy > len)
> +				copy = len;
> +
> +			skb_frag_foreach_page(f,
> +					      skb_frag_off(f) + offset - start,
> +					      copy, p, poff, p_len, copied) {
> +
> +				vaddr = kmap_atomic(p);
> +				ret = probe_kernel_read(to + copied,
> +							vaddr + poff, p_len);
> +				kunmap_atomic(vaddr);
> +
> +				if (unlikely(ret < 0))
> +					goto out;
> +			}
> +			len -= copy;
> +			if (len == 0)
> +				return 0;
> +			offset += copy;
> +			to += copy;
> +		}
> +		start = end;
> +	}
> +
> +	for (frag_iterp = shinfo.frag_list; frag_iterp;
> +	     frag_iterp = frag_iter.next) {
> +		int end;
> +
> +		if (start > offset + len) {
> +			ret = -E2BIG;
> +			goto out;
> +		}
> +		ret = probe_kernel_read(&frag_iter, frag_iterp,
> +					sizeof(frag_iter));
> +		if (ret)
> +			goto out;
> +
> +		end = start + frag_iter.len;
> +		copy = end - offset;
> +		if (copy > 0) {
> +			if (copy > len)
> +				copy = len;
> +			ret = bpf_trace_skb_copy(to, &frag_iter,
> +						offset - start,
> +						copy);
> +			if (ret)
> +				goto out;
> +
> +			len -= copy;
> +			if (len == 0)
> +				return 0;
> +			offset += copy;
> +			to += copy;
> +		}
> +		start = end;
> +	}
> +out:
> +	if (ret)
> +		memset(tobuf, 0, len);
> +
> +	return ret;
> +}

For net side bpf_perf_event_output, we have
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
                                   unsigned long off, unsigned long len)
{
         void *ptr = skb_header_pointer(skb, off, len, dst_buff);

         if (unlikely(!ptr))
                 return len;
         if (ptr != dst_buff)
                 memcpy(dst_buff, ptr, len);

         return 0;
}

BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map 
*, map,
            u64, flags, void *, meta, u64, meta_size)
{
         u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;

         if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
                 return -EINVAL;
         if (unlikely(skb_size > skb->len))
                 return -EFAULT;

         return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
                                 bpf_skb_copy);
}

It does not really consider output all the frags.
I understand that to get truly all packet data, frags should be
considered, but seems we did not do it before? I am wondering
whether we need to do here.

If we indeed do not need to handle frags here, I think maybe
bpf_probe_read() in existing bpf kprobe function should be
enough, we do not need this helper?

> +
> +/* Derive protocol for some of the easier cases.  For tracing, a probe point
> + * may be dealing with packets in various states. Common cases are IP
> + * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
> + * (_PCAP_TYPE_ETH).  For other cases the caller must specify the
> + * protocol they expect.  Other heuristics for packet identification
> + * should be added here as needed, since determining the packet type
> + * ensures we do not capture packets that fail to match the desired
> + * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
> + */
> +static inline int bpf_skb_protocol_get(struct sk_buff *skb)
> +{
> +	switch (htons(skb->protocol)) {
> +	case ETH_P_IP:
> +	case ETH_P_IPV6:
> +		if (skb_network_header(skb) == skb->data)
> +			return BPF_PCAP_TYPE_IP;
> +		else
> +			return BPF_PCAP_TYPE_ETH;
> +	default:
> +		return BPF_PCAP_TYPE_UNSET;
> +	}
> +}
> +
> +BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
> +	   int, protocol_wanted, u64, flags)

Up to now, for helpers, verifier has a way to verifier it is used 
properly regarding to the context. For example, for xdp version
perf_event_output, the help prototype,
   BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct 
bpf_map *, map,
            u64, flags, void *, meta, u64, meta_size)
the verifier is able to guarantee that the first parameter
has correct type xdp_buff, not something from type cast.
   .arg1_type      = ARG_PTR_TO_CTX,

This helper, in the below we have
   .arg1_type	= ARG_ANYTHING,

So it is not really enforced. Bringing BTF can help, but type
name matching typically bad.


> +{
> +	struct bpf_pcap_hdr pcap;
> +	struct sk_buff skb;
> +	int protocol;
> +	int ret;
> +
> +	if (unlikely(flags & ~(BPF_F_PCAP_ID_IIFINDEX | BPF_F_PCAP_ID_MASK |
> +			       BPF_F_PCAP_STRICT_TYPE)))
> +		return -EINVAL;
> +
> +	ret = probe_kernel_read(&skb, data, sizeof(skb));
> +	if (unlikely(ret < 0))
> +		return ret;
> +
> +	/* Sanity check skb len in case we get bogus data. */
> +	if (unlikely(!skb.len))
> +		return -ENOENT;
> +	if (unlikely(skb.len > GSO_MAX_SIZE || skb.data_len > skb.len))
> +		return -E2BIG;
> +
> +	protocol = bpf_skb_protocol_get(&skb);
> +
> +	if (protocol_wanted == BPF_PCAP_TYPE_UNSET) {
> +		/* If we cannot determine protocol type, bail. */
> +		if (protocol == BPF_PCAP_TYPE_UNSET)
> +			return -EPROTO;
> +	} else {
> +		/* if we determine protocol type, and it's not what we asked
> +		 * for _and_ we are in strict mode, bail.  Otherwise we assume
> +		 * the packet is the requested protocol type and drive on.
> +		 */
> +		if (flags & BPF_F_PCAP_STRICT_TYPE &&
> +		    protocol != BPF_PCAP_TYPE_UNSET &&
> +		    protocol != protocol_wanted)
> +			return -EPROTO;
> +		protocol = protocol_wanted;
> +	}
> +
> +	/* If we specified a matching incoming ifindex, bail if not a match. */
> +	if (flags & BPF_F_PCAP_ID_IIFINDEX) {
> +		int iif = flags & BPF_F_PCAP_ID_MASK;
> +
> +		if (iif && skb.skb_iif != iif)
> +			return -ENOENT;
> +	}
> +
> +	ret = bpf_pcap_prepare(protocol, size, skb.len, flags, &pcap);
> +	if (ret)
> +		return ret;
> +
> +	return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
> +				&skb, pcap.cap_len, bpf_trace_skb_copy);
> +}
> +
> +static const struct bpf_func_proto bpf_trace_pcap_proto = {
> +	.func		= bpf_trace_pcap,
> +	.gpl_only	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_ANYTHING,
> +	.arg2_type	= ARG_ANYTHING,
> +	.arg3_type	= ARG_CONST_MAP_PTR,
> +	.arg4_type	= ARG_ANYTHING,
> +	.arg5_type	= ARG_ANYTHING,
> +};
> +
>   BPF_CALL_0(bpf_get_current_task)
>   {
>   	return (long) current;
> @@ -709,6 +921,8 @@ static void do_bpf_send_signal(struct irq_work *entry)
>   #endif
>   	case BPF_FUNC_send_signal:
>   		return &bpf_send_signal_proto;
> +	case BPF_FUNC_pcap:
> +		return &bpf_trace_pcap_proto;
>   	default:
>   		return NULL;
>   	}
>
Alan Maguire Sept. 9, 2019, 10:25 p.m. UTC | #2
On Sun, 8 Sep 2019, Yonghong Song wrote:
 
> For net side bpf_perf_event_output, we have
> static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
>                                    unsigned long off, unsigned long len)
> {
>          void *ptr = skb_header_pointer(skb, off, len, dst_buff);
> 
>          if (unlikely(!ptr))
>                  return len;
>          if (ptr != dst_buff)
>                  memcpy(dst_buff, ptr, len);
> 
>          return 0;
> }
> 
> BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map 
> *, map,
>             u64, flags, void *, meta, u64, meta_size)
> {
>          u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
> 
>          if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
>                  return -EINVAL;
>          if (unlikely(skb_size > skb->len))
>                  return -EFAULT;
> 
>          return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
>                                  bpf_skb_copy);
> }
> 
> It does not really consider output all the frags.
> I understand that to get truly all packet data, frags should be
> considered, but seems we did not do it before? I am wondering
> whether we need to do here.

Thanks for the feedback! In experimenting with packet capture,
my original hope was to keep things simple and avoid fragment parsing
if possible. However if scatter-gather is enabled for the networking
device, or indeed if it's running in a VM it turns out a lot of the
interesting packet data ends up in the fragments on transmit (ssh
headers, http headers etc).  So I think it would be worth considering
adding support for fragment traversal.  It's not needed as much
in the skb program case - we can always pullup the skb - but in
the tracing situation we probably wouldn't want to do something
that invasive in tracing context.

Fragment traversal might be worth breaking out as a separate patchset, 
perhaps triggered by a specific flag to bpf_skb_event_output?

Feedback from folks at Linux Plumbers (I hope I'm summarizing correctly) 
seemed to agree with what you mentioned WRT the first patch in this 
series.  The gist was we probably don't want to force the metadata to be a 
specific packet capture type; we'd rather use the existing perf event 
mechanisms and if we are indeed doing packet capture, simply specify that 
data in the program as metadata. 

I'd be happy with that approach myself if I could capture skb 
fragments in tracing programs - being able to do that would give 
equivalent functionality to what I proposed but without having a packet 
capture-specific helper.
> 
> If we indeed do not need to handle frags here, I think maybe
> bpf_probe_read() in existing bpf kprobe function should be
> enough, we do not need this helper?
> 

Certainly for many use cases, that will get you most of what you need - 
particularly if you're just looking at L2 to L4 data. For full packet 
capture however I think we may need to think about fragment traversal.

> > +
> > +/* Derive protocol for some of the easier cases.  For tracing, a probe point
> > + * may be dealing with packets in various states. Common cases are IP
> > + * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
> > + * (_PCAP_TYPE_ETH).  For other cases the caller must specify the
> > + * protocol they expect.  Other heuristics for packet identification
> > + * should be added here as needed, since determining the packet type
> > + * ensures we do not capture packets that fail to match the desired
> > + * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
> > + */
> > +static inline int bpf_skb_protocol_get(struct sk_buff *skb)
> > +{
> > +	switch (htons(skb->protocol)) {
> > +	case ETH_P_IP:
> > +	case ETH_P_IPV6:
> > +		if (skb_network_header(skb) == skb->data)
> > +			return BPF_PCAP_TYPE_IP;
> > +		else
> > +			return BPF_PCAP_TYPE_ETH;
> > +	default:
> > +		return BPF_PCAP_TYPE_UNSET;
> > +	}
> > +}
> > +
> > +BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
> > +	   int, protocol_wanted, u64, flags)
> 
> Up to now, for helpers, verifier has a way to verifier it is used 
> properly regarding to the context. For example, for xdp version
> perf_event_output, the help prototype,
>    BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct 
> bpf_map *, map,
>             u64, flags, void *, meta, u64, meta_size)
> the verifier is able to guarantee that the first parameter
> has correct type xdp_buff, not something from type cast.
>    .arg1_type      = ARG_PTR_TO_CTX,
> 
> This helper, in the below we have
>    .arg1_type	= ARG_ANYTHING,
> 
> So it is not really enforced. Bringing BTF can help, but type
> name matching typically bad.
> 
> 
One thing we were discussing - and I think this is similar to what
you're suggesting - is to investigate if there might be a way to
leverage BTF to provide additional guarantees that the tracing
data we are handling is indeed an skb.  Specifically if we
trace a kprobe function argument or a tracepoint function, and
if we had that guarantee, we could perhaps invoke the skb-style
perf event output function (trace both the skb data and the metadata).
The challenge would be how to do that type-based matching; we'd
need the function argument information from BTF _and_ need to
somehow associate it at probe attach time. 

Thanks again for looking at the code!

Alan
Yonghong Song Sept. 10, 2019, 7:43 a.m. UTC | #3
On 9/9/19 11:25 PM, Alan Maguire wrote:
> On Sun, 8 Sep 2019, Yonghong Song wrote:
>   
>> For net side bpf_perf_event_output, we have
>> static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
>>                                     unsigned long off, unsigned long len)
>> {
>>           void *ptr = skb_header_pointer(skb, off, len, dst_buff);
>>
>>           if (unlikely(!ptr))
>>                   return len;
>>           if (ptr != dst_buff)
>>                   memcpy(dst_buff, ptr, len);
>>
>>           return 0;
>> }
>>
>> BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map
>> *, map,
>>              u64, flags, void *, meta, u64, meta_size)
>> {
>>           u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
>>
>>           if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
>>                   return -EINVAL;
>>           if (unlikely(skb_size > skb->len))
>>                   return -EFAULT;
>>
>>           return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
>>                                   bpf_skb_copy);
>> }
>>
>> It does not really consider output all the frags.
>> I understand that to get truly all packet data, frags should be
>> considered, but seems we did not do it before? I am wondering
>> whether we need to do here.
> 
> Thanks for the feedback! In experimenting with packet capture,
> my original hope was to keep things simple and avoid fragment parsing
> if possible. However if scatter-gather is enabled for the networking
> device, or indeed if it's running in a VM it turns out a lot of the
> interesting packet data ends up in the fragments on transmit (ssh
> headers, http headers etc).  So I think it would be worth considering
> adding support for fragment traversal.  It's not needed as much
> in the skb program case - we can always pullup the skb - but in
> the tracing situation we probably wouldn't want to do something
> that invasive in tracing context.

Agree that in tracing context, we should avoid push/pull skb. It is
indeed invasive.

> 
> Fragment traversal might be worth breaking out as a separate patchset,
> perhaps triggered by a specific flag to bpf_skb_event_output?

This can be done for bpf_skb_event_output as the context is a sk_buff.
And you can just follow the frags to copy the whole thing without
bpf_probe_read().

> 
> Feedback from folks at Linux Plumbers (I hope I'm summarizing correctly)
> seemed to agree with what you mentioned WRT the first patch in this
> series.  The gist was we probably don't want to force the metadata to be a
> specific packet capture type; we'd rather use the existing perf event
> mechanisms and if we are indeed doing packet capture, simply specify that
> data in the program as metadata.

Agree, you can have whatever metadata you have for bpf_perf_event_output.

> 
> I'd be happy with that approach myself if I could capture skb
> fragments in tracing programs - being able to do that would give
> equivalent functionality to what I proposed but without having a packet
> capture-specific helper.

That won't work for tracing program. Full of bpf_probe_read()
in tracing version of packet copying is not nice either.

We may still need a different helper for tracing programs.

I think we need something like below:
   - vmlinux BTF at /sys/kernel/btf/kernel, is loaded into kernel.
     (/sys/kernel/btf/kernel is the source of truth)
   - For a tracing bpf program, if that function eventually
     copy  helper
         bpf_skb_event_output(..., skb, ...)
     the verifier needs to verify skb is indeed a valid skb
     by tracing back to one of parameters.

     Here, I use skb as an example, maybe it can be extended
     to other data structures as well.

With this approach, you can reuse some of functions from
tracing side to deal with frag copying and no bpf_probe_read()
is needed.

Here, I use skb as an example, maybe it can be extended
to other data structures as well if needed.

>>
>> If we indeed do not need to handle frags here, I think maybe
>> bpf_probe_read() in existing bpf kprobe function should be
>> enough, we do not need this helper?
>>
> 
> Certainly for many use cases, that will get you most of what you need -
> particularly if you're just looking at L2 to L4 data. For full packet
> capture however I think we may need to think about fragment traversal.
> 
>>> +
>>> +/* Derive protocol for some of the easier cases.  For tracing, a probe point
>>> + * may be dealing with packets in various states. Common cases are IP
>>> + * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
>>> + * (_PCAP_TYPE_ETH).  For other cases the caller must specify the
>>> + * protocol they expect.  Other heuristics for packet identification
>>> + * should be added here as needed, since determining the packet type
>>> + * ensures we do not capture packets that fail to match the desired
>>> + * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
>>> + */
>>> +static inline int bpf_skb_protocol_get(struct sk_buff *skb)
>>> +{
>>> +	switch (htons(skb->protocol)) {
>>> +	case ETH_P_IP:
>>> +	case ETH_P_IPV6:
>>> +		if (skb_network_header(skb) == skb->data)
>>> +			return BPF_PCAP_TYPE_IP;
>>> +		else
>>> +			return BPF_PCAP_TYPE_ETH;
>>> +	default:
>>> +		return BPF_PCAP_TYPE_UNSET;
>>> +	}
>>> +}
>>> +
>>> +BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
>>> +	   int, protocol_wanted, u64, flags)
>>
>> Up to now, for helpers, verifier has a way to verifier it is used
>> properly regarding to the context. For example, for xdp version
>> perf_event_output, the help prototype,
>>     BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct
>> bpf_map *, map,
>>              u64, flags, void *, meta, u64, meta_size)
>> the verifier is able to guarantee that the first parameter
>> has correct type xdp_buff, not something from type cast.
>>     .arg1_type      = ARG_PTR_TO_CTX,
>>
>> This helper, in the below we have
>>     .arg1_type	= ARG_ANYTHING,
>>
>> So it is not really enforced. Bringing BTF can help, but type
>> name matching typically bad.
>>
>>
> One thing we were discussing - and I think this is similar to what
> you're suggesting - is to investigate if there might be a way to
> leverage BTF to provide additional guarantees that the tracing
> data we are handling is indeed an skb.  Specifically if we
> trace a kprobe function argument or a tracepoint function, and
> if we had that guarantee, we could perhaps invoke the skb-style
> perf event output function (trace both the skb data and the metadata).
> The challenge would be how to do that type-based matching; we'd
> need the function argument information from BTF _and_ need to
> somehow associate it at probe attach time.
> 
> Thanks again for looking at the code!
> 
> Alan
>

Patch
diff mbox series

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a27e58e..13f86d3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2758,7 +2758,9 @@  struct bpf_stack_build_id {
  *              held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
  *		perf event has the same attributes as perf events generated
  *		by bpf_perf_event_output.  For skb and xdp programs, *data*
- *		is the relevant context.
+ *		is the relevant context, while for tracing programs,
+ *		*data* must be a pointer to a **struct sk_buff** derived
+ *		from kprobe or tracepoint arguments.
  *
  *		Metadata for this event is a **struct bpf_pcap_hdr**; this
  *		contains the capture length, actual packet length and
@@ -2771,6 +2773,14 @@  struct bpf_stack_build_id {
  *		to 48 bits; the id can be used to correlate captured packets
  *		with other trace data, since the passed-in flags value is stored
  *		stored in the **struct bpf_pcap_hdr** in the **flags** field.
+ *		Specifying **BPF_F_PCAP_ID_IIFINDEX** and a non-zero value in
+ *		the id portion of the flags limits capture events to skbs
+ *		with the specified incoming ifindex, allowing limiting of
+ *		tracing to the the associated interface.  Specifying
+ *		**BPF_F_PCAP_STRICT_TYPE** will cause *bpf_pcap* to return
+ *		-EPROTO and skip capture if a specific protocol is specified
+ *		and it does not match the current skb.  These additional flags
+ *		are only valid (and useful) for tracing programs.
  *
  *		The *protocol* value specifies the protocol type of the start
  *		of the packet so that packet capture can carry out
@@ -2780,7 +2790,12 @@  struct bpf_stack_build_id {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *		-ENOENT will be returned if the associated perf event
- *		map entry is empty, or the skb is zero-length.
+ *		map entry is empty, the skb is zero-length,  or the incoming
+ *		ifindex was specified and we failed to match.
+ *		-EPROTO will be returned if **BPF_PCAP_TYPE_UNSET** is specified
+ *		and no protocol can be determined, or if we specify a protocol
+ *		along with **BPF_F_PCAP_STRICT_TYPE** and the skb protocol does
+ *		not match.
  *		-EINVAL will be returned if the flags value is invalid.
  *
  */
@@ -2977,6 +2992,8 @@  enum bpf_func_id {
 
 /* BPF_FUNC_pcap flags */
 #define	BPF_F_PCAP_ID_MASK		0xffffffffffff
+#define BPF_F_PCAP_ID_IIFINDEX		(1ULL << 48)
+#define BPF_F_PCAP_STRICT_TYPE         (1ULL << 56)
 
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d..311883b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,8 @@ 
 #include <linux/kprobes.h>
 #include <linux/syscalls.h>
 #include <linux/error-injection.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
 
 #include <asm/tlb.h>
 
@@ -530,6 +532,216 @@  u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	return __bpf_perf_event_output(regs, map, flags, sd);
 }
 
+/* Essentially just skb_copy_bits() using probe_kernel_read() where needed. */
+static unsigned long bpf_trace_skb_copy(void *tobuf, const void *from,
+					unsigned long offset,
+					unsigned long len)
+{
+	const struct sk_buff *frag_iterp, *skb = from;
+	struct skb_shared_info *shinfop, shinfo;
+	struct sk_buff frag_iter;
+	unsigned long copy, start;
+	void *to = tobuf;
+	int i, ret;
+
+	start = skb_headlen(skb);
+
+	copy = start - offset;
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		ret = probe_kernel_read(to, skb->data, copy);
+		if (unlikely(ret < 0))
+			goto out;
+		len -= copy;
+		if (len == 0)
+			return 0;
+		offset += copy;
+		to += copy;
+	}
+
+	if (skb->data_len == 0)
+		goto out;
+
+	shinfop = skb_shinfo(skb);
+
+	ret = probe_kernel_read(&shinfo, shinfop, sizeof(shinfo));
+	if (unlikely(ret < 0))
+		goto out;
+
+	if (shinfo.nr_frags > MAX_SKB_FRAGS) {
+		ret = -EINVAL;
+		goto out;
+	}
+	for (i = 0; i < shinfo.nr_frags; i++) {
+		skb_frag_t *f = &shinfo.frags[i];
+		int end;
+
+		if (start > offset + len) {
+			ret = -E2BIG;
+			goto out;
+		}
+
+		end = start + skb_frag_size(f);
+		copy = end - offset;
+		if (copy > 0) {
+			u32 poff, p_len, copied;
+			struct page *p;
+			u8 *vaddr;
+
+			if (copy > len)
+				copy = len;
+
+			skb_frag_foreach_page(f,
+					      skb_frag_off(f) + offset - start,
+					      copy, p, poff, p_len, copied) {
+
+				vaddr = kmap_atomic(p);
+				ret = probe_kernel_read(to + copied,
+							vaddr + poff, p_len);
+				kunmap_atomic(vaddr);
+
+				if (unlikely(ret < 0))
+					goto out;
+			}
+			len -= copy;
+			if (len == 0)
+				return 0;
+			offset += copy;
+			to += copy;
+		}
+		start = end;
+	}
+
+	for (frag_iterp = shinfo.frag_list; frag_iterp;
+	     frag_iterp = frag_iter.next) {
+		int end;
+
+		if (start > offset + len) {
+			ret = -E2BIG;
+			goto out;
+		}
+		ret = probe_kernel_read(&frag_iter, frag_iterp,
+					sizeof(frag_iter));
+		if (ret)
+			goto out;
+
+		end = start + frag_iter.len;
+		copy = end - offset;
+		if (copy > 0) {
+			if (copy > len)
+				copy = len;
+			ret = bpf_trace_skb_copy(to, &frag_iter,
+						offset - start,
+						copy);
+			if (ret)
+				goto out;
+
+			len -= copy;
+			if (len == 0)
+				return 0;
+			offset += copy;
+			to += copy;
+		}
+		start = end;
+	}
+out:
+	if (ret)
+		memset(tobuf, 0, len);
+
+	return ret;
+}
+
+/* Derive protocol for some of the easier cases.  For tracing, a probe point
+ * may be dealing with packets in various states. Common cases are IP
+ * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
+ * (_PCAP_TYPE_ETH).  For other cases the caller must specify the
+ * protocol they expect.  Other heuristics for packet identification
+ * should be added here as needed, since determining the packet type
+ * ensures we do not capture packets that fail to match the desired
+ * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
+ */
+static inline int bpf_skb_protocol_get(struct sk_buff *skb)
+{
+	switch (htons(skb->protocol)) {
+	case ETH_P_IP:
+	case ETH_P_IPV6:
+		if (skb_network_header(skb) == skb->data)
+			return BPF_PCAP_TYPE_IP;
+		else
+			return BPF_PCAP_TYPE_ETH;
+	default:
+		return BPF_PCAP_TYPE_UNSET;
+	}
+}
+
+BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
+	   int, protocol_wanted, u64, flags)
+{
+	struct bpf_pcap_hdr pcap;
+	struct sk_buff skb;
+	int protocol;
+	int ret;
+
+	if (unlikely(flags & ~(BPF_F_PCAP_ID_IIFINDEX | BPF_F_PCAP_ID_MASK |
+			       BPF_F_PCAP_STRICT_TYPE)))
+		return -EINVAL;
+
+	ret = probe_kernel_read(&skb, data, sizeof(skb));
+	if (unlikely(ret < 0))
+		return ret;
+
+	/* Sanity check skb len in case we get bogus data. */
+	if (unlikely(!skb.len))
+		return -ENOENT;
+	if (unlikely(skb.len > GSO_MAX_SIZE || skb.data_len > skb.len))
+		return -E2BIG;
+
+	protocol = bpf_skb_protocol_get(&skb);
+
+	if (protocol_wanted == BPF_PCAP_TYPE_UNSET) {
+		/* If we cannot determine protocol type, bail. */
+		if (protocol == BPF_PCAP_TYPE_UNSET)
+			return -EPROTO;
+	} else {
+		/* if we determine protocol type, and it's not what we asked
+		 * for _and_ we are in strict mode, bail.  Otherwise we assume
+		 * the packet is the requested protocol type and drive on.
+		 */
+		if (flags & BPF_F_PCAP_STRICT_TYPE &&
+		    protocol != BPF_PCAP_TYPE_UNSET &&
+		    protocol != protocol_wanted)
+			return -EPROTO;
+		protocol = protocol_wanted;
+	}
+
+	/* If we specified a matching incoming ifindex, bail if not a match. */
+	if (flags & BPF_F_PCAP_ID_IIFINDEX) {
+		int iif = flags & BPF_F_PCAP_ID_MASK;
+
+		if (iif && skb.skb_iif != iif)
+			return -ENOENT;
+	}
+
+	ret = bpf_pcap_prepare(protocol, size, skb.len, flags, &pcap);
+	if (ret)
+		return ret;
+
+	return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+				&skb, pcap.cap_len, bpf_trace_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_trace_pcap_proto = {
+	.func		= bpf_trace_pcap,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_CONST_MAP_PTR,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_0(bpf_get_current_task)
 {
 	return (long) current;
@@ -709,6 +921,8 @@  static void do_bpf_send_signal(struct irq_work *entry)
 #endif
 	case BPF_FUNC_send_signal:
 		return &bpf_send_signal_proto;
+	case BPF_FUNC_pcap:
+		return &bpf_trace_pcap_proto;
 	default:
 		return NULL;
 	}