diff mbox series

[bpf-next,09/10] tools: bpftool: add simple perf event output reader

Message ID 20180504013717.29317-10-jakub.kicinski@netronome.com
State Accepted, archived
Delegated to: BPF Maintainers
Headers show
Series bpf: support offload of bpf_event_output() | expand

Commit Message

Jakub Kicinski May 4, 2018, 1:37 a.m. UTC
Users of BPF sooner or later discover perf_event_output() helpers
and BPF_MAP_TYPE_PERF_EVENT_ARRAY.  Dumping this array type is
not possible; however, we can add simple reading of perf events.
Create a new event_pipe subcommand for maps; this subcommand
will only work with BPF_MAP_TYPE_PERF_EVENT_ARRAY maps.

Parts of the code are taken from samples/bpf/trace_output_user.c.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 .../bpf/bpftool/Documentation/bpftool-map.rst |  29 +-
 tools/bpf/bpftool/Documentation/bpftool.rst   |   2 +-
 tools/bpf/bpftool/Makefile                    |   7 +-
 tools/bpf/bpftool/bash-completion/bpftool     |  36 +-
 tools/bpf/bpftool/common.c                    |  19 +
 tools/bpf/bpftool/main.h                      |   4 +
 tools/bpf/bpftool/map.c                       |  19 +-
 tools/bpf/bpftool/map_perf_ring.c             | 347 ++++++++++++++++++
 8 files changed, 444 insertions(+), 19 deletions(-)
 create mode 100644 tools/bpf/bpftool/map_perf_ring.c

Comments

Alexei Starovoitov May 4, 2018, 9:25 p.m. UTC | #1
On Thu, May 03, 2018 at 06:37:16PM -0700, Jakub Kicinski wrote:
> Users of BPF sooner or later discover perf_event_output() helpers
> and BPF_MAP_TYPE_PERF_EVENT_ARRAY.  Dumping this array type is
> not possible, however, we can add simple reading of perf events.
> Create a new event_pipe subcommand for maps, this sub command
> will only work with BPF_MAP_TYPE_PERF_EVENT_ARRAY maps.
> 
> Parts of the code from samples/bpf/trace_output_user.c.
> 
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
> ---
>  .../bpf/bpftool/Documentation/bpftool-map.rst |  29 +-
>  tools/bpf/bpftool/Documentation/bpftool.rst   |   2 +-
>  tools/bpf/bpftool/Makefile                    |   7 +-
>  tools/bpf/bpftool/bash-completion/bpftool     |  36 +-
>  tools/bpf/bpftool/common.c                    |  19 +
>  tools/bpf/bpftool/main.h                      |   4 +
>  tools/bpf/bpftool/map.c                       |  19 +-
>  tools/bpf/bpftool/map_perf_ring.c             | 347 ++++++++++++++++++
>  8 files changed, 444 insertions(+), 19 deletions(-)
>  create mode 100644 tools/bpf/bpftool/map_perf_ring.c
> 
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
> index c3eef8c972cd..a6258bc8ec4f 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
> @@ -22,12 +22,13 @@ MAP COMMANDS
>  =============
>  
>  |	**bpftool** **map { show | list }**   [*MAP*]
> -|	**bpftool** **map dump**    *MAP*
> -|	**bpftool** **map update**  *MAP*  **key** *DATA*   **value** *VALUE* [*UPDATE_FLAGS*]
> -|	**bpftool** **map lookup**  *MAP*  **key** *DATA*
> -|	**bpftool** **map getnext** *MAP* [**key** *DATA*]
> -|	**bpftool** **map delete**  *MAP*  **key** *DATA*
> -|	**bpftool** **map pin**     *MAP*  *FILE*
> +|	**bpftool** **map dump**       *MAP*
> +|	**bpftool** **map update**     *MAP*  **key** *DATA*   **value** *VALUE* [*UPDATE_FLAGS*]
> +|	**bpftool** **map lookup**     *MAP*  **key** *DATA*
> +|	**bpftool** **map getnext**    *MAP* [**key** *DATA*]
> +|	**bpftool** **map delete**     *MAP*  **key** *DATA*
> +|	**bpftool** **map pin**        *MAP*  *FILE*
> +|	**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
>  |	**bpftool** **map help**
>  |
>  |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
> @@ -76,6 +77,22 @@ DESCRIPTION
>  
>  		  Note: *FILE* must be located in *bpffs* mount.
>  
> +	**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
> +		  Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map.
> +
> +		  Install perf rings into a perf event array map and dump
> +		  output of any bpf_perf_event_output() call in the kernel.
> +		  By default read the number of CPUs on the system and
> +		  install perf ring for each CPU in the corresponding index
> +		  in the array.
> +
> +		  If **cpu** and **index** are specified, install perf ring
> +		  for given **cpu** at **index** in the array (single ring).
> +
> +		  Note that installing a perf ring into an array will silently
> +		  replace any existing ring.  Any other application will stop
> +		  receiving events if it installed its rings earlier.
> +
>  	**bpftool map help**
>  		  Print short help message.
>  
> diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
> index 20689a321ffe..564cb0d9692b 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool.rst
> @@ -23,7 +23,7 @@ SYNOPSIS
>  
>  	*MAP-COMMANDS* :=
>  	{ **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
> -	| **pin** | **help** }
> +	| **pin** | **event_pipe** | **help** }
>  
>  	*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin**
>  	| **load** | **help** }
> diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
> index 4e69782c4a79..892dbf095bff 100644
> --- a/tools/bpf/bpftool/Makefile
> +++ b/tools/bpf/bpftool/Makefile
> @@ -39,7 +39,12 @@ CC = gcc
>  
>  CFLAGS += -O2
>  CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers
> -CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/
> +CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \
> +	-I$(srctree)/kernel/bpf/ \
> +	-I$(srctree)/tools/include \
> +	-I$(srctree)/tools/include/uapi \
> +	-I$(srctree)/tools/lib/bpf \
> +	-I$(srctree)/tools/perf
>  CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"'
>  LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
>  
> diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
> index 852d84a98acd..b301c9b315f1 100644
> --- a/tools/bpf/bpftool/bash-completion/bpftool
> +++ b/tools/bpf/bpftool/bash-completion/bpftool
> @@ -1,6 +1,6 @@
>  # bpftool(8) bash completion                               -*- shell-script -*-
>  #
> -# Copyright (C) 2017 Netronome Systems, Inc.
> +# Copyright (C) 2017-2018 Netronome Systems, Inc.
>  #
>  # This software is dual licensed under the GNU General License
>  # Version 2, June 1991 as shown in the file COPYING in the top-level
> @@ -79,6 +79,14 @@ _bpftool_get_map_ids()
>          command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
>  }
>  
> +_bpftool_get_perf_map_ids()
> +{
> +    COMPREPLY+=( $( compgen -W "$( bpftool -jp map  2>&1 | \
> +        command grep -C2 perf_event_array | \
> +        command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
> +}
> +
> +
>  _bpftool_get_prog_ids()
>  {
>      COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \
> @@ -359,10 +367,34 @@ _bpftool()
>                      fi
>                      return 0
>                      ;;
> +                event_pipe)
> +                    case $prev in
> +                        $command)
> +                            COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
> +                            return 0
> +                            ;;
> +                        id)
> +                            _bpftool_get_perf_map_ids
> +                            return 0
> +                            ;;
> +                        cpu)
> +                            return 0
> +                            ;;
> +                        index)
> +                            return 0
> +                            ;;
> +                        *)
> +                            _bpftool_once_attr 'cpu'
> +                            _bpftool_once_attr 'index'
> +                            return 0
> +                            ;;
> +                    esac
> +                    ;;
>                  *)
>                      [[ $prev == $object ]] && \
>                          COMPREPLY=( $( compgen -W 'delete dump getnext help \
> -                            lookup pin show list update' -- "$cur" ) )
> +                            lookup pin event_pipe show list update' -- \
> +                            "$cur" ) )
>                      ;;
>              esac
>              ;;
> diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
> index 9c620770c6ed..32f9e397a6c0 100644
> --- a/tools/bpf/bpftool/common.c
> +++ b/tools/bpf/bpftool/common.c
> @@ -331,6 +331,16 @@ char *get_fdinfo(int fd, const char *key)
>  	return NULL;
>  }
>  
> +void print_data_json(uint8_t *data, size_t len)
> +{
> +	unsigned int i;
> +
> +	jsonw_start_array(json_wtr);
> +	for (i = 0; i < len; i++)
> +		jsonw_printf(json_wtr, "%d", data[i]);
> +	jsonw_end_array(json_wtr);
> +}
> +
>  void print_hex_data_json(uint8_t *data, size_t len)
>  {
>  	unsigned int i;
> @@ -421,6 +431,15 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab)
>  	}
>  }
>  
> +unsigned int get_page_size(void)
> +{
> +	static int result;
> +
> +	if (!result)
> +		result = getpagesize();
> +	return result;
> +}
> +
>  unsigned int get_possible_cpus(void)
>  {
>  	static unsigned int result;
> diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
> index cbf8985da362..6173cd997e7a 100644
> --- a/tools/bpf/bpftool/main.h
> +++ b/tools/bpf/bpftool/main.h
> @@ -117,14 +117,18 @@ int do_pin_fd(int fd, const char *name);
>  
>  int do_prog(int argc, char **arg);
>  int do_map(int argc, char **arg);
> +int do_event_pipe(int argc, char **argv);
>  int do_cgroup(int argc, char **arg);
>  
>  int prog_parse_fd(int *argc, char ***argv);
> +int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
>  
>  void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
>  		       const char *arch);
> +void print_data_json(uint8_t *data, size_t len);
>  void print_hex_data_json(uint8_t *data, size_t len);
>  
> +unsigned int get_page_size(void);
>  unsigned int get_possible_cpus(void);
>  const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino);
>  
> diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
> index 5efefde5f578..af6766e956ba 100644
> --- a/tools/bpf/bpftool/map.c
> +++ b/tools/bpf/bpftool/map.c
> @@ -130,8 +130,7 @@ static int map_parse_fd(int *argc, char ***argv)
>  	return -1;
>  }
>  
> -static int
> -map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
> +int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
>  {
>  	int err;
>  	int fd;
> @@ -817,12 +816,13 @@ static int do_help(int argc, char **argv)
>  
>  	fprintf(stderr,
>  		"Usage: %s %s { show | list }   [MAP]\n"
> -		"       %s %s dump    MAP\n"
> -		"       %s %s update  MAP  key DATA value VALUE [UPDATE_FLAGS]\n"
> -		"       %s %s lookup  MAP  key DATA\n"
> -		"       %s %s getnext MAP [key DATA]\n"
> -		"       %s %s delete  MAP  key DATA\n"
> -		"       %s %s pin     MAP  FILE\n"
> +		"       %s %s dump       MAP\n"
> +		"       %s %s update     MAP  key DATA value VALUE [UPDATE_FLAGS]\n"
> +		"       %s %s lookup     MAP  key DATA\n"
> +		"       %s %s getnext    MAP [key DATA]\n"
> +		"       %s %s delete     MAP  key DATA\n"
> +		"       %s %s pin        MAP  FILE\n"
> +		"       %s %s event_pipe MAP [cpu N index M]\n"
>  		"       %s %s help\n"
>  		"\n"
>  		"       MAP := { id MAP_ID | pinned FILE }\n"
> @@ -834,7 +834,7 @@ static int do_help(int argc, char **argv)
>  		"",
>  		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
>  		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
> -		bin_name, argv[-2], bin_name, argv[-2]);
> +		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
>  
>  	return 0;
>  }
> @@ -849,6 +849,7 @@ static const struct cmd cmds[] = {
>  	{ "getnext",	do_getnext },
>  	{ "delete",	do_delete },
>  	{ "pin",	do_pin },
> +	{ "event_pipe",	do_event_pipe },
>  	{ 0 }
>  };
>  
> diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
> new file mode 100644
> index 000000000000..c5a2ced8552d
> --- /dev/null
> +++ b/tools/bpf/bpftool/map_perf_ring.c
> @@ -0,0 +1,347 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2018 Netronome Systems, Inc. */
> +/* This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + */
> +#include <errno.h>
> +#include <fcntl.h>
> +#include <libbpf.h>
> +#include <poll.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <time.h>
> +#include <unistd.h>
> +#include <linux/bpf.h>
> +#include <linux/perf_event.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/syscall.h>
> +
> +#include <bpf.h>
> +#include <perf-sys.h>
> +
> +#include "main.h"
> +
> +#define MMAP_PAGE_CNT	16
> +
> +static bool stop;
> +
> +struct event_ring_info {
> +	int fd;
> +	int key;
> +	unsigned int cpu;
> +	void *mem;
> +};
> +
> +struct perf_event_sample {
> +	struct perf_event_header header;
> +	__u32 size;
> +	unsigned char data[];
> +};
> +
> +static void int_exit(int signo)
> +{
> +	fprintf(stderr, "Stopping...\n");
> +	stop = true;
> +}
> +
> +static void
> +print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
> +{
> +	struct {
> +		struct perf_event_header header;
> +		__u64 id;
> +		__u64 lost;
> +	} *lost = (void *)e;
> +	struct timespec ts;
> +
> +	if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
> +		perror("Can't read clock for timestamp");
> +		return;
> +	}
> +
> +	if (json_output) {
> +		jsonw_start_object(json_wtr);
> +		jsonw_name(json_wtr, "timestamp");
> +		jsonw_uint(json_wtr, ts.tv_sec * 1000000000ull + ts.tv_nsec);
> +		jsonw_name(json_wtr, "type");
> +		jsonw_uint(json_wtr, e->header.type);
> +		jsonw_name(json_wtr, "cpu");
> +		jsonw_uint(json_wtr, ring->cpu);
> +		jsonw_name(json_wtr, "index");
> +		jsonw_uint(json_wtr, ring->key);
> +		if (e->header.type == PERF_RECORD_SAMPLE) {
> +			jsonw_name(json_wtr, "data");
> +			print_data_json(e->data, e->size);
> +		} else if (e->header.type == PERF_RECORD_LOST) {
> +			jsonw_name(json_wtr, "lost");
> +			jsonw_start_object(json_wtr);
> +			jsonw_name(json_wtr, "id");
> +			jsonw_uint(json_wtr, lost->id);
> +			jsonw_name(json_wtr, "count");
> +			jsonw_uint(json_wtr, lost->lost);
> +			jsonw_end_object(json_wtr);
> +		}
> +		jsonw_end_object(json_wtr);
> +	} else {
> +		if (e->header.type == PERF_RECORD_SAMPLE) {
> +			printf("== @%ld.%ld CPU: %d index: %d =====\n",
> +			       (long)ts.tv_sec, ts.tv_nsec,
> +			       ring->cpu, ring->key);
> +			fprint_hex(stdout, e->data, e->size, " ");
> +			printf("\n");
> +		} else if (e->header.type == PERF_RECORD_LOST) {
> +			printf("lost %lld events\n", lost->lost);
> +		} else {
> +			printf("unknown event type=%d size=%d\n",
> +			       e->header.type, e->header.size);
> +		}
> +	}
> +}
> +
> +static void
> +perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len)
> +{
> +	volatile struct perf_event_mmap_page *header = ring->mem;
> +	__u64 buffer_size = MMAP_PAGE_CNT * get_page_size();
> +	__u64 data_tail = header->data_tail;
> +	__u64 data_head = header->data_head;
> +	void *base, *begin, *end;
> +
> +	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
> +	if (data_head == data_tail)
> +		return;

This function has been copied several times into different places.
I think it's time to put it into a common lib, like libbpf.
It would be great if you could do that in a follow-up.

for the set:
Acked-by: Alexei Starovoitov <ast@kernel.org>
Daniel Borkmann May 4, 2018, 9:53 p.m. UTC | #2
On 05/04/2018 03:37 AM, Jakub Kicinski wrote:
> Users of BPF sooner or later discover perf_event_output() helpers
> and BPF_MAP_TYPE_PERF_EVENT_ARRAY.  Dumping this array type is
> not possible, however, we can add simple reading of perf events.
> Create a new event_pipe subcommand for maps, this sub command
> will only work with BPF_MAP_TYPE_PERF_EVENT_ARRAY maps.
> 
> Parts of the code from samples/bpf/trace_output_user.c.
> 
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
[...]

One remark below:

[...]
> +static void
> +print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
> +{
> +	struct {
> +		struct perf_event_header header;
> +		__u64 id;
> +		__u64 lost;
> +	} *lost = (void *)e;
> +	struct timespec ts;
> +
> +	if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
> +		perror("Can't read clock for timestamp");
> +		return;
> +	}
Instead of the timestamp above, probably better to pick it up via
PERF_SAMPLE_TIME which needs to be added to sample_type so it also
ends up in the RB. Given below you poll with 200 and you don't set
a wakeup event for perf RB (it's probably fine not to here, but it
can be done based on watermark or events), the clock_gettime() will
be off compared to when it was actually put into the RB.

> +	if (json_output) {
> +		jsonw_start_object(json_wtr);
> +		jsonw_name(json_wtr, "timestamp");
> +		jsonw_uint(json_wtr, ts.tv_sec * 1000000000ull + ts.tv_nsec);
> +		jsonw_name(json_wtr, "type");
> +		jsonw_uint(json_wtr, e->header.type);
> +		jsonw_name(json_wtr, "cpu");
> +		jsonw_uint(json_wtr, ring->cpu);
> +		jsonw_name(json_wtr, "index");
> +		jsonw_uint(json_wtr, ring->key);
> +		if (e->header.type == PERF_RECORD_SAMPLE) {
> +			jsonw_name(json_wtr, "data");
> +			print_data_json(e->data, e->size);
> +		} else if (e->header.type == PERF_RECORD_LOST) {
> +			jsonw_name(json_wtr, "lost");
> +			jsonw_start_object(json_wtr);
> +			jsonw_name(json_wtr, "id");
> +			jsonw_uint(json_wtr, lost->id);
> +			jsonw_name(json_wtr, "count");
> +			jsonw_uint(json_wtr, lost->lost);
> +			jsonw_end_object(json_wtr);
> +		}
> +		jsonw_end_object(json_wtr);
> +	} else {
> +		if (e->header.type == PERF_RECORD_SAMPLE) {
> +			printf("== @%ld.%ld CPU: %d index: %d =====\n",
> +			       (long)ts.tv_sec, ts.tv_nsec,
> +			       ring->cpu, ring->key);
> +			fprint_hex(stdout, e->data, e->size, " ");
> +			printf("\n");
> +		} else if (e->header.type == PERF_RECORD_LOST) {
> +			printf("lost %lld events\n", lost->lost);
> +		} else {
> +			printf("unknown event type=%d size=%d\n",
> +			       e->header.type, e->header.size);
> +		}
Jakub Kicinski May 4, 2018, 10:28 p.m. UTC | #3
CC perf folks

On Fri, 4 May 2018 14:25:03 -0700, Alexei Starovoitov wrote:
> > +static void
> > +perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len)
> > +{
> > +	volatile struct perf_event_mmap_page *header = ring->mem;
> > +	__u64 buffer_size = MMAP_PAGE_CNT * get_page_size();
> > +	__u64 data_tail = header->data_tail;
> > +	__u64 data_head = header->data_head;
> > +	void *base, *begin, *end;
> > +
> > +	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
> > +	if (data_head == data_tail)
> > +		return;  
> 
> this function was copied several times into different places.
> I think it's time to put into common lib. Like libbpf.

Agreed, I think libbpf would work, although there is nothing BPF
specific in this loop AFAICT now.

> Would be great if you can do it in the follow up.

Looking into it now, I found these:

$ git grep 'data_head == data_tail'
tools/bpf/bpftool/map_perf_ring.c:      if (data_head == data_tail)
tools/testing/selftests/bpf/trace_helpers.c:    if (data_head == data_tail)

Are there any other copies I should try to cater to?  I have changed a few
things compared to the selftest, and I guess others may have modified their
copies too.  I'm just trying to make sure that what we put in libbpf caters
to most possible use cases.

Should I also move bpf_perf_event_open()/test_bpf_perf_event() to libbpf?

> for the set:
> Acked-by: Alexei Starovoitov <ast@kernel.org>

Thanks!
diff mbox series

Patch

diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index c3eef8c972cd..a6258bc8ec4f 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -22,12 +22,13 @@  MAP COMMANDS
 =============
 
 |	**bpftool** **map { show | list }**   [*MAP*]
-|	**bpftool** **map dump**    *MAP*
-|	**bpftool** **map update**  *MAP*  **key** *DATA*   **value** *VALUE* [*UPDATE_FLAGS*]
-|	**bpftool** **map lookup**  *MAP*  **key** *DATA*
-|	**bpftool** **map getnext** *MAP* [**key** *DATA*]
-|	**bpftool** **map delete**  *MAP*  **key** *DATA*
-|	**bpftool** **map pin**     *MAP*  *FILE*
+|	**bpftool** **map dump**       *MAP*
+|	**bpftool** **map update**     *MAP*  **key** *DATA*   **value** *VALUE* [*UPDATE_FLAGS*]
+|	**bpftool** **map lookup**     *MAP*  **key** *DATA*
+|	**bpftool** **map getnext**    *MAP* [**key** *DATA*]
+|	**bpftool** **map delete**     *MAP*  **key** *DATA*
+|	**bpftool** **map pin**        *MAP*  *FILE*
+|	**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
 |	**bpftool** **map help**
 |
 |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
@@ -76,6 +77,22 @@  DESCRIPTION
 
 		  Note: *FILE* must be located in *bpffs* mount.
 
+	**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
+		  Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map.
+
+		  Install perf rings into a perf event array map and dump
+		  output of any bpf_perf_event_output() call in the kernel.
+		  By default read the number of CPUs on the system and
+		  install perf ring for each CPU in the corresponding index
+		  in the array.
+
+		  If **cpu** and **index** are specified, install perf ring
+		  for given **cpu** at **index** in the array (single ring).
+
+		  Note that installing a perf ring into an array will silently
+		  replace any existing ring.  Any other application will stop
+		  receiving events if it installed its rings earlier.
+
 	**bpftool map help**
 		  Print short help message.
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 20689a321ffe..564cb0d9692b 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -23,7 +23,7 @@  SYNOPSIS
 
 	*MAP-COMMANDS* :=
 	{ **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
-	| **pin** | **help** }
+	| **pin** | **event_pipe** | **help** }
 
 	*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin**
 	| **load** | **help** }
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 4e69782c4a79..892dbf095bff 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -39,7 +39,12 @@  CC = gcc
 
 CFLAGS += -O2
 CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers
-CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/
+CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \
+	-I$(srctree)/kernel/bpf/ \
+	-I$(srctree)/tools/include \
+	-I$(srctree)/tools/include/uapi \
+	-I$(srctree)/tools/lib/bpf \
+	-I$(srctree)/tools/perf
 CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"'
 LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
 
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 852d84a98acd..b301c9b315f1 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -1,6 +1,6 @@ 
 # bpftool(8) bash completion                               -*- shell-script -*-
 #
-# Copyright (C) 2017 Netronome Systems, Inc.
+# Copyright (C) 2017-2018 Netronome Systems, Inc.
 #
 # This software is dual licensed under the GNU General License
 # Version 2, June 1991 as shown in the file COPYING in the top-level
@@ -79,6 +79,14 @@  _bpftool_get_map_ids()
         command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
 }
 
+_bpftool_get_perf_map_ids()
+{
+    COMPREPLY+=( $( compgen -W "$( bpftool -jp map  2>&1 | \
+        command grep -C2 perf_event_array | \
+        command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
+}
+
+
 _bpftool_get_prog_ids()
 {
     COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \
@@ -359,10 +367,34 @@  _bpftool()
                     fi
                     return 0
                     ;;
+                event_pipe)
+                    case $prev in
+                        $command)
+                            COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
+                            return 0
+                            ;;
+                        id)
+                            _bpftool_get_perf_map_ids
+                            return 0
+                            ;;
+                        cpu)
+                            return 0
+                            ;;
+                        index)
+                            return 0
+                            ;;
+                        *)
+                            _bpftool_once_attr 'cpu'
+                            _bpftool_once_attr 'index'
+                            return 0
+                            ;;
+                    esac
+                    ;;
                 *)
                     [[ $prev == $object ]] && \
                         COMPREPLY=( $( compgen -W 'delete dump getnext help \
-                            lookup pin show list update' -- "$cur" ) )
+                            lookup pin event_pipe show list update' -- \
+                            "$cur" ) )
                     ;;
             esac
             ;;
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 9c620770c6ed..32f9e397a6c0 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -331,6 +331,16 @@  char *get_fdinfo(int fd, const char *key)
 	return NULL;
 }
 
+void print_data_json(uint8_t *data, size_t len)
+{
+	unsigned int i;
+
+	jsonw_start_array(json_wtr);
+	for (i = 0; i < len; i++)
+		jsonw_printf(json_wtr, "%d", data[i]);
+	jsonw_end_array(json_wtr);
+}
+
 void print_hex_data_json(uint8_t *data, size_t len)
 {
 	unsigned int i;
@@ -421,6 +431,15 @@  void delete_pinned_obj_table(struct pinned_obj_table *tab)
 	}
 }
 
+unsigned int get_page_size(void)
+{
+	static int result;
+
+	if (!result)
+		result = getpagesize();
+	return result;
+}
+
 unsigned int get_possible_cpus(void)
 {
 	static unsigned int result;
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index cbf8985da362..6173cd997e7a 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -117,14 +117,18 @@  int do_pin_fd(int fd, const char *name);
 
 int do_prog(int argc, char **arg);
 int do_map(int argc, char **arg);
+int do_event_pipe(int argc, char **argv);
 int do_cgroup(int argc, char **arg);
 
 int prog_parse_fd(int *argc, char ***argv);
+int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
 
 void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
 		       const char *arch);
+void print_data_json(uint8_t *data, size_t len);
 void print_hex_data_json(uint8_t *data, size_t len);
 
+unsigned int get_page_size(void);
 unsigned int get_possible_cpus(void);
 const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino);
 
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 5efefde5f578..af6766e956ba 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -130,8 +130,7 @@  static int map_parse_fd(int *argc, char ***argv)
 	return -1;
 }
 
-static int
-map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
+int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
 {
 	int err;
 	int fd;
@@ -817,12 +816,13 @@  static int do_help(int argc, char **argv)
 
 	fprintf(stderr,
 		"Usage: %s %s { show | list }   [MAP]\n"
-		"       %s %s dump    MAP\n"
-		"       %s %s update  MAP  key DATA value VALUE [UPDATE_FLAGS]\n"
-		"       %s %s lookup  MAP  key DATA\n"
-		"       %s %s getnext MAP [key DATA]\n"
-		"       %s %s delete  MAP  key DATA\n"
-		"       %s %s pin     MAP  FILE\n"
+		"       %s %s dump       MAP\n"
+		"       %s %s update     MAP  key DATA value VALUE [UPDATE_FLAGS]\n"
+		"       %s %s lookup     MAP  key DATA\n"
+		"       %s %s getnext    MAP [key DATA]\n"
+		"       %s %s delete     MAP  key DATA\n"
+		"       %s %s pin        MAP  FILE\n"
+		"       %s %s event_pipe MAP [cpu N index M]\n"
 		"       %s %s help\n"
 		"\n"
 		"       MAP := { id MAP_ID | pinned FILE }\n"
@@ -834,7 +834,7 @@  static int do_help(int argc, char **argv)
 		"",
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
-		bin_name, argv[-2], bin_name, argv[-2]);
+		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
 
 	return 0;
 }
@@ -849,6 +849,7 @@  static const struct cmd cmds[] = {
 	{ "getnext",	do_getnext },
 	{ "delete",	do_delete },
 	{ "pin",	do_pin },
+	{ "event_pipe",	do_event_pipe },
 	{ 0 }
 };
 
diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
new file mode 100644
index 000000000000..c5a2ced8552d
--- /dev/null
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -0,0 +1,347 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2018 Netronome Systems, Inc. */
+/* This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <libbpf.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+
+#include <bpf.h>
+#include <perf-sys.h>
+
+#include "main.h"
+
+#define MMAP_PAGE_CNT	16
+
+/* Set to request termination of the event loop.  It is written from the
+ * signal handler, so it uses volatile sig_atomic_t, the object type the C
+ * standard allows an asynchronous handler to store to.
+ */
+static volatile sig_atomic_t stop;
+
+/* State kept for each perf ring being read. */
+struct event_ring_info {
+	int fd;			/* perf event file descriptor */
+	int key;		/* index of the event in the perf event array map */
+	unsigned int cpu;	/* CPU the event was opened on */
+	void *mem;		/* start of the mmap()ed ring (control page first) */
+};
+
+/* View of a record in the ring when it is read as a raw sample:
+ * the common header, then the payload length, then the payload itself.
+ */
+struct perf_event_sample {
+	struct perf_event_header header;
+	__u32 size;		/* number of bytes in data[] */
+	unsigned char data[];
+};
+
+/* Signal handler: announce the shutdown and ask the event loop to stop.
+ *
+ * Only async-signal-safe operations may be used here, so the message is
+ * emitted with write(2) rather than stdio, and errno is restored for the
+ * code that was interrupted.
+ */
+static void int_exit(int signo)
+{
+	static const char msg[] = "Stopping...\n";
+	int saved_errno = errno;
+	ssize_t ret;
+
+	/* Best effort: nothing useful can be done if stderr is unwritable. */
+	ret = write(STDERR_FILENO, msg, sizeof(msg) - 1);
+	(void)ret;
+
+	stop = true;
+	errno = saved_errno;
+}
+
+/* Print one record taken from a perf ring.
+ * @ring: ring the record came from; supplies the CPU and map index printed
+ * @e:    record to print; viewed as a lost-events record when its header
+ *        type is PERF_RECORD_LOST
+ *
+ * The record is stamped with the CLOCK_MONOTONIC time at which it is
+ * printed.  If the clock cannot be read, the error is reported and nothing
+ * is printed for the record.
+ */
+static void
+print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
+{
+	/* Layout used to read the fields of a PERF_RECORD_LOST record. */
+	struct {
+		struct perf_event_header header;
+		__u64 id;
+		__u64 lost;
+	} *lost = (void *)e;
+	struct timespec ts;
+
+	if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
+		perror("Can't read clock for timestamp");
+		return;
+	}
+
+	if (json_output) {
+		jsonw_start_object(json_wtr);
+		jsonw_name(json_wtr, "timestamp");
+		jsonw_uint(json_wtr, ts.tv_sec * 1000000000ull + ts.tv_nsec);
+		jsonw_name(json_wtr, "type");
+		jsonw_uint(json_wtr, e->header.type);
+		jsonw_name(json_wtr, "cpu");
+		jsonw_uint(json_wtr, ring->cpu);
+		jsonw_name(json_wtr, "index");
+		jsonw_uint(json_wtr, ring->key);
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			jsonw_name(json_wtr, "data");
+			print_data_json(e->data, e->size);
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			jsonw_name(json_wtr, "lost");
+			jsonw_start_object(json_wtr);
+			jsonw_name(json_wtr, "id");
+			jsonw_uint(json_wtr, lost->id);
+			jsonw_name(json_wtr, "count");
+			jsonw_uint(json_wtr, lost->lost);
+			jsonw_end_object(json_wtr);
+		}
+		jsonw_end_object(json_wtr);
+	} else {
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			/* Zero-pad the nanoseconds so that the fraction is
+			 * unambiguous (1.000000005 must not print as 1.5).
+			 */
+			printf("== @%ld.%09ld CPU: %u index: %d =====\n",
+			       (long)ts.tv_sec, (long)ts.tv_nsec,
+			       ring->cpu, ring->key);
+			fprint_hex(stdout, e->data, e->size, " ");
+			printf("\n");
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			/* The counter is unsigned; print it as such. */
+			printf("lost %llu events\n",
+			       (unsigned long long)lost->lost);
+		} else {
+			printf("unknown event type=%u size=%u\n",
+			       (unsigned int)e->header.type,
+			       (unsigned int)e->header.size);
+		}
+	}
+}
+
+/* Print every record currently available in one perf ring and mark them
+ * as consumed.
+ * @ring:    ring to drain; ring->mem points at the control page
+ * @buf:     scratch buffer used to linearise a record that wraps around
+ *           the end of the data area; (re)allocated here when too small
+ * @buf_len: size of *buf, updated when the buffer is reallocated
+ *
+ * If the scratch buffer cannot be allocated, the global stop flag is set
+ * and the function returns without advancing data_tail.
+ */
+static void
+perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len)
+{
+	volatile struct perf_event_mmap_page *header = ring->mem;
+	__u64 buffer_size = MMAP_PAGE_CNT * get_page_size();
+	__u64 data_tail = header->data_tail;
+	__u64 data_head = header->data_head;
+	void *base, *begin, *end;
+
+	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+	if (data_head == data_tail)
+		return;
+
+	/* The data area starts one page after the start of the mapping. */
+	base = ((char *)header) + get_page_size();
+
+	begin = base + data_tail % buffer_size;
+	end = base + data_head % buffer_size;
+
+	while (begin != end) {
+		struct perf_event_sample *e;
+
+		e = begin;
+		if (begin + e->header.size > base + buffer_size) {
+			/* Record wraps: copy both pieces into *buf so that
+			 * it can be printed as one contiguous object.
+			 */
+			long len = base + buffer_size - begin;
+
+			if (*buf_len < e->header.size) {
+				free(*buf);
+				*buf = malloc(e->header.size);
+				if (!*buf) {
+					fprintf(stderr,
+						"can't allocate memory\n");
+					stop = true;
+					return;
+				}
+				*buf_len = e->header.size;
+			}
+
+			memcpy(*buf, begin, len);
+			memcpy(*buf + len, base, e->header.size - len);
+			e = (void *)*buf;
+			begin = base + e->header.size - len;
+		} else if (begin + e->header.size == base + buffer_size) {
+			/* Record ends exactly at the end of the data area. */
+			begin = base;
+		} else {
+			begin += e->header.size;
+		}
+
+		print_bpf_output(ring, e);
+	}
+
+	/* Publish the new tail only after all records have been read. */
+	__sync_synchronize(); /* smp_mb() */
+	header->data_tail = data_head;
+}
+
+/* Size in bytes of one ring mapping: MMAP_PAGE_CNT data pages plus one
+ * extra page.
+ */
+static int perf_mmap_size(void)
+{
+	return (MMAP_PAGE_CNT + 1) * get_page_size();
+}
+
+/* Map the ring of perf event @fd for reading and writing.
+ *
+ * Returns the start of the mapping, or NULL (after reporting the error)
+ * if mmap() fails.
+ */
+static void *perf_event_mmap(int fd)
+{
+	int mmap_size = perf_mmap_size();
+	void *base;
+
+	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		/* No trailing newline, like the other p_err() calls here. */
+		p_err("event mmap failed: %s", strerror(errno));
+		return NULL;
+	}
+
+	return base;
+}
+
+/* Unmap a ring of perf_mmap_size() bytes starting at @mem.  A failure is
+ * only reported on stderr.
+ */
+static void perf_event_unmap(void *mem)
+{
+	int err = munmap(mem, perf_mmap_size());
+
+	if (err)
+		fprintf(stderr, "Can't unmap ring memory!\n");
+}
+
+/* Open a BPF-output software perf event for @cpu, store its descriptor in
+ * the map @map_fd under @key, and enable the event.
+ *
+ * Returns the perf event descriptor on success.  On failure the error is
+ * reported, any descriptor opened here is closed, and -1 is returned.
+ */
+static int bpf_perf_event_open(int map_fd, int key, int cpu)
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+		.sample_type = PERF_SAMPLE_RAW,
+	};
+	int event_fd = sys_perf_event_open(&attr, -1, cpu, -1, 0);
+
+	if (event_fd < 0) {
+		p_err("failed to open perf event %d for CPU %d", key, cpu);
+		return -1;
+	}
+
+	if (bpf_map_update_elem(map_fd, &key, &event_fd, BPF_ANY)) {
+		p_err("failed to update map for event %d for CPU %d", key, cpu);
+		close(event_fd);
+		return -1;
+	}
+
+	if (ioctl(event_fd, PERF_EVENT_IOC_ENABLE, 0)) {
+		p_err("failed to enable event %d for CPU %d", key, cpu);
+		close(event_fd);
+		return -1;
+	}
+
+	return event_fd;
+}
+
+/* Handler for the "event_pipe" map subcommand: open perf events, install
+ * them in a BPF_MAP_TYPE_PERF_EVENT_ARRAY map and print the records they
+ * deliver until SIGINT, SIGHUP or SIGTERM is received.
+ * @argc/@argv: MAP specification, optionally followed by "cpu N" and
+ *              "index M" (both or neither)
+ *
+ * Without cpu/index, one ring is opened for each of the first
+ * min(possible CPUs, map max_entries) CPUs, with ring i using CPU i and
+ * map index i.  With cpu/index, a single ring is opened.
+ *
+ * Returns 0 after a clean stop, -1 on error.
+ */
+int do_event_pipe(int argc, char **argv)
+{
+	int i, nfds, map_fd, index = -1, cpu = -1;
+	struct bpf_map_info map_info = {};
+	struct event_ring_info *rings;
+	size_t tmp_buf_sz = 0;
+	void *tmp_buf = NULL;
+	struct pollfd *pfds;
+	__u32 map_info_len;
+	bool do_all = true;
+
+	map_info_len = sizeof(map_info);
+	map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len);
+	if (map_fd < 0)
+		return -1;
+
+	if (map_info.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+		p_err("map is not a perf event array");
+		goto err_close_map;
+	}
+
+	while (argc) {
+		if (argc < 2)
+			BAD_ARG();
+
+		if (is_prefix(*argv, "cpu")) {
+			char *endptr;
+
+			NEXT_ARG();
+			cpu = strtoul(*argv, &endptr, 0);
+			if (*endptr) {
+				/* %s needs the string (*argv), not its
+				 * first character (**argv).
+				 */
+				p_err("can't parse %s as CPU ID", *argv);
+				goto err_close_map;
+			}
+
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "index")) {
+			char *endptr;
+
+			NEXT_ARG();
+			index = strtoul(*argv, &endptr, 0);
+			if (*endptr) {
+				p_err("can't parse %s as index", *argv);
+				goto err_close_map;
+			}
+
+			NEXT_ARG();
+		} else {
+			BAD_ARG();
+		}
+
+		do_all = false;
+	}
+
+	if (!do_all) {
+		if (index == -1 || cpu == -1) {
+			p_err("cpu and index must be specified together");
+			goto err_close_map;
+		}
+
+		nfds = 1;
+	} else {
+		nfds = min(get_possible_cpus(), map_info.max_entries);
+		cpu = 0;
+		index = 0;
+	}
+
+	rings = calloc(nfds, sizeof(rings[0]));
+	if (!rings)
+		goto err_close_map;
+
+	pfds = calloc(nfds, sizeof(pfds[0]));
+	if (!pfds)
+		goto err_free_rings;
+
+	for (i = 0; i < nfds; i++) {
+		rings[i].cpu = cpu + i;
+		rings[i].key = index + i;
+
+		rings[i].fd = bpf_perf_event_open(map_fd, rings[i].key,
+						  rings[i].cpu);
+		if (rings[i].fd < 0)
+			goto err_close_fds_prev;
+
+		rings[i].mem = perf_event_mmap(rings[i].fd);
+		if (!rings[i].mem)
+			goto err_close_fds_current;
+
+		pfds[i].fd = rings[i].fd;
+		pfds[i].events = POLLIN;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGHUP, int_exit);
+	signal(SIGTERM, int_exit);
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+
+	/* poll() is used only to pace the loop (200 ms at most); every ring
+	 * is checked on each iteration whatever poll() reports.
+	 */
+	while (!stop) {
+		poll(pfds, nfds, 200);
+		for (i = 0; i < nfds; i++)
+			perf_event_read(&rings[i], &tmp_buf, &tmp_buf_sz);
+	}
+	free(tmp_buf);
+
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	for (i = 0; i < nfds; i++) {
+		perf_event_unmap(rings[i].mem);
+		close(rings[i].fd);
+	}
+	free(pfds);
+	free(rings);
+	close(map_fd);
+
+	return 0;
+
+	/* Unwind rings [0, i).  When mmap fails for ring i, its fd is open
+	 * but nothing is mapped, so that path enters the loop body after
+	 * the unmap.
+	 */
+err_close_fds_prev:
+	while (i--) {
+		perf_event_unmap(rings[i].mem);
+err_close_fds_current:
+		close(rings[i].fd);
+	}
+	free(pfds);
+err_free_rings:
+	free(rings);
+err_close_map:
+	close(map_fd);
+	return -1;
+}