diff mbox series

[v2,bpf-next,1/2] bpftool: introduce "prog profile" command

Message ID 20200228234058.634044-2-songliubraving@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpftool: introduce prog profile | expand

Commit Message

Song Liu Feb. 28, 2020, 11:40 p.m. UTC
With fentry/fexit programs, it is possible to profile BPF program with
hardware counters. Introduce bpftool "prog profile", which measures key
metrics of a BPF program.

bpftool prog profile command creates per-cpu perf events. Then it attaches
fentry/fexit programs to the target BPF program. The fentry program saves
perf event value to a map. The fexit program reads the perf event again,
and calculates the difference, which is the instructions/cycles used by
the target program.

Example input and output:

  ./bpftool prog profile 3 id 337 cycles instructions llc_misses

        4228 run_cnt
     3403698 cycles                                              (84.08%)
     3525294 instructions   #  1.04 insn per cycle               (84.05%)
          13 llc_misses     #  3.69 LLC misses per million isns  (83.50%)

This command measures cycles and instructions for BPF program with id
337 for 3 seconds. The program has triggered 4228 times. The rest of the
output is similar to perf-stat. In this example, the counters were only
counting ~84% of the time because of time multiplexing of perf counters.

Note that, this approach measures cycles and instructions in very small
increments. So the fentry/fexit programs introduce noticeable errors to
the measurement results.

The fentry/fexit programs are generated with BPF skeletons. Therefore, we
build bpftool twice. The first time _bpftool is built without skeletons.
Then, _bpftool is used to generate the skeletons. The second time, bpftool
is built with skeletons.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 tools/bpf/bpftool/Makefile                |  18 +
 tools/bpf/bpftool/prog.c                  | 428 +++++++++++++++++++++-
 tools/bpf/bpftool/skeleton/profiler.bpf.c | 171 +++++++++
 tools/bpf/bpftool/skeleton/profiler.h     |  47 +++
 tools/scripts/Makefile.include            |   1 +
 5 files changed, 664 insertions(+), 1 deletion(-)
 create mode 100644 tools/bpf/bpftool/skeleton/profiler.bpf.c
 create mode 100644 tools/bpf/bpftool/skeleton/profiler.h

Comments

Yonghong Song March 1, 2020, 3:52 a.m. UTC | #1
On 2/28/20 3:40 PM, Song Liu wrote:
> With fentry/fexit programs, it is possible to profile BPF program with
> hardware counters. Introduce bpftool "prog profile", which measures key
> metrics of a BPF program.
> 
> bpftool prog profile command creates per-cpu perf events. Then it attaches
> fentry/fexit programs to the target BPF program. The fentry program saves
> perf event value to a map. The fexit program reads the perf event again,
> and calculates the difference, which is the instructions/cycles used by
> the target program.
> 
> Example input and output:
> 
>    ./bpftool prog profile 3 id 337 cycles instructions llc_misses
> 
>          4228 run_cnt
>       3403698 cycles                                              (84.08%)
>       3525294 instructions   #  1.04 insn per cycle               (84.05%)
>            13 llc_misses     #  3.69 LLC misses per million isns  (83.50%)
> 
> This command measures cycles and instructions for BPF program with id
> 337 for 3 seconds. The program has triggered 4228 times. The rest of the
> output is similar to perf-stat. In this example, the counters were only
> counting ~84% of the time because of time multiplexing of perf counters.
> 
> Note that, this approach measures cycles and instructions in very small
> increments. So the fentry/fexit programs introduce noticeable errors to
> the measurement results.
> 
> The fentry/fexit programs are generated with BPF skeletons. Therefore, we
> build bpftool twice. The first time _bpftool is built without skeletons.
> Then, _bpftool is used to generate the skeletons. The second time, bpftool
> is built with skeletons.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>   tools/bpf/bpftool/Makefile                |  18 +
>   tools/bpf/bpftool/prog.c                  | 428 +++++++++++++++++++++-
>   tools/bpf/bpftool/skeleton/profiler.bpf.c | 171 +++++++++
>   tools/bpf/bpftool/skeleton/profiler.h     |  47 +++
>   tools/scripts/Makefile.include            |   1 +
>   5 files changed, 664 insertions(+), 1 deletion(-)
>   create mode 100644 tools/bpf/bpftool/skeleton/profiler.bpf.c
>   create mode 100644 tools/bpf/bpftool/skeleton/profiler.h
> 
> diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
> index c4e810335810..c035fc107027 100644
> --- a/tools/bpf/bpftool/Makefile
> +++ b/tools/bpf/bpftool/Makefile
> @@ -59,6 +59,7 @@ LIBS = $(LIBBPF) -lelf -lz
>   
>   INSTALL ?= install
>   RM ?= rm -f
> +CLANG ?= clang
>   
>   FEATURE_USER = .bpftool
>   FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
> @@ -110,6 +111,22 @@ SRCS += $(BFD_SRCS)
>   endif
>   
>   OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
> +_OBJS = $(filter-out $(OUTPUT)prog.o,$(OBJS)) $(OUTPUT)_prog.o
> +
> +$(OUTPUT)_prog.o: prog.c
> +	$(QUIET_CC)$(COMPILE.c) -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $<
> +
> +$(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF)
> +	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(_OBJS) $(LIBS)
> +
> +skeleton/profiler.bpf.o: skeleton/profiler.bpf.c
> +	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $< -o $@

With a fresh checkout, applying this patch and just selftests like
   make -C tools/testing/selftests/bpf

I got the following build error:

make[2]: Leaving directory `/data/users/yhs/work/net-next/tools/lib/bpf'
clang -g -O2 -target bpf -c skeleton/profiler.bpf.c -o 
skeleton/profiler.bpf.o
skeleton/profiler.bpf.c:5:10: fatal error: 'bpf/bpf_helpers.h' file not 
found
#include <bpf/bpf_helpers.h>
          ^~~~~~~~~~~~~~~~~~~
1 error generated.
make[1]: *** [skeleton/profiler.bpf.o] Error 1

I think Makefile should be tweaked to avoid selftest failure.

> +
> +profiler.skel.h: $(OUTPUT)_bpftool skeleton/profiler.bpf.o
> +	$(QUIET_GEN)$(OUTPUT)./_bpftool gen skeleton skeleton/profiler.bpf.o > $@
> +
> +$(OUTPUT)prog.o: prog.c profiler.skel.h
> +	$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
>   
>   $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
>   	$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
> @@ -125,6 +142,7 @@ $(OUTPUT)%.o: %.c
>   clean: $(LIBBPF)-clean
>   	$(call QUIET_CLEAN, bpftool)
>   	$(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
> +	$(Q)$(RM) -- $(OUTPUT)_bpftool profiler.skel.h skeleton/profiler.bpf.o
>   	$(Q)$(RM) -r -- $(OUTPUT)libbpf/
>   	$(call QUIET_CLEAN, core-gen)
>   	$(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool
> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index 1996e67a2f00..39f0f14464ad 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -10,12 +10,16 @@
>   #include <string.h>
>   #include <time.h>
>   #include <unistd.h>
> +#include <signal.h>
>   #include <net/if.h>
>   #include <sys/types.h>
>   #include <sys/stat.h>
> +#include <sys/ioctl.h>
> +#include <sys/syscall.h>
>   
>   #include <linux/err.h>
>   #include <linux/sizes.h>
> +#include <linux/perf_event.h>
>   
>   #include <bpf/bpf.h>
>   #include <bpf/btf.h>
> @@ -1537,6 +1541,425 @@ static int do_loadall(int argc, char **argv)
>   	return load_with_options(argc, argv, false);
>   }
>   
> +#ifdef BPFTOOL_WITHOUT_SKELETONS
> +
> +static int do_profile(int argc, char **argv)
> +{
> +	return 0;
> +}
> +
> +#else /* BPFTOOL_WITHOUT_SKELETONS */
> +
> +#include "profiler.skel.h"
> +
> +#define SAMPLE_PERIOD  0x7fffffffffffffffULL
> +struct profile_metric {
> +	const char *name;
> +	struct perf_event_attr attr;
> +	bool selected;
> +	struct bpf_perf_event_value val;
> +
> +	/* calculate ratios like instructions per cycle */
> +	const int ratio_metric; /* 0 for N/A, 1 for index 0 (cycles)  */
> +	const float ratio_mul;
> +	const char *ratio_desc;
> +} metrics[] = {
> +	{
> +		.name = "cycles",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HARDWARE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config = PERF_COUNT_HW_CPU_CYCLES,
> +		},
> +	},
> +	{
> +		.name = "instructions",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HARDWARE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config = PERF_COUNT_HW_INSTRUCTIONS,
> +		},
> +		.ratio_metric = 1,
> +		.ratio_mul = 1.0,
> +		.ratio_desc = "insn per cycle",
> +	},
> +	{
> +		.name = "l1d_loads",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HW_CACHE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config =
> +				PERF_COUNT_HW_CACHE_L1D |
> +				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
> +				(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
> +		},
> +	},
> +	{
> +		.name = "llc_misses",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HW_CACHE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config =
> +				PERF_COUNT_HW_CACHE_LL |
> +				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
> +				(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
> +		},
> +		.ratio_metric = 2,
> +		.ratio_mul = 1e6,
> +		.ratio_desc = "LLC misses per million isns",
> +	},
> +};
> +
> +u64 profile_total_count;
> +
> +#define MAX_NUM_PROFILE_METRICS 4
> +
> +static int profile_parse_metrics(int argc, char **argv)
> +{
> +	unsigned int metric_cnt;
> +	int selected_cnt = 0;
> +	unsigned int i;
> +
> +	metric_cnt = sizeof(metrics) / sizeof(struct profile_metric);
> +
> +	while (argc > 0) {
> +		for (i = 0; i < metric_cnt; i++) {
> +			if (strcmp(argv[0], metrics[i].name) == 0) {
> +				if (!metrics[i].selected)
> +					selected_cnt++;
> +				metrics[i].selected = true;
> +				break;
> +			}
> +		}
> +		if (i == metric_cnt) {
> +			p_err("unknown metric %s", argv[0]);
> +			return -1;
> +		}
> +		NEXT_ARG();
> +	}
> +	if (selected_cnt > MAX_NUM_PROFILE_METRICS) {
> +		p_err("too many (%d) metrics, please specify no more than %d metrics at at time",
> +		      selected_cnt, MAX_NUM_PROFILE_METRICS);
> +		return -1;
> +	}
> +	return selected_cnt;
> +}
> +
> +static void profile_read_values(struct profiler_bpf *obj)
> +{
> +	u32 m, cpu, num_cpu = obj->rodata->num_cpu;
> +	u64 counts[num_cpu];
> +	int reading_map_fd, count_map_fd;
> +	u32 key = 0;
> +
> +	reading_map_fd = bpf_map__fd(obj->maps.accum_readings);
> +	count_map_fd = bpf_map__fd(obj->maps.counts);
> +	if (reading_map_fd < 0 || count_map_fd < 0) {
> +		p_err("failed to get fd for map");
> +		return;
> +	}
> +
> +	assert(bpf_map_lookup_elem(count_map_fd, &key, counts) == 0);
> +	profile_total_count = 0;
> +	for (cpu = 0; cpu < num_cpu; cpu++)
> +		profile_total_count += counts[cpu];
> +
> +	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
> +		struct bpf_perf_event_value values[obj->rodata->num_cpu];
> +
> +		if (!metrics[m].selected)
> +			continue;
> +
> +		assert(bpf_map_lookup_elem(reading_map_fd, &key, values) == 0);
> +		for (cpu = 0; cpu < num_cpu; cpu++) {
> +			metrics[m].val.counter += values[cpu].counter;
> +			metrics[m].val.enabled += values[cpu].enabled;
> +			metrics[m].val.running += values[cpu].running;
> +		}
> +		key++;
> +	}
> +}
> +
> +static void profile_print_readings_json(void)
> +{
> +	u32 m;
> +
> +	jsonw_start_array(json_wtr);
> +	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
> +		if (!metrics[m].selected)
> +			continue;
> +		jsonw_start_object(json_wtr);
> +		jsonw_string_field(json_wtr, "metric", metrics[m].name);
> +		jsonw_lluint_field(json_wtr, "run_cnt", profile_total_count);
> +		jsonw_lluint_field(json_wtr, "value", metrics[m].val.counter);
> +		jsonw_lluint_field(json_wtr, "enabled", metrics[m].val.enabled);
> +		jsonw_lluint_field(json_wtr, "running", metrics[m].val.running);
> +
> +		jsonw_end_object(json_wtr);
> +	}
> +	jsonw_end_array(json_wtr);
> +}
> +
> +static void profile_print_readings_plain(void)
> +{
> +	u32 m;
> +
> +	printf("\n%18lu %-20s\n", profile_total_count, "run_cnt");
> +	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
> +		struct bpf_perf_event_value *val = &metrics[m].val;
> +		int r;
> +
> +		if (!metrics[m].selected)
> +			continue;
> +		printf("%18llu %-20s", val->counter, metrics[m].name);
> +
> +		r = metrics[m].ratio_metric - 1;
> +		if (r >= 0 && metrics[r].selected) {
> +			printf("# %8.2f %-30s",
> +			       val->counter * metrics[m].ratio_mul /
> +			       metrics[r].val.counter,
> +			       metrics[m].ratio_desc);
> +		} else {
> +			printf("%-41s", "");
> +		}
> +
> +		if (val->enabled > val->running)
> +			printf("(%4.2f%%)",
> +			       val->running * 100.0 / val->enabled);
> +		printf("\n");
> +	}
> +}
> +
> +static void profile_print_readings(void)
> +{
> +	if (json_output)
> +		profile_print_readings_json();
> +	else
> +		profile_print_readings_plain();
> +}
> +
> +static void profile_close_perf_events(struct profiler_bpf *obj)
> +{
> +	int map_fd, pmu_fd;
> +	u32 i;
> +
> +	map_fd = bpf_map__fd(obj->maps.events);
> +	if (map_fd < 0) {
> +		p_err("failed to get fd for events map");
> +		return;
> +	}
> +
> +	for (i = 0; i < obj->rodata->num_cpu * obj->rodata->num_metric; i++) {
> +		bpf_map_lookup_elem(map_fd, &i, &pmu_fd);
> +		close(pmu_fd);
> +	}
> +}
> +
> +static int profile_open_perf_events(struct profiler_bpf *obj)
> +{
> +	int map_fd, pmu_fd, i, key = 0;
> +	unsigned int cpu, m;
> +
> +	map_fd = bpf_map__fd(obj->maps.events);
> +	if (map_fd < 0) {
> +		p_err("failed to get fd for events map");
> +		return -1;
> +	}
> +
> +	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
> +		if (!metrics[m].selected)
> +			continue;
> +		for (cpu = 0; cpu < obj->rodata->num_cpu; cpu++) {
> +			pmu_fd = syscall(__NR_perf_event_open, &metrics[m].attr,
> +					 -1/*pid*/, cpu, -1/*group_fd*/, 0);
> +			if (pmu_fd < 0 ||
> +			    bpf_map_update_elem(map_fd, &key,
> +						&pmu_fd, BPF_ANY) ||
> +			    ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) {
> +				p_err("failed to create event %s on cpu %d",
> +				      metrics[m].name, cpu);
> +				goto err;
> +			}
> +			key++;
> +		}
> +	}
> +	return 0;
> +err:
> +	for (i = key - 1; i >= 0; i--) {
> +		bpf_map_lookup_elem(map_fd, &i, &pmu_fd);
> +		close(pmu_fd);
> +	}
> +	return -1;
> +}
> +
> +static char *profile_target_name(int tgt_fd)
> +{
> +	struct bpf_prog_info_linear *info_linear;
> +	struct bpf_func_info *func_info;
> +	const struct btf_type *t;
> +	char *name = NULL;
> +	struct btf *btf;
> +
> +	info_linear = bpf_program__get_prog_info_linear(
> +		tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO);
> +	if (IS_ERR_OR_NULL(info_linear)) {
> +		p_err("failed to get info_linear for prog FD %d", tgt_fd);
> +		return NULL;
> +	}
> +
> +	if (info_linear->info.btf_id == 0 ||
> +	    btf__get_from_id(info_linear->info.btf_id, &btf)) {
> +		p_err("prog FD %d doesn't have valid btf", tgt_fd);
> +		goto out;
> +	}
> +
> +	func_info = (struct bpf_func_info *)(info_linear->info.func_info);
> +	t = btf__type_by_id(btf, func_info[0].type_id);
> +	if (!t) {
> +		p_err("btf %d doesn't have type %d",
> +		      info_linear->info.btf_id, func_info[0].type_id);
> +		goto out;
> +	}
> +	name = strdup(btf__name_by_offset(btf, t->name_off));
> +out:
> +	free(info_linear);
> +	return name;
> +}
> +
> +struct profiler_bpf *profile_obj;
> +int profile_tgt_fd = -1;
> +char *profile_tgt_name;
> +
> +static void profile_print_and_cleanup(void)
> +{
> +	profile_close_perf_events(profile_obj);
> +	profiler_bpf__detach(profile_obj);
> +	profile_read_values(profile_obj);
> +	profiler_bpf__destroy(profile_obj);
> +	profile_print_readings();
> +
> +	if (profile_tgt_fd >= 0)
> +		close(profile_tgt_fd);
> +	free(profile_tgt_name);
> +}
> +
> +#define PROFILE_DEFAULT_LONG_DURATION (3600 * 24)
> +
> +static void int_exit(int signo)
> +{
> +	profile_print_and_cleanup();
> +	exit(0);
> +}
> +
> +static int do_profile(int argc, char **argv)
> +{
> +	int num_metric, num_cpu, err = -1;
> +	struct bpf_program *prog;
> +	unsigned long duration;
> +	char *endptr;
> +
> +	/* we at least need: <duration>, "id", <id>, <metric> */
> +	if (argc < 4)
> +		usage();
> +
> +	/* parse profiling duration */
> +	duration = strtoul(*argv, &endptr, 0);
> +	if (*endptr)
> +		duration = PROFILE_DEFAULT_LONG_DURATION;
> +	else
> +		NEXT_ARG();
> +
> +	/* parse target fd */
> +	profile_tgt_fd = prog_parse_fd(&argc, &argv);
> +	if (profile_tgt_fd < 0) {
> +		p_err("failed to parse fd");
> +		return -1;
> +	}
> +
> +	num_metric = profile_parse_metrics(argc, argv);
> +	if (num_metric <= 0)
> +		goto out;
> +
> +	num_cpu = libbpf_num_possible_cpus();
> +	if (num_cpu <= 0) {
> +		p_err("failed to identify number of CPUs");
> +		goto out;
> +	}
> +
> +	profile_obj = profiler_bpf__open();
> +	if (!profile_obj) {
> +		p_err("failed to open and/or load BPF object");
> +		goto out;
> +	}
> +
> +	profile_obj->rodata->num_cpu = num_cpu;
> +	profile_obj->rodata->num_metric = num_metric;
> +
> +	/* adjust map sizes */
> +	bpf_map__resize(profile_obj->maps.events, num_metric * num_cpu);
> +	bpf_map__resize(profile_obj->maps.fentry_readings, num_metric);
> +	bpf_map__resize(profile_obj->maps.accum_readings, num_metric);
> +	bpf_map__resize(profile_obj->maps.counts, 1);
> +
> +	/* change target name */
> +	profile_tgt_name = profile_target_name(profile_tgt_fd);
> +	if (!profile_tgt_name) {
> +		p_err("failed to load target function name");
> +		goto out;
> +	}
> +
> +	bpf_object__for_each_program(prog, profile_obj->obj) {
> +		err = bpf_program__set_attach_target(prog, profile_tgt_fd,
> +						     profile_tgt_name);
> +		if (err) {
> +			p_err("failed to set attach target\n");
> +			goto out;
> +		}
> +	}
> +
> +	set_max_rlimit();
> +	err = profiler_bpf__load(profile_obj);
> +	if (err) {
> +		p_err("failed to load profile_obj");
> +		goto out;
> +	}
> +
> +	profile_open_perf_events(profile_obj);
> +
> +	err = profiler_bpf__attach(profile_obj);
> +	if (err) {
> +		p_err("failed to attach profile_obj");
> +		goto out;
> +	}
> +	signal(SIGINT, int_exit);
> +
> +	sleep(duration);
> +	profile_print_and_cleanup();
> +	return 0;
> +out:
> +	close(profile_tgt_fd);
> +	free(profile_tgt_name);
> +	return err;
> +}
> +
> +#endif /* BPFTOOL_WITHOUT_SKELETONS */
> +
>   static int do_help(int argc, char **argv)
>   {
>   	if (json_output) {
> @@ -1560,6 +1983,7 @@ static int do_help(int argc, char **argv)
>   		"                         [data_out FILE [data_size_out L]] \\\n"
>   		"                         [ctx_in FILE [ctx_out FILE [ctx_size_out M]]] \\\n"
>   		"                         [repeat N]\n"
> +		"       %s %s profile [DURATION] PROG METRICs\n"
>   		"       %s %s tracelog\n"
>   		"       %s %s help\n"
>   		"\n"
> @@ -1578,11 +2002,12 @@ static int do_help(int argc, char **argv)
>   		"       ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
>   		"                        flow_dissector }\n"
>   		"       " HELP_SPEC_OPTIONS "\n"
> +		"       METRIC := { cycles | instructions | l1d_loads | llc_misses }\n"
>   		"",
>   		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
>   		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
>   		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
> -		bin_name, argv[-2]);
> +		bin_name, argv[-2], bin_name, argv[-2]);
>   
>   	return 0;
>   }
> @@ -1599,6 +2024,7 @@ static const struct cmd cmds[] = {
>   	{ "detach",	do_detach },
>   	{ "tracelog",	do_tracelog },
>   	{ "run",	do_run },
> +	{ "profile",	do_profile },
>   	{ 0 }
>   };
>   
> diff --git a/tools/bpf/bpftool/skeleton/profiler.bpf.c b/tools/bpf/bpftool/skeleton/profiler.bpf.c
> new file mode 100644
> index 000000000000..abd3a7aacc1f
> --- /dev/null
> +++ b/tools/bpf/bpftool/skeleton/profiler.bpf.c
> @@ -0,0 +1,171 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2019 Facebook
> +#include "profiler.h"
> +#include <linux/bpf.h>
> +#include <bpf/bpf_helpers.h>
> +
> +#define ___bpf_concat(a, b) a ## b
> +#define ___bpf_apply(fn, n) ___bpf_concat(fn, n)
> +#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N
> +#define ___bpf_narg(...) \
> +	___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
> +#define ___bpf_empty(...) \
> +	___bpf_nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0)
> +
> +#define ___bpf_ctx_cast0() ctx
> +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0]
> +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1]
> +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2]
> +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3]
> +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4]
> +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5]
> +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6]
> +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7]
> +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8]
> +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9]
> +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10]
> +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11]
> +#define ___bpf_ctx_cast(args...) \
> +	___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args)
> +
> +/*
> + * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and
> + * similar kinds of BPF programs, that accept input arguments as a single
> + * pointer to untyped u64 array, where each u64 can actually be a typed
> + * pointer or integer of different size. Instead of requring user to write
> + * manual casts and work with array elements by index, BPF_PROG macro
> + * allows user to declare a list of named and typed input arguments in the
> + * same syntax as for normal C function. All the casting is hidden and
> + * performed transparently, while user code can just assume working with
> + * function arguments of specified type and name.
> + *
> + * Original raw context argument is preserved as well as 'ctx' argument.
> + * This is useful when using BPF helpers that expect original context
> + * as one of the parameters (e.g., for bpf_perf_event_output()).
> + */
> +#define BPF_PROG(name, args...)						    \
> +name(unsigned long long *ctx);						    \
> +static __always_inline typeof(name(0))					    \
> +____##name(unsigned long long *ctx, ##args);				    \
> +typeof(name(0)) name(unsigned long long *ctx)				    \
> +{									    \
> +	_Pragma("GCC diagnostic push")					    \
> +	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		    \
> +	return ____##name(___bpf_ctx_cast(args));			    \
> +	_Pragma("GCC diagnostic pop")					    \
> +}									    \
> +static __always_inline typeof(name(0))					    \
> +____##name(unsigned long long *ctx, ##args)
> +
> +/* map of perf event fds, num_cpu * num_metric entries */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
> +	__uint(key_size, sizeof(u32));
> +	__uint(value_size, sizeof(int));
> +} events SEC(".maps");
> +
> +/* readings at fentry */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(key_size, sizeof(u32));
> +	__uint(value_size, sizeof(struct bpf_perf_event_value));
> +} fentry_readings SEC(".maps");
> +
> +/* accumulated readings */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(key_size, sizeof(u32));
> +	__uint(value_size, sizeof(struct bpf_perf_event_value));
> +} accum_readings SEC(".maps");
> +
> +/* sample counts, one per cpu */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(key_size, sizeof(u32));
> +	__uint(value_size, sizeof(u64));
> +} counts SEC(".maps");
> +
> +const volatile __u32 num_cpu = 1;
> +const volatile __u32 num_metric = 1;
> +#define MAX_NUM_MATRICS 4
> +
> +SEC("fentry/XXX")
> +int BPF_PROG(fentry_XXX)
> +{
> +	struct bpf_perf_event_value *ptrs[MAX_NUM_MATRICS];
> +	u32 key = bpf_get_smp_processor_id();
> +	u32 i;
> +
> +	/* look up before reading, to reduce error */
> +	for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) {
> +		u32 flag = i;
> +
> +		ptrs[i] = bpf_map_lookup_elem(&fentry_readings, &flag);
> +		if (!ptrs[i])
> +			return 0;
> +	}
> +
> +	for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) {
> +		struct bpf_perf_event_value reading;
> +		int err;
> +
> +		err = bpf_perf_event_read_value(&events, key, &reading,
> +						sizeof(reading));
> +		if (err)
> +			return 0;
> +		*(ptrs[i]) = reading;
> +		key += num_cpu;
> +	}
> +
> +	return 0;
> +}
> +
> +static inline void
> +fexit_update_maps(u32 id, struct bpf_perf_event_value *after)
> +{
> +	struct bpf_perf_event_value *before, diff, *accum;
> +
> +	before = bpf_map_lookup_elem(&fentry_readings, &id);
> +	/* only account samples with a valid fentry_reading */
> +	if (before && before->counter) {
> +		struct bpf_perf_event_value *accum;
> +
> +		diff.counter = after->counter - before->counter;
> +		diff.enabled = after->enabled - before->enabled;
> +		diff.running = after->running - before->running;
> +
> +		accum = bpf_map_lookup_elem(&accum_readings, &id);
> +		if (accum) {
> +			accum->counter += diff.counter;
> +			accum->enabled += diff.enabled;
> +			accum->running += diff.running;
> +		}
> +	}
> +}
> +
> +SEC("fexit/XXX")
> +int BPF_PROG(fexit_XXX)
> +{
> +	struct bpf_perf_event_value readings[MAX_NUM_MATRICS];
> +	u32 cpu = bpf_get_smp_processor_id();
> +	u32 i, one = 1, zero = 0;
> +	int err;
> +	u64 *count;
> +
> +	/* read all events before updating the maps, to reduce error */
> +	for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) {
> +		err = bpf_perf_event_read_value(&events, cpu + i * num_cpu,
> +						readings + i, sizeof(*readings));
> +		if (err)
> +			return 0;
> +	}
> +	count = bpf_map_lookup_elem(&counts, &zero);
> +	if (count) {
> +		*count += 1;
> +		for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++)
> +			fexit_update_maps(i, &readings[i]);
> +	}
> +	return 0;
> +}
> +
> +char LICENSE[] SEC("license") = "GPL";
> diff --git a/tools/bpf/bpftool/skeleton/profiler.h b/tools/bpf/bpftool/skeleton/profiler.h
> new file mode 100644
> index 000000000000..ae15cb0c4d43
> --- /dev/null
> +++ b/tools/bpf/bpftool/skeleton/profiler.h
> @@ -0,0 +1,47 @@
> +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
> +#ifndef __PROFILER_H
> +#define __PROFILER_H
> +
> +/* useful typedefs from vimlinux.h */
> +
> +typedef signed char __s8;
> +typedef unsigned char __u8;
> +typedef short int __s16;
> +typedef short unsigned int __u16;
> +typedef int __s32;
> +typedef unsigned int __u32;
> +typedef long long int __s64;
> +typedef long long unsigned int __u64;
> +
> +typedef __s8 s8;
> +typedef __u8 u8;
> +typedef __s16 s16;
> +typedef __u16 u16;
> +typedef __s32 s32;
> +typedef __u32 u32;
> +typedef __s64 s64;
> +typedef __u64 u64;
> +
> +enum {
> +	false = 0,
> +	true = 1,
> +};
> +
> +#ifdef __CHECKER__
> +#define __bitwise__ __attribute__((bitwise))
> +#else
> +#define __bitwise__
> +#endif
> +#define __bitwise __bitwise__
> +
> +typedef __u16 __bitwise __le16;
> +typedef __u16 __bitwise __be16;
> +typedef __u32 __bitwise __le32;
> +typedef __u32 __bitwise __be32;
> +typedef __u64 __bitwise __le64;
> +typedef __u64 __bitwise __be64;
> +
> +typedef __u16 __bitwise __sum16;
> +typedef __u32 __bitwise __wsum;
> +
> +#endif /* __PROFILER_H */
> diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
> index ded7a950dc40..59f31f01cb93 100644
> --- a/tools/scripts/Makefile.include
> +++ b/tools/scripts/Makefile.include
> @@ -106,6 +106,7 @@ ifneq ($(silent),1)
>     ifneq ($(V),1)
>   	QUIET_CC       = @echo '  CC       '$@;
>   	QUIET_CC_FPIC  = @echo '  CC FPIC  '$@;
> +	QUIET_CLANG    = @echo '  CLANG    '$@;
>   	QUIET_AR       = @echo '  AR       '$@;
>   	QUIET_LINK     = @echo '  LINK     '$@;
>   	QUIET_MKDIR    = @echo '  MKDIR    '$@;
>
Yonghong Song March 1, 2020, 4:14 a.m. UTC | #2
On 2/28/20 3:40 PM, Song Liu wrote:
> With fentry/fexit programs, it is possible to profile BPF program with
> hardware counters. Introduce bpftool "prog profile", which measures key
> metrics of a BPF program.
> 
> bpftool prog profile command creates per-cpu perf events. Then it attaches
> fentry/fexit programs to the target BPF program. The fentry program saves
> perf event value to a map. The fexit program reads the perf event again,
> and calculates the difference, which is the instructions/cycles used by
> the target program.
> 
> Example input and output:
> 
>    ./bpftool prog profile 3 id 337 cycles instructions llc_misses
> 
>          4228 run_cnt
>       3403698 cycles                                              (84.08%)
>       3525294 instructions   #  1.04 insn per cycle               (84.05%)
>            13 llc_misses     #  3.69 LLC misses per million isns  (83.50%)

if run_cnt is 0, the following is the result:

-bash-4.4$ sudo ./bpftool prog profile 3 id 52 cycles instructions 
llc_misses

                  0 run_cnt
                  0 cycles 

                  0 instructions        #     -nan insn per cycle 

                  0 llc_misses          #     -nan LLC misses per 
million isns

-nan is a little bit crypto for user output. maybe just says
   unknown insns per cycle
in the comment?

We can still display "0 cycles" etc. just to make output uniform.

> 
> This command measures cycles and instructions for BPF program with id
> 337 for 3 seconds. The program has triggered 4228 times. The rest of the
> output is similar to perf-stat. In this example, the counters were only
> counting ~84% of the time because of time multiplexing of perf counters.
> 
> Note that, this approach measures cycles and instructions in very small
> increments. So the fentry/fexit programs introduce noticeable errors to
> the measurement results.
> 
> The fentry/fexit programs are generated with BPF skeletons. Therefore, we
> build bpftool twice. The first time _bpftool is built without skeletons.
> Then, _bpftool is used to generate the skeletons. The second time, bpftool
> is built with skeletons.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>   tools/bpf/bpftool/Makefile                |  18 +
>   tools/bpf/bpftool/prog.c                  | 428 +++++++++++++++++++++-
>   tools/bpf/bpftool/skeleton/profiler.bpf.c | 171 +++++++++
>   tools/bpf/bpftool/skeleton/profiler.h     |  47 +++
>   tools/scripts/Makefile.include            |   1 +
>   5 files changed, 664 insertions(+), 1 deletion(-)
>   create mode 100644 tools/bpf/bpftool/skeleton/profiler.bpf.c
>   create mode 100644 tools/bpf/bpftool/skeleton/profiler.h
[.....]
Song Liu March 1, 2020, 10:37 p.m. UTC | #3
> On Feb 29, 2020, at 7:52 PM, Yonghong Song <yhs@fb.com> wrote:
> 
> 
> 
> On 2/28/20 3:40 PM, Song Liu wrote:
>> With fentry/fexit programs, it is possible to profile BPF program with
>> hardware counters. Introduce bpftool "prog profile", which measures key
>> metrics of a BPF program.
>> bpftool prog profile command creates per-cpu perf events. Then it attaches
>> fentry/fexit programs to the target BPF program. The fentry program saves
>> perf event value to a map. The fexit program reads the perf event again,
>> and calculates the difference, which is the instructions/cycles used by
>> the target program.
>> Example input and output:
>>   ./bpftool prog profile 3 id 337 cycles instructions llc_misses
>>         4228 run_cnt
>>      3403698 cycles                                              (84.08%)
>>      3525294 instructions   #  1.04 insn per cycle               (84.05%)
>>           13 llc_misses     #  3.69 LLC misses per million isns  (83.50%)
>> This command measures cycles and instructions for BPF program with id
>> 337 for 3 seconds. The program has triggered 4228 times. The rest of the
>> output is similar to perf-stat. In this example, the counters were only
>> counting ~84% of the time because of time multiplexing of perf counters.
>> Note that, this approach measures cycles and instructions in very small
>> increments. So the fentry/fexit programs introduce noticeable errors to
>> the measurement results.
>> The fentry/fexit programs are generated with BPF skeletons. Therefore, we
>> build bpftool twice. The first time _bpftool is built without skeletons.
>> Then, _bpftool is used to generate the skeletons. The second time, bpftool
>> is built with skeletons.
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>>  tools/bpf/bpftool/Makefile                |  18 +
>>  tools/bpf/bpftool/prog.c                  | 428 +++++++++++++++++++++-
>>  tools/bpf/bpftool/skeleton/profiler.bpf.c | 171 +++++++++
>>  tools/bpf/bpftool/skeleton/profiler.h     |  47 +++
>>  tools/scripts/Makefile.include            |   1 +
>>  5 files changed, 664 insertions(+), 1 deletion(-)
>>  create mode 100644 tools/bpf/bpftool/skeleton/profiler.bpf.c
>>  create mode 100644 tools/bpf/bpftool/skeleton/profiler.h
>> diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
>> index c4e810335810..c035fc107027 100644
>> --- a/tools/bpf/bpftool/Makefile
>> +++ b/tools/bpf/bpftool/Makefile
>> @@ -59,6 +59,7 @@ LIBS = $(LIBBPF) -lelf -lz
>>    INSTALL ?= install
>>  RM ?= rm -f
>> +CLANG ?= clang
>>    FEATURE_USER = .bpftool
>>  FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
>> @@ -110,6 +111,22 @@ SRCS += $(BFD_SRCS)
>>  endif
>>    OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
>> +_OBJS = $(filter-out $(OUTPUT)prog.o,$(OBJS)) $(OUTPUT)_prog.o
>> +
>> +$(OUTPUT)_prog.o: prog.c
>> +	$(QUIET_CC)$(COMPILE.c) -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $<
>> +
>> +$(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF)
>> +	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(_OBJS) $(LIBS)
>> +
>> +skeleton/profiler.bpf.o: skeleton/profiler.bpf.c
>> +	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $< -o $@
> 
> With a fresh checkout, applying this patch and just selftests like
>  make -C tools/testing/selftests/bpf
> 
> I got the following build error:
> 
> make[2]: Leaving directory `/data/users/yhs/work/net-next/tools/lib/bpf'
> clang -g -O2 -target bpf -c skeleton/profiler.bpf.c -o skeleton/profiler.bpf.o
> skeleton/profiler.bpf.c:5:10: fatal error: 'bpf/bpf_helpers.h' file not found
> #include <bpf/bpf_helpers.h>
>         ^~~~~~~~~~~~~~~~~~~
> 1 error generated.
> make[1]: *** [skeleton/profiler.bpf.o] Error 1
> 
> I think Makefile should be tweaked to avoid selftest failure.

Hmm... I am not seeing this error. The build succeeded in the test. 

Thanks,
Song
Song Liu March 1, 2020, 10:37 p.m. UTC | #4
> On Feb 29, 2020, at 8:14 PM, Yonghong Song <yhs@fb.com> wrote:
> 
> 
> 
> On 2/28/20 3:40 PM, Song Liu wrote:
>> With fentry/fexit programs, it is possible to profile BPF program with
>> hardware counters. Introduce bpftool "prog profile", which measures key
>> metrics of a BPF program.
>> bpftool prog profile command creates per-cpu perf events. Then it attaches
>> fentry/fexit programs to the target BPF program. The fentry program saves
>> perf event value to a map. The fexit program reads the perf event again,
>> and calculates the difference, which is the instructions/cycles used by
>> the target program.
>> Example input and output:
>>   ./bpftool prog profile 3 id 337 cycles instructions llc_misses
>>         4228 run_cnt
>>      3403698 cycles                                              (84.08%)
>>      3525294 instructions   #  1.04 insn per cycle               (84.05%)
>>           13 llc_misses     #  3.69 LLC misses per million isns  (83.50%)
> 
> if run_cnt is 0, the following is the result:
> 
> -bash-4.4$ sudo ./bpftool prog profile 3 id 52 cycles instructions llc_misses
> 
>                 0 run_cnt
>                 0 cycles 
>                 0 instructions        #     -nan insn per cycle 
>                 0 llc_misses          #     -nan LLC misses per million isns
> 
> -nan is a little bit crypto for user output. maybe just says
>  unknown insns per cycle
> in the comment?
> 
> We can still display "0 cycles" etc. just to make output uniform.

Good catch! Will fix this in next version. 

Song
Yonghong Song March 2, 2020, 3:42 a.m. UTC | #5
On 3/1/20 2:37 PM, Song Liu wrote:
> 
> 
>> On Feb 29, 2020, at 7:52 PM, Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 2/28/20 3:40 PM, Song Liu wrote:
>>> With fentry/fexit programs, it is possible to profile BPF program with
>>> hardware counters. Introduce bpftool "prog profile", which measures key
>>> metrics of a BPF program.
>>> bpftool prog profile command creates per-cpu perf events. Then it attaches
>>> fentry/fexit programs to the target BPF program. The fentry program saves
>>> perf event value to a map. The fexit program reads the perf event again,
>>> and calculates the difference, which is the instructions/cycles used by
>>> the target program.
>>> Example input and output:
>>>    ./bpftool prog profile 3 id 337 cycles instructions llc_misses
>>>          4228 run_cnt
>>>       3403698 cycles                                              (84.08%)
>>>       3525294 instructions   #  1.04 insn per cycle               (84.05%)
>>>            13 llc_misses     #  3.69 LLC misses per million isns  (83.50%)
>>> This command measures cycles and instructions for BPF program with id
>>> 337 for 3 seconds. The program has triggered 4228 times. The rest of the
>>> output is similar to perf-stat. In this example, the counters were only
>>> counting ~84% of the time because of time multiplexing of perf counters.
>>> Note that, this approach measures cycles and instructions in very small
>>> increments. So the fentry/fexit programs introduce noticeable errors to
>>> the measurement results.
>>> The fentry/fexit programs are generated with BPF skeletons. Therefore, we
>>> build bpftool twice. The first time _bpftool is built without skeletons.
>>> Then, _bpftool is used to generate the skeletons. The second time, bpftool
>>> is built with skeletons.
>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>>> ---
>>>   tools/bpf/bpftool/Makefile                |  18 +
>>>   tools/bpf/bpftool/prog.c                  | 428 +++++++++++++++++++++-
>>>   tools/bpf/bpftool/skeleton/profiler.bpf.c | 171 +++++++++
>>>   tools/bpf/bpftool/skeleton/profiler.h     |  47 +++
>>>   tools/scripts/Makefile.include            |   1 +
>>>   5 files changed, 664 insertions(+), 1 deletion(-)
>>>   create mode 100644 tools/bpf/bpftool/skeleton/profiler.bpf.c
>>>   create mode 100644 tools/bpf/bpftool/skeleton/profiler.h
>>> diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
>>> index c4e810335810..c035fc107027 100644
>>> --- a/tools/bpf/bpftool/Makefile
>>> +++ b/tools/bpf/bpftool/Makefile
>>> @@ -59,6 +59,7 @@ LIBS = $(LIBBPF) -lelf -lz
>>>     INSTALL ?= install
>>>   RM ?= rm -f
>>> +CLANG ?= clang
>>>     FEATURE_USER = .bpftool
>>>   FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
>>> @@ -110,6 +111,22 @@ SRCS += $(BFD_SRCS)
>>>   endif
>>>     OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
>>> +_OBJS = $(filter-out $(OUTPUT)prog.o,$(OBJS)) $(OUTPUT)_prog.o
>>> +
>>> +$(OUTPUT)_prog.o: prog.c
>>> +	$(QUIET_CC)$(COMPILE.c) -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $<
>>> +
>>> +$(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF)
>>> +	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(_OBJS) $(LIBS)
>>> +
>>> +skeleton/profiler.bpf.o: skeleton/profiler.bpf.c
>>> +	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $< -o $@
>>
>> With a fresh checkout, applying this patch and just selftests like
>>   make -C tools/testing/selftests/bpf
>>
>> I got the following build error:
>>
>> make[2]: Leaving directory `/data/users/yhs/work/net-next/tools/lib/bpf'
>> clang -g -O2 -target bpf -c skeleton/profiler.bpf.c -o skeleton/profiler.bpf.o
>> skeleton/profiler.bpf.c:5:10: fatal error: 'bpf/bpf_helpers.h' file not found
>> #include <bpf/bpf_helpers.h>
>>          ^~~~~~~~~~~~~~~~~~~
>> 1 error generated.
>> make[1]: *** [skeleton/profiler.bpf.o] Error 1
>>
>> I think Makefile should be tweaked to avoid selftest failure.
> 
> Hmm... I am not seeing this error. The build succeeded in the test.

Just tried again with a *fresh* checkout of tools/ directory with the patch.
   -bash-4.4$ make -C tools/testing/selftests/bpf
   ...
   LINK 
/data/users/yhs/work/net-next/tools/testing/selftests/bpf/tools/build/bpftool//libbpf/libbpf.a
   LINK 
/data/users/yhs/work/net-next/tools/testing/selftests/bpf/tools/build/bpftool/_bpftool
make[1]: *** No rule to make target `skeleton/profiler.bpf.c', needed by 
`skeleton/profiler.bpf.o'.  Stop.
make: *** 
[/data/users/yhs/work/net-next/tools/testing/selftests/bpf/tools/sbin/bpftool] 
Error 2
make: Leaving directory 
`/data/users/yhs/work/net-next/tools/testing/selftests/bpf

The error is different from my previous try, but the build still fails.

> 
> Thanks,
> Song
>
Yonghong Song March 2, 2020, 4:24 a.m. UTC | #6
On 2/28/20 3:40 PM, Song Liu wrote:
> With fentry/fexit programs, it is possible to profile BPF program with
> hardware counters. Introduce bpftool "prog profile", which measures key
> metrics of a BPF program.
> 
> bpftool prog profile command creates per-cpu perf events. Then it attaches
> fentry/fexit programs to the target BPF program. The fentry program saves
> perf event value to a map. The fexit program reads the perf event again,
> and calculates the difference, which is the instructions/cycles used by
> the target program.
> 
> Example input and output:
> 
>    ./bpftool prog profile 3 id 337 cycles instructions llc_misses
> 
>          4228 run_cnt
>       3403698 cycles                                              (84.08%)
>       3525294 instructions   #  1.04 insn per cycle               (84.05%)
>            13 llc_misses     #  3.69 LLC misses per million isns  (83.50%)
> 
> This command measures cycles and instructions for BPF program with id
> 337 for 3 seconds. The program has triggered 4228 times. The rest of the
> output is similar to perf-stat. In this example, the counters were only
> counting ~84% of the time because of time multiplexing of perf counters.
> 
> Note that, this approach measures cycles and instructions in very small
> increments. So the fentry/fexit programs introduce noticeable errors to
> the measurement results.
> 
> The fentry/fexit programs are generated with BPF skeletons. Therefore, we
> build bpftool twice. The first time _bpftool is built without skeletons.
> Then, _bpftool is used to generate the skeletons. The second time, bpftool
> is built with skeletons.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>   tools/bpf/bpftool/Makefile                |  18 +
>   tools/bpf/bpftool/prog.c                  | 428 +++++++++++++++++++++-
>   tools/bpf/bpftool/skeleton/profiler.bpf.c | 171 +++++++++
>   tools/bpf/bpftool/skeleton/profiler.h     |  47 +++
>   tools/scripts/Makefile.include            |   1 +
>   5 files changed, 664 insertions(+), 1 deletion(-)
>   create mode 100644 tools/bpf/bpftool/skeleton/profiler.bpf.c
>   create mode 100644 tools/bpf/bpftool/skeleton/profiler.h
> 
> diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
> index c4e810335810..c035fc107027 100644
> --- a/tools/bpf/bpftool/Makefile
> +++ b/tools/bpf/bpftool/Makefile
> @@ -59,6 +59,7 @@ LIBS = $(LIBBPF) -lelf -lz
>   
>   INSTALL ?= install
>   RM ?= rm -f
> +CLANG ?= clang
>   
>   FEATURE_USER = .bpftool
>   FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
> @@ -110,6 +111,22 @@ SRCS += $(BFD_SRCS)
>   endif
>   
>   OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
> +_OBJS = $(filter-out $(OUTPUT)prog.o,$(OBJS)) $(OUTPUT)_prog.o
> +
> +$(OUTPUT)_prog.o: prog.c
> +	$(QUIET_CC)$(COMPILE.c) -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $<
> +
> +$(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF)
> +	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(_OBJS) $(LIBS)
> +
> +skeleton/profiler.bpf.o: skeleton/profiler.bpf.c
> +	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $< -o $@
> +
> +profiler.skel.h: $(OUTPUT)_bpftool skeleton/profiler.bpf.o
> +	$(QUIET_GEN)$(OUTPUT)./_bpftool gen skeleton skeleton/profiler.bpf.o > $@
> +
> +$(OUTPUT)prog.o: prog.c profiler.skel.h
> +	$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
>   
>   $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
>   	$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
> @@ -125,6 +142,7 @@ $(OUTPUT)%.o: %.c
>   clean: $(LIBBPF)-clean
>   	$(call QUIET_CLEAN, bpftool)
>   	$(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
> +	$(Q)$(RM) -- $(OUTPUT)_bpftool profiler.skel.h skeleton/profiler.bpf.o
>   	$(Q)$(RM) -r -- $(OUTPUT)libbpf/
>   	$(call QUIET_CLEAN, core-gen)
>   	$(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool
> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index 1996e67a2f00..39f0f14464ad 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -10,12 +10,16 @@
>   #include <string.h>
>   #include <time.h>
>   #include <unistd.h>
> +#include <signal.h>
>   #include <net/if.h>
>   #include <sys/types.h>
>   #include <sys/stat.h>
> +#include <sys/ioctl.h>
> +#include <sys/syscall.h>
>   
>   #include <linux/err.h>
>   #include <linux/sizes.h>
> +#include <linux/perf_event.h>
>   
>   #include <bpf/bpf.h>
>   #include <bpf/btf.h>
> @@ -1537,6 +1541,425 @@ static int do_loadall(int argc, char **argv)
>   	return load_with_options(argc, argv, false);
>   }
>   
> +#ifdef BPFTOOL_WITHOUT_SKELETONS
> +
> +static int do_profile(int argc, char **argv)
> +{
> +	return 0;
> +}
> +
> +#else /* BPFTOOL_WITHOUT_SKELETONS */
> +
> +#include "profiler.skel.h"
> +
> +#define SAMPLE_PERIOD  0x7fffffffffffffffULL
> +struct profile_metric {
> +	const char *name;
> +	struct perf_event_attr attr;
> +	bool selected;
> +	struct bpf_perf_event_value val;

reverse christmas tree?

> +
> +	/* calculate ratios like instructions per cycle */
> +	const int ratio_metric; /* 0 for N/A, 1 for index 0 (cycles)  */
> +	const float ratio_mul;
> +	const char *ratio_desc;
> +} metrics[] = {
> +	{
> +		.name = "cycles",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HARDWARE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config = PERF_COUNT_HW_CPU_CYCLES,
> +		},

We have some "= 0" attributes, not sure whether we should all list here.
I think most (if not all) of them can be omitted.

> +	},
> +	{
> +		.name = "instructions",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HARDWARE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config = PERF_COUNT_HW_INSTRUCTIONS,
> +		},
> +		.ratio_metric = 1,
> +		.ratio_mul = 1.0,
> +		.ratio_desc = "insn per cycle",
> +	},
> +	{
> +		.name = "l1d_loads",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HW_CACHE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config =
> +				PERF_COUNT_HW_CACHE_L1D |
> +				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
> +				(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
> +		},

why we do not have metric here?

> +	},
> +	{
> +		.name = "llc_misses",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HW_CACHE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config =
> +				PERF_COUNT_HW_CACHE_LL |
> +				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
> +				(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
> +		},
> +		.ratio_metric = 2,
> +		.ratio_mul = 1e6,
> +		.ratio_desc = "LLC misses per million isns",
> +	},
> +};

icache miss and itlb miss might be useful as well as the code will jump
to a different physical page. I think we should addd them. dtlb_miss
probably not a big problem, but it would be good to be an option.

For ratio_metric, we explicitly assign a slot here. Any specific reason?
We can just say this metric *permits* ratio_metric and then ratio_matric
is assigned dynamically by the user command line options?

I am thinking how we could support *all* metrics the underlying system
support based on `perf list`. This can be the future work though.

> +
> +u64 profile_total_count;

static variable?

> +
> +#define MAX_NUM_PROFILE_METRICS 4
> +
> +static int profile_parse_metrics(int argc, char **argv)
> +{
> +	unsigned int metric_cnt;
> +	int selected_cnt = 0;
> +	unsigned int i;
> +
> +	metric_cnt = sizeof(metrics) / sizeof(struct profile_metric);
> +
> +	while (argc > 0) {
> +		for (i = 0; i < metric_cnt; i++) {
> +			if (strcmp(argv[0], metrics[i].name) == 0) {
> +				if (!metrics[i].selected)
> +					selected_cnt++;
> +				metrics[i].selected = true;
> +				break;
> +			}
> +		}
> +		if (i == metric_cnt) {
> +			p_err("unknown metric %s", argv[0]);
> +			return -1;
> +		}
> +		NEXT_ARG();
> +	}
> +	if (selected_cnt > MAX_NUM_PROFILE_METRICS) {
> +		p_err("too many (%d) metrics, please specify no more than %d metrics at at time",
> +		      selected_cnt, MAX_NUM_PROFILE_METRICS);
> +		return -1;
> +	}
> +	return selected_cnt;
> +}
> +
> +static void profile_read_values(struct profiler_bpf *obj)
> +{
> +	u32 m, cpu, num_cpu = obj->rodata->num_cpu;
> +	u64 counts[num_cpu];
> +	int reading_map_fd, count_map_fd;
> +	u32 key = 0;

reverse christmas tree?

> +
> +	reading_map_fd = bpf_map__fd(obj->maps.accum_readings);
> +	count_map_fd = bpf_map__fd(obj->maps.counts);
> +	if (reading_map_fd < 0 || count_map_fd < 0) {
> +		p_err("failed to get fd for map");
> +		return;
> +	}
> +
> +	assert(bpf_map_lookup_elem(count_map_fd, &key, counts) == 0);

In the patch, I see sometime bpf_map_lookup_elem() result is checked
with failure being handled. Sometimes, assert() is used. Maybe be
consistent with checking result approach?

> +	profile_total_count = 0;
> +	for (cpu = 0; cpu < num_cpu; cpu++)
> +		profile_total_count += counts[cpu];
> +
> +	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
> +		struct bpf_perf_event_value values[obj->rodata->num_cpu];

just use ...[num_cpu]

> +
> +		if (!metrics[m].selected)
> +			continue;
> +
> +		assert(bpf_map_lookup_elem(reading_map_fd, &key, values) == 0);

checking return value?

> +		for (cpu = 0; cpu < num_cpu; cpu++) {
> +			metrics[m].val.counter += values[cpu].counter;
> +			metrics[m].val.enabled += values[cpu].enabled;
> +			metrics[m].val.running += values[cpu].running;
> +		}
> +		key++;
> +	}
> +}
> +
> +static void profile_print_readings_json(void)
> +{
> +	u32 m;
> +
> +	jsonw_start_array(json_wtr);
> +	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
> +		if (!metrics[m].selected)
> +			continue;
> +		jsonw_start_object(json_wtr);
> +		jsonw_string_field(json_wtr, "metric", metrics[m].name);
> +		jsonw_lluint_field(json_wtr, "run_cnt", profile_total_count);
> +		jsonw_lluint_field(json_wtr, "value", metrics[m].val.counter);
> +		jsonw_lluint_field(json_wtr, "enabled", metrics[m].val.enabled);
> +		jsonw_lluint_field(json_wtr, "running", metrics[m].val.running);
> +
> +		jsonw_end_object(json_wtr);
> +	}
> +	jsonw_end_array(json_wtr);
> +}
> +
> +static void profile_print_readings_plain(void)
> +{
> +	u32 m;
> +
> +	printf("\n%18lu %-20s\n", profile_total_count, "run_cnt");
> +	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
> +		struct bpf_perf_event_value *val = &metrics[m].val;
> +		int r;
> +
> +		if (!metrics[m].selected)
> +			continue;
> +		printf("%18llu %-20s", val->counter, metrics[m].name);
> +
> +		r = metrics[m].ratio_metric - 1;
> +		if (r >= 0 && metrics[r].selected) {
> +			printf("# %8.2f %-30s",
> +			       val->counter * metrics[m].ratio_mul /
> +			       metrics[r].val.counter,
> +			       metrics[m].ratio_desc);
> +		} else {
> +			printf("%-41s", "");
> +		}
> +
> +		if (val->enabled > val->running)
> +			printf("(%4.2f%%)",
> +			       val->running * 100.0 / val->enabled);
> +		printf("\n");
> +	}
> +}
> +
> +static void profile_print_readings(void)
> +{
> +	if (json_output)
> +		profile_print_readings_json();
> +	else
> +		profile_print_readings_plain();
> +}
> +
> +static void profile_close_perf_events(struct profiler_bpf *obj)
> +{
> +	int map_fd, pmu_fd;
> +	u32 i;
> +
> +	map_fd = bpf_map__fd(obj->maps.events);
> +	if (map_fd < 0) {
> +		p_err("failed to get fd for events map");
> +		return;
> +	}
> +
> +	for (i = 0; i < obj->rodata->num_cpu * obj->rodata->num_metric; i++) {
> +		bpf_map_lookup_elem(map_fd, &i, &pmu_fd);

events is a perf_events array. lookup will always return failure.
So you need remember pmu_fd and free here.

> +		close(pmu_fd);
> +	}
> +}
> +
> +static int profile_open_perf_events(struct profiler_bpf *obj)
> +{
> +	int map_fd, pmu_fd, i, key = 0;
> +	unsigned int cpu, m;
> +
> +	map_fd = bpf_map__fd(obj->maps.events);
> +	if (map_fd < 0) {
> +		p_err("failed to get fd for events map");
> +		return -1;
> +	}
> +
> +	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
> +		if (!metrics[m].selected)
> +			continue;
> +		for (cpu = 0; cpu < obj->rodata->num_cpu; cpu++) {
> +			pmu_fd = syscall(__NR_perf_event_open, &metrics[m].attr,
> +					 -1/*pid*/, cpu, -1/*group_fd*/, 0);
> +			if (pmu_fd < 0 ||
> +			    bpf_map_update_elem(map_fd, &key,
> +						&pmu_fd, BPF_ANY) ||
> +			    ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) {
> +				p_err("failed to create event %s on cpu %d",
> +				      metrics[m].name, cpu);
> +				goto err;
> +			}
> +			key++;
> +		}
> +	}
> +	return 0;
> +err:
> +	for (i = key - 1; i >= 0; i--) {
> +		bpf_map_lookup_elem(map_fd, &i, &pmu_fd);

this won't work. You need to remember previously opened pmu_fd and close 
them.

> +		close(pmu_fd);
> +	}
> +	return -1;
> +}
> +
> +static char *profile_target_name(int tgt_fd)
> +{
> +	struct bpf_prog_info_linear *info_linear;
> +	struct bpf_func_info *func_info;
> +	const struct btf_type *t;
> +	char *name = NULL;
> +	struct btf *btf;
> +
> +	info_linear = bpf_program__get_prog_info_linear(
> +		tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO);
> +	if (IS_ERR_OR_NULL(info_linear)) {
> +		p_err("failed to get info_linear for prog FD %d", tgt_fd);
> +		return NULL;
> +	}
> +
> +	if (info_linear->info.btf_id == 0 ||
> +	    btf__get_from_id(info_linear->info.btf_id, &btf)) {
> +		p_err("prog FD %d doesn't have valid btf", tgt_fd);
> +		goto out;
> +	}
> +
> +	func_info = (struct bpf_func_info *)(info_linear->info.func_info);
> +	t = btf__type_by_id(btf, func_info[0].type_id);
> +	if (!t) {
> +		p_err("btf %d doesn't have type %d",
> +		      info_linear->info.btf_id, func_info[0].type_id);
> +		goto out;
> +	}
> +	name = strdup(btf__name_by_offset(btf, t->name_off));
> +out:
> +	free(info_linear);
> +	return name;
> +}
> +
> +struct profiler_bpf *profile_obj;
> +int profile_tgt_fd = -1;
> +char *profile_tgt_name;

all static variable?

> +
> +static void profile_print_and_cleanup(void)
> +{
> +	profile_close_perf_events(profile_obj);
> +	profiler_bpf__detach(profile_obj);
> +	profile_read_values(profile_obj);
> +	profiler_bpf__destroy(profile_obj);
> +	profile_print_readings();
> +
> +	if (profile_tgt_fd >= 0)
> +		close(profile_tgt_fd);
> +	free(profile_tgt_name);
> +}
> +
> +#define PROFILE_DEFAULT_LONG_DURATION (3600 * 24)

We need to let user know this value in "help" at least.
In "man" page it may be get updated but I think we probably
should add there as well.

> +
> +static void int_exit(int signo)
> +{
> +	profile_print_and_cleanup();
> +	exit(0);
> +}
> +
> +static int do_profile(int argc, char **argv)
> +{
> +	int num_metric, num_cpu, err = -1;
> +	struct bpf_program *prog;
> +	unsigned long duration;
> +	char *endptr;
> +
> +	/* we at least need: <duration>, "id", <id>, <metric> */
> +	if (argc < 4)
> +		usage();
> +
> +	/* parse profiling duration */
> +	duration = strtoul(*argv, &endptr, 0);
> +	if (*endptr)
> +		duration = PROFILE_DEFAULT_LONG_DURATION;
> +	else
> +		NEXT_ARG();
> +
> +	/* parse target fd */
> +	profile_tgt_fd = prog_parse_fd(&argc, &argv);
> +	if (profile_tgt_fd < 0) {
> +		p_err("failed to parse fd");
> +		return -1;
> +	}
> +
> +	num_metric = profile_parse_metrics(argc, argv);
> +	if (num_metric <= 0)
> +		goto out;
> +
> +	num_cpu = libbpf_num_possible_cpus();
> +	if (num_cpu <= 0) {
> +		p_err("failed to identify number of CPUs");
> +		goto out;
> +	}
> +
> +	profile_obj = profiler_bpf__open();
> +	if (!profile_obj) {
> +		p_err("failed to open and/or load BPF object");
> +		goto out;
> +	}
> +
> +	profile_obj->rodata->num_cpu = num_cpu;
> +	profile_obj->rodata->num_metric = num_metric;
> +
> +	/* adjust map sizes */
> +	bpf_map__resize(profile_obj->maps.events, num_metric * num_cpu);
> +	bpf_map__resize(profile_obj->maps.fentry_readings, num_metric);
> +	bpf_map__resize(profile_obj->maps.accum_readings, num_metric);
> +	bpf_map__resize(profile_obj->maps.counts, 1);
> +
> +	/* change target name */
> +	profile_tgt_name = profile_target_name(profile_tgt_fd);
> +	if (!profile_tgt_name) {
> +		p_err("failed to load target function name");
> +		goto out;
> +	}
> +
> +	bpf_object__for_each_program(prog, profile_obj->obj) {
> +		err = bpf_program__set_attach_target(prog, profile_tgt_fd,
> +						     profile_tgt_name);
> +		if (err) {
> +			p_err("failed to set attach target\n");
> +			goto out;
> +		}
> +	}
> +
> +	set_max_rlimit();
> +	err = profiler_bpf__load(profile_obj);
> +	if (err) {
> +		p_err("failed to load profile_obj");
> +		goto out;
> +	}
> +
> +	profile_open_perf_events(profile_obj);
> +
> +	err = profiler_bpf__attach(profile_obj);
> +	if (err) {
> +		p_err("failed to attach profile_obj");
> +		goto out;
> +	}
> +	signal(SIGINT, int_exit);
> +
> +	sleep(duration);
> +	profile_print_and_cleanup();
> +	return 0;
> +out:
> +	close(profile_tgt_fd);
> +	free(profile_tgt_name);

error path handling misses at least destroying profile_obj in certain 
conditions.

> +	return err;
> +}
> +
> +#endif /* BPFTOOL_WITHOUT_SKELETONS */
> +
>   static int do_help(int argc, char **argv)
>   {
>   	if (json_output) {
> @@ -1560,6 +1983,7 @@ static int do_help(int argc, char **argv)
>   		"                         [data_out FILE [data_size_out L]] \\\n"
>   		"                         [ctx_in FILE [ctx_out FILE [ctx_size_out M]]] \\\n"
>   		"                         [repeat N]\n"
> +		"       %s %s profile [DURATION] PROG METRICs\n"
>   		"       %s %s tracelog\n"
>   		"       %s %s help\n"
>   		"\n"
> @@ -1578,11 +2002,12 @@ static int do_help(int argc, char **argv)
>   		"       ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
>   		"                        flow_dissector }\n"
>   		"       " HELP_SPEC_OPTIONS "\n"
> +		"       METRIC := { cycles | instructions | l1d_loads | llc_misses }\n"
>   		"",
>   		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
>   		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
>   		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
> -		bin_name, argv[-2]);
> +		bin_name, argv[-2], bin_name, argv[-2]);
>   
>   	return 0;
>   }
> @@ -1599,6 +2024,7 @@ static const struct cmd cmds[] = {
>   	{ "detach",	do_detach },
>   	{ "tracelog",	do_tracelog },
>   	{ "run",	do_run },
> +	{ "profile",	do_profile },
>   	{ 0 }
>   };
>   
> diff --git a/tools/bpf/bpftool/skeleton/profiler.bpf.c b/tools/bpf/bpftool/skeleton/profiler.bpf.c
> new file mode 100644
> index 000000000000..abd3a7aacc1f
> --- /dev/null
> +++ b/tools/bpf/bpftool/skeleton/profiler.bpf.c
> @@ -0,0 +1,171 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2019 Facebook
> +#include "profiler.h"
> +#include <linux/bpf.h>
> +#include <bpf/bpf_helpers.h>
> +
> +#define ___bpf_concat(a, b) a ## b
> +#define ___bpf_apply(fn, n) ___bpf_concat(fn, n)
> +#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N
> +#define ___bpf_narg(...) \
> +	___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
> +#define ___bpf_empty(...) \
> +	___bpf_nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0)
> +
> +#define ___bpf_ctx_cast0() ctx
> +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0]
> +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1]
> +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2]
> +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3]
> +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4]
> +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5]
> +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6]
> +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7]
> +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8]
> +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9]
> +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10]
> +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11]
> +#define ___bpf_ctx_cast(args...) \
> +	___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args)
> +
> +/*
> + * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and
> + * similar kinds of BPF programs, that accept input arguments as a single
> + * pointer to untyped u64 array, where each u64 can actually be a typed
> + * pointer or integer of different size. Instead of requring user to write
> + * manual casts and work with array elements by index, BPF_PROG macro
> + * allows user to declare a list of named and typed input arguments in the
> + * same syntax as for normal C function. All the casting is hidden and
> + * performed transparently, while user code can just assume working with
> + * function arguments of specified type and name.
> + *
> + * Original raw context argument is preserved as well as 'ctx' argument.
> + * This is useful when using BPF helpers that expect original context
> + * as one of the parameters (e.g., for bpf_perf_event_output()).
> + */
> +#define BPF_PROG(name, args...)						    \
> +name(unsigned long long *ctx);						    \
> +static __always_inline typeof(name(0))					    \
> +____##name(unsigned long long *ctx, ##args);				    \
> +typeof(name(0)) name(unsigned long long *ctx)				    \
> +{									    \
> +	_Pragma("GCC diagnostic push")					    \
> +	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		    \
> +	return ____##name(___bpf_ctx_cast(args));			    \
> +	_Pragma("GCC diagnostic pop")					    \
> +}									    \
> +static __always_inline typeof(name(0))					    \
> +____##name(unsigned long long *ctx, ##args)
> 

I know it is internal. But all the above macros are not great in
a bpf program. If we can reuse/amend current infrastructure.
That will be great. It may benefit users writing a similar
bpf program to here.

+
> +/* map of perf event fds, num_cpu * num_metric entries */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
> +	__uint(key_size, sizeof(u32));
> +	__uint(value_size, sizeof(int));
> +} events SEC(".maps");
> +
> +/* readings at fentry */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(key_size, sizeof(u32));
> +	__uint(value_size, sizeof(struct bpf_perf_event_value));
> +} fentry_readings SEC(".maps");
> +
> +/* accumulated readings */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(key_size, sizeof(u32));
> +	__uint(value_size, sizeof(struct bpf_perf_event_value));
> +} accum_readings SEC(".maps");
> +
> +/* sample counts, one per cpu */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(key_size, sizeof(u32));
> +	__uint(value_size, sizeof(u64));
> +} counts SEC(".maps");
> +
> +const volatile __u32 num_cpu = 1;
> +const volatile __u32 num_metric = 1;
> +#define MAX_NUM_MATRICS 4
> +
> +SEC("fentry/XXX")
> +int BPF_PROG(fentry_XXX)
> +{
> +	struct bpf_perf_event_value *ptrs[MAX_NUM_MATRICS];
> +	u32 key = bpf_get_smp_processor_id();
> +	u32 i;
> +
> +	/* look up before reading, to reduce error */
> +	for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) {
> +		u32 flag = i;
> +
> +		ptrs[i] = bpf_map_lookup_elem(&fentry_readings, &flag);
> +		if (!ptrs[i])
> +			return 0;
> +	}
> +
> +	for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) {
> +		struct bpf_perf_event_value reading;
> +		int err;
> +
> +		err = bpf_perf_event_read_value(&events, key, &reading,
> +						sizeof(reading));
> +		if (err)
> +			return 0;
> +		*(ptrs[i]) = reading;
> +		key += num_cpu;
> +	}
> +
> +	return 0;
> +}
> +
> +static inline void
> +fexit_update_maps(u32 id, struct bpf_perf_event_value *after)
> +{
> +	struct bpf_perf_event_value *before, diff, *accum;
> +
> +	before = bpf_map_lookup_elem(&fentry_readings, &id);
> +	/* only account samples with a valid fentry_reading */
> +	if (before && before->counter) {
> +		struct bpf_perf_event_value *accum;
> +
> +		diff.counter = after->counter - before->counter;
> +		diff.enabled = after->enabled - before->enabled;
> +		diff.running = after->running - before->running;
> +
> +		accum = bpf_map_lookup_elem(&accum_readings, &id);
> +		if (accum) {
> +			accum->counter += diff.counter;
> +			accum->enabled += diff.enabled;
> +			accum->running += diff.running;
> +		}
> +	}
> +}
> +
> +SEC("fexit/XXX")
> +int BPF_PROG(fexit_XXX)
> +{
> +	struct bpf_perf_event_value readings[MAX_NUM_MATRICS];
> +	u32 cpu = bpf_get_smp_processor_id();
> +	u32 i, one = 1, zero = 0;
> +	int err;
> +	u64 *count;
> +
> +	/* read all events before updating the maps, to reduce error */
> +	for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) {
> +		err = bpf_perf_event_read_value(&events, cpu + i * num_cpu,
> +						readings + i, sizeof(*readings));
> +		if (err)
> +			return 0;
> +	}
> +	count = bpf_map_lookup_elem(&counts, &zero);
> +	if (count) {
> +		*count += 1;
> +		for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++)
> +			fexit_update_maps(i, &readings[i]);
> +	}
> +	return 0;
> +}
> +
> +char LICENSE[] SEC("license") = "GPL";
> diff --git a/tools/bpf/bpftool/skeleton/profiler.h b/tools/bpf/bpftool/skeleton/profiler.h
> new file mode 100644
> index 000000000000..ae15cb0c4d43
> --- /dev/null
> +++ b/tools/bpf/bpftool/skeleton/profiler.h
> @@ -0,0 +1,47 @@
> +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
> +#ifndef __PROFILER_H
> +#define __PROFILER_H
> +
> +/* useful typedefs from vimlinux.h */
> +
> +typedef signed char __s8;
> +typedef unsigned char __u8;
> +typedef short int __s16;
> +typedef short unsigned int __u16;
> +typedef int __s32;
> +typedef unsigned int __u32;
> +typedef long long int __s64;
> +typedef long long unsigned int __u64;
> +
> +typedef __s8 s8;
> +typedef __u8 u8;
> +typedef __s16 s16;
> +typedef __u16 u16;
> +typedef __s32 s32;
> +typedef __u32 u32;
> +typedef __s64 s64;
> +typedef __u64 u64;
> +
> +enum {
> +	false = 0,
> +	true = 1,
> +};
> +
> +#ifdef __CHECKER__
> +#define __bitwise__ __attribute__((bitwise))
> +#else
> +#define __bitwise__
> +#endif
> +#define __bitwise __bitwise__
> +
> +typedef __u16 __bitwise __le16;
> +typedef __u16 __bitwise __be16;
> +typedef __u32 __bitwise __le32;
> +typedef __u32 __bitwise __be32;
> +typedef __u64 __bitwise __le64;
> +typedef __u64 __bitwise __be64;
> +
> +typedef __u16 __bitwise __sum16;
> +typedef __u32 __bitwise __wsum;
> +
> +#endif /* __PROFILER_H */
> diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
> index ded7a950dc40..59f31f01cb93 100644
> --- a/tools/scripts/Makefile.include
> +++ b/tools/scripts/Makefile.include
> @@ -106,6 +106,7 @@ ifneq ($(silent),1)
>     ifneq ($(V),1)
>   	QUIET_CC       = @echo '  CC       '$@;
>   	QUIET_CC_FPIC  = @echo '  CC FPIC  '$@;
> +	QUIET_CLANG    = @echo '  CLANG    '$@;
>   	QUIET_AR       = @echo '  AR       '$@;
>   	QUIET_LINK     = @echo '  LINK     '$@;
>   	QUIET_MKDIR    = @echo '  MKDIR    '$@;
>
Yonghong Song March 2, 2020, 4:26 a.m. UTC | #7
On 3/1/20 7:42 PM, Yonghong Song wrote:
> 
> 
> On 3/1/20 2:37 PM, Song Liu wrote:
>>
>>
>>> On Feb 29, 2020, at 7:52 PM, Yonghong Song <yhs@fb.com> wrote:
>>>
>>>
>>>
>>> On 2/28/20 3:40 PM, Song Liu wrote:
>>>> With fentry/fexit programs, it is possible to profile BPF program with
>>>> hardware counters. Introduce bpftool "prog profile", which measures key
>>>> metrics of a BPF program.
>>>> bpftool prog profile command creates per-cpu perf events. Then it 
>>>> attaches
>>>> fentry/fexit programs to the target BPF program. The fentry program 
>>>> saves
>>>> perf event value to a map. The fexit program reads the perf event 
>>>> again,
>>>> and calculates the difference, which is the instructions/cycles used by
>>>> the target program.
>>>> Example input and output:
>>>>    ./bpftool prog profile 3 id 337 cycles instructions llc_misses
>>>>          4228 run_cnt
>>>>       3403698 cycles                                              
>>>> (84.08%)
>>>>       3525294 instructions   #  1.04 insn per cycle               
>>>> (84.05%)
>>>>            13 llc_misses     #  3.69 LLC misses per million isns  
>>>> (83.50%)
>>>> This command measures cycles and instructions for BPF program with id
>>>> 337 for 3 seconds. The program has triggered 4228 times. The rest of 
>>>> the
>>>> output is similar to perf-stat. In this example, the counters were only
>>>> counting ~84% of the time because of time multiplexing of perf 
>>>> counters.
>>>> Note that, this approach measures cycles and instructions in very small
>>>> increments. So the fentry/fexit programs introduce noticeable errors to
>>>> the measurement results.
>>>> The fentry/fexit programs are generated with BPF skeletons. 
>>>> Therefore, we
>>>> build bpftool twice. The first time _bpftool is built without 
>>>> skeletons.
>>>> Then, _bpftool is used to generate the skeletons. The second time, 
>>>> bpftool
>>>> is built with skeletons.
>>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>>>> ---
>>>>   tools/bpf/bpftool/Makefile                |  18 +
>>>>   tools/bpf/bpftool/prog.c                  | 428 
>>>> +++++++++++++++++++++-
>>>>   tools/bpf/bpftool/skeleton/profiler.bpf.c | 171 +++++++++
>>>>   tools/bpf/bpftool/skeleton/profiler.h     |  47 +++
>>>>   tools/scripts/Makefile.include            |   1 +
>>>>   5 files changed, 664 insertions(+), 1 deletion(-)
>>>>   create mode 100644 tools/bpf/bpftool/skeleton/profiler.bpf.c
>>>>   create mode 100644 tools/bpf/bpftool/skeleton/profiler.h
>>>> diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
>>>> index c4e810335810..c035fc107027 100644
>>>> --- a/tools/bpf/bpftool/Makefile
>>>> +++ b/tools/bpf/bpftool/Makefile
>>>> @@ -59,6 +59,7 @@ LIBS = $(LIBBPF) -lelf -lz
>>>>     INSTALL ?= install
>>>>   RM ?= rm -f
>>>> +CLANG ?= clang
>>>>     FEATURE_USER = .bpftool
>>>>   FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
>>>> @@ -110,6 +111,22 @@ SRCS += $(BFD_SRCS)
>>>>   endif
>>>>     OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
>>>> +_OBJS = $(filter-out $(OUTPUT)prog.o,$(OBJS)) $(OUTPUT)_prog.o
>>>> +
>>>> +$(OUTPUT)_prog.o: prog.c
>>>> +    $(QUIET_CC)$(COMPILE.c) -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $<
>>>> +
>>>> +$(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF)
>>>> +    $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(_OBJS) $(LIBS)
>>>> +
>>>> +skeleton/profiler.bpf.o: skeleton/profiler.bpf.c
>>>> +    $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $< -o $@
>>>
>>> With a fresh checkout, applying this patch and just selftests like
>>>   make -C tools/testing/selftests/bpf
>>>
>>> I got the following build error:
>>>
>>> make[2]: Leaving directory `/data/users/yhs/work/net-next/tools/lib/bpf'
>>> clang -g -O2 -target bpf -c skeleton/profiler.bpf.c -o 
>>> skeleton/profiler.bpf.o
>>> skeleton/profiler.bpf.c:5:10: fatal error: 'bpf/bpf_helpers.h' file 
>>> not found
>>> #include <bpf/bpf_helpers.h>
>>>          ^~~~~~~~~~~~~~~~~~~
>>> 1 error generated.
>>> make[1]: *** [skeleton/profiler.bpf.o] Error 1
>>>
>>> I think Makefile should be tweaked to avoid selftest failure.
>>
>> Hmm... I am not seeing this error. The build succeeded in the test.
> 
> Just tried again with a *fresh* checkout of tools/ directory with the 
> patch.
>    -bash-4.4$ make -C tools/testing/selftests/bpf
>    ...
>    LINK 
> /data/users/yhs/work/net-next/tools/testing/selftests/bpf/tools/build/bpftool//libbpf/libbpf.a 
> 
>    LINK 
> /data/users/yhs/work/net-next/tools/testing/selftests/bpf/tools/build/bpftool/_bpftool 
> 
> make[1]: *** No rule to make target `skeleton/profiler.bpf.c', needed by 
> `skeleton/profiler.bpf.o'.  Stop.
> make: *** 
> [/data/users/yhs/work/net-next/tools/testing/selftests/bpf/tools/sbin/bpftool] 
> Error 2
> make: Leaving directory 
> `/data/users/yhs/work/net-next/tools/testing/selftests/bpf
> 
> The error is different from my previous try, but the build still fails.

FYI. The following change fixed build in my env.

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index c035fc107027..20a90d8450f8 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -120,7 +120,7 @@ $(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF)
         $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(_OBJS) $(LIBS)

  skeleton/profiler.bpf.o: skeleton/profiler.bpf.c
-       $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $< -o $@
+       $(QUIET_CLANG)$(CLANG) -I$(srctree)/tools/lib -g -O2 -target bpf 
-c $< -o $@

  profiler.skel.h: $(OUTPUT)_bpftool skeleton/profiler.bpf.o
         $(QUIET_GEN)$(OUTPUT)./_bpftool gen skeleton 
skeleton/profiler.bpf.o > $@

> 
>>
>> Thanks,
>> Song
>>
Quentin Monnet March 2, 2020, 1:36 p.m. UTC | #8
Hi Song,

Thanks for this work! Some remarks (mostly nitpicks) inline.

2020-02-28 15:40 UTC-0800 ~ Song Liu <songliubraving@fb.com>
> With fentry/fexit programs, it is possible to profile BPF program with
> hardware counters. Introduce bpftool "prog profile", which measures key
> metrics of a BPF program.
> 
> bpftool prog profile command creates per-cpu perf events. Then it attaches
> fentry/fexit programs to the target BPF program. The fentry program saves
> perf event value to a map. The fexit program reads the perf event again,
> and calculates the difference, which is the instructions/cycles used by
> the target program.
> 
> Example input and output:
> 
>   ./bpftool prog profile 3 id 337 cycles instructions llc_misses
> 
>         4228 run_cnt
>      3403698 cycles                                              (84.08%)
>      3525294 instructions   #  1.04 insn per cycle               (84.05%)
>           13 llc_misses     #  3.69 LLC misses per million isns  (83.50%)
> 
> This command measures cycles and instructions for BPF program with id
> 337 for 3 seconds. The program has triggered 4228 times. The rest of the
> output is similar to perf-stat. In this example, the counters were only
> counting ~84% of the time because of time multiplexing of perf counters.
> 
> Note that, this approach measures cycles and instructions in very small
> increments. So the fentry/fexit programs introduce noticeable errors to
> the measurement results.
> 
> The fentry/fexit programs are generated with BPF skeletons. Therefore, we
> build bpftool twice. The first time _bpftool is built without skeletons.
> Then, _bpftool is used to generate the skeletons. The second time, bpftool
> is built with skeletons.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---

[...]

> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index 1996e67a2f00..39f0f14464ad 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -10,12 +10,16 @@
>  #include <string.h>
>  #include <time.h>
>  #include <unistd.h>
> +#include <signal.h>
>  #include <net/if.h>
>  #include <sys/types.h>
>  #include <sys/stat.h>
> +#include <sys/ioctl.h>
> +#include <sys/syscall.h>
>  
>  #include <linux/err.h>
>  #include <linux/sizes.h>
> +#include <linux/perf_event.h>

Nit: Could you please keep the includes sorted alphabetically in each
category (standard, sys/, linux/)?

>  
>  #include <bpf/bpf.h>
>  #include <bpf/btf.h>
> @@ -1537,6 +1541,425 @@ static int do_loadall(int argc, char **argv)
>  	return load_with_options(argc, argv, false);
>  }
>  
> +#ifdef BPFTOOL_WITHOUT_SKELETONS
> +
> +static int do_profile(int argc, char **argv)
> +{
> +	return 0;
> +}
> +
> +#else /* BPFTOOL_WITHOUT_SKELETONS */
> +
> +#include "profiler.skel.h"
> +
> +#define SAMPLE_PERIOD  0x7fffffffffffffffULL
> +struct profile_metric {
> +	const char *name;
> +	struct perf_event_attr attr;
> +	bool selected;
> +	struct bpf_perf_event_value val;
> +
> +	/* calculate ratios like instructions per cycle */
> +	const int ratio_metric; /* 0 for N/A, 1 for index 0 (cycles)  */

Nit: Double space "  */"

> +	const float ratio_mul;
> +	const char *ratio_desc;
> +} metrics[] = {
> +	{
> +		.name = "cycles",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HARDWARE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config = PERF_COUNT_HW_CPU_CYCLES,
> +		},
> +	},
> +	{
> +		.name = "instructions",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HARDWARE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config = PERF_COUNT_HW_INSTRUCTIONS,
> +		},
> +		.ratio_metric = 1,
> +		.ratio_mul = 1.0,
> +		.ratio_desc = "insn per cycle",
> +	},
> +	{
> +		.name = "l1d_loads",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HW_CACHE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config =
> +				PERF_COUNT_HW_CACHE_L1D |
> +				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
> +				(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
> +		},
> +	},
> +	{
> +		.name = "llc_misses",
> +		.attr = {
> +			.freq = 0,
> +			.sample_period = SAMPLE_PERIOD,
> +			.inherit = 0,
> +			.type = PERF_TYPE_HW_CACHE,
> +			.read_format = 0,
> +			.sample_type = 0,
> +			.config =
> +				PERF_COUNT_HW_CACHE_LL |
> +				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
> +				(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
> +		},
> +		.ratio_metric = 2,
> +		.ratio_mul = 1e6,
> +		.ratio_desc = "LLC misses per million isns",

s/isns/insns/, to remain consistent with "insn per cycle"?
(And if you change it, please update the example in man page)

> +	},
> +};
> +
> +u64 profile_total_count;

We should probably stick to __u64 (and __u32, etc.) in bpftool.

> +
> +#define MAX_NUM_PROFILE_METRICS 4
> +
> +static int profile_parse_metrics(int argc, char **argv)
> +{
> +	unsigned int metric_cnt;
> +	int selected_cnt = 0;
> +	unsigned int i;
> +
> +	metric_cnt = sizeof(metrics) / sizeof(struct profile_metric);
> +
> +	while (argc > 0) {
> +		for (i = 0; i < metric_cnt; i++) {
> +			if (strcmp(argv[0], metrics[i].name) == 0) {

Could we use prefixing (is_prefix() instead of strcmp()), as for the
other bpftool commands and keywords?

> +				if (!metrics[i].selected)
> +					selected_cnt++;
> +				metrics[i].selected = true;
> +				break;
> +			}
> +		}
> +		if (i == metric_cnt) {
> +			p_err("unknown metric %s", argv[0]);
> +			return -1;
> +		}
> +		NEXT_ARG();
> +	}
> +	if (selected_cnt > MAX_NUM_PROFILE_METRICS) {
> +		p_err("too many (%d) metrics, please specify no more than %d metrics at at time",
> +		      selected_cnt, MAX_NUM_PROFILE_METRICS);
> +		return -1;
> +	}
> +	return selected_cnt;
> +}

[...]

> +
> +static int do_profile(int argc, char **argv)
> +{
> +	int num_metric, num_cpu, err = -1;
> +	struct bpf_program *prog;
> +	unsigned long duration;
> +	char *endptr;
> +
> +	/* we at least need: <duration>, "id", <id>, <metric> */

Not necessarily "id" (can be "tag", "pinned", "name") but ok, probably
doesn't matter in the comment.

More important: I thought "duration" was optional?

> +	if (argc < 4)
> +		usage();

Or, as in other subcommands:

	if (!REQ_ARGS(4))
		return -EINVAL;

> +
> +	/* parse profiling duration */
> +	duration = strtoul(*argv, &endptr, 0);
> +	if (*endptr)
> +		duration = PROFILE_DEFAULT_LONG_DURATION;
> +	else
> +		NEXT_ARG();

If the field is optional, maybe it could be interesting to add a keyword
to the command line before passing it ("duration <n>"). It may turn
helpful if we add other optional parameters in the future, to know which
one we are passing (and might even help with bash completion).

> +
> +	/* parse target fd */
> +	profile_tgt_fd = prog_parse_fd(&argc, &argv);
> +	if (profile_tgt_fd < 0) {
> +		p_err("failed to parse fd");
> +		return -1;
> +	}

I would be tempted to have program handle _before_ duration on the
command line, what do you think?

    bpftool prog profile <mandatory_prog> <profile_options> <metrics>

> +
> +	num_metric = profile_parse_metrics(argc, argv);
> +	if (num_metric <= 0)
> +		goto out;
> +
> +	num_cpu = libbpf_num_possible_cpus();
> +	if (num_cpu <= 0) {
> +		p_err("failed to identify number of CPUs");
> +		goto out;
> +	}
> +
> +	profile_obj = profiler_bpf__open();
> +	if (!profile_obj) {
> +		p_err("failed to open and/or load BPF object");
> +		goto out;
> +	}
> +
> +	profile_obj->rodata->num_cpu = num_cpu;
> +	profile_obj->rodata->num_metric = num_metric;
> +
> +	/* adjust map sizes */
> +	bpf_map__resize(profile_obj->maps.events, num_metric * num_cpu);
> +	bpf_map__resize(profile_obj->maps.fentry_readings, num_metric);
> +	bpf_map__resize(profile_obj->maps.accum_readings, num_metric);
> +	bpf_map__resize(profile_obj->maps.counts, 1);
> +
> +	/* change target name */
> +	profile_tgt_name = profile_target_name(profile_tgt_fd);
> +	if (!profile_tgt_name) {
> +		p_err("failed to load target function name");
> +		goto out;
> +	}
> +
> +	bpf_object__for_each_program(prog, profile_obj->obj) {
> +		err = bpf_program__set_attach_target(prog, profile_tgt_fd,
> +						     profile_tgt_name);
> +		if (err) {
> +			p_err("failed to set attach target\n");
> +			goto out;
> +		}
> +	}
> +
> +	set_max_rlimit();
> +	err = profiler_bpf__load(profile_obj);
> +	if (err) {
> +		p_err("failed to load profile_obj");
> +		goto out;
> +	}
> +
> +	profile_open_perf_events(profile_obj);
> +
> +	err = profiler_bpf__attach(profile_obj);
> +	if (err) {
> +		p_err("failed to attach profile_obj");
> +		goto out;
> +	}
> +	signal(SIGINT, int_exit);
> +
> +	sleep(duration);
> +	profile_print_and_cleanup();
> +	return 0;
> +out:
> +	close(profile_tgt_fd);
> +	free(profile_tgt_name);
> +	return err;
> +}
> +
> +#endif /* BPFTOOL_WITHOUT_SKELETONS */
> +
>  static int do_help(int argc, char **argv)
>  {
>  	if (json_output) {
> @@ -1560,6 +1983,7 @@ static int do_help(int argc, char **argv)
>  		"                         [data_out FILE [data_size_out L]] \\\n"
>  		"                         [ctx_in FILE [ctx_out FILE [ctx_size_out M]]] \\\n"
>  		"                         [repeat N]\n"
> +		"       %s %s profile [DURATION] PROG METRICs\n"
>  		"       %s %s tracelog\n"
>  		"       %s %s help\n"
>  		"\n"
> @@ -1578,11 +2002,12 @@ static int do_help(int argc, char **argv)
>  		"       ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
>  		"                        flow_dissector }\n"
>  		"       " HELP_SPEC_OPTIONS "\n"
> +		"       METRIC := { cycles | instructions | l1d_loads | llc_misses }\n"

Can we keep HELP_SPEC_OPTIONS at the end (below METRICS) please?

>  		"",
>  		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
>  		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
>  		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
> -		bin_name, argv[-2]);
> +		bin_name, argv[-2], bin_name, argv[-2]);
>  
>  	return 0;
>  }
> @@ -1599,6 +2024,7 @@ static const struct cmd cmds[] = {
>  	{ "detach",	do_detach },
>  	{ "tracelog",	do_tracelog },
>  	{ "run",	do_run },
> +	{ "profile",	do_profile },
>  	{ 0 }
>  };
>  
> diff --git a/tools/bpf/bpftool/skeleton/profiler.bpf.c b/tools/bpf/bpftool/skeleton/profiler.bpf.c
> new file mode 100644
> index 000000000000..abd3a7aacc1f
> --- /dev/null
> +++ b/tools/bpf/bpftool/skeleton/profiler.bpf.c
> @@ -0,0 +1,171 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2019 Facebook

2019?

> +#include "profiler.h"
> +#include <linux/bpf.h>
> +#include <bpf/bpf_helpers.h>

[...]

> diff --git a/tools/bpf/bpftool/skeleton/profiler.h b/tools/bpf/bpftool/skeleton/profiler.h
> new file mode 100644
> index 000000000000..ae15cb0c4d43
> --- /dev/null
> +++ b/tools/bpf/bpftool/skeleton/profiler.h
> @@ -0,0 +1,47 @@
> +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
> +#ifndef __PROFILER_H
> +#define __PROFILER_H
> +
> +/* useful typedefs from vimlinux.h */

Typo: vimlinux.h
Song Liu March 2, 2020, 4:52 p.m. UTC | #9
> On Mar 2, 2020, at 5:36 AM, Quentin Monnet <quentin@isovalent.com> wrote:
> 
> Hi Song,
> 
> Thanks for this work! Some remarks (mostly nitpicks) inline.
> 
> 2020-02-28 15:40 UTC-0800 ~ Song Liu <songliubraving@fb.com>
>> With fentry/fexit programs, it is possible to profile BPF program with
>> hardware counters. Introduce bpftool "prog profile", which measures key
>> metrics of a BPF program.
>> 
>> bpftool prog profile command creates per-cpu perf events. Then it attaches
>> fentry/fexit programs to the target BPF program. The fentry program saves
>> perf event value to a map. The fexit program reads the perf event again,
>> and calculates the difference, which is the instructions/cycles used by
>> the target program.
>> 
>> Example input and output:
>> 
>>  ./bpftool prog profile 3 id 337 cycles instructions llc_misses
>> 
>>        4228 run_cnt
>>     3403698 cycles                                              (84.08%)
>>     3525294 instructions   #  1.04 insn per cycle               (84.05%)
>>          13 llc_misses     #  3.69 LLC misses per million isns  (83.50%)
>> 
>> This command measures cycles and instructions for BPF program with id
>> 337 for 3 seconds. The program has triggered 4228 times. The rest of the
>> output is similar to perf-stat. In this example, the counters were only
>> counting ~84% of the time because of time multiplexing of perf counters.
>> 
>> Note that, this approach measures cycles and instructions in very small
>> increments. So the fentry/fexit programs introduce noticeable errors to
>> the measurement results.
>> 
>> The fentry/fexit programs are generated with BPF skeletons. Therefore, we
>> build bpftool twice. The first time _bpftool is built without skeletons.
>> Then, _bpftool is used to generate the skeletons. The second time, bpftool
>> is built with skeletons.
>> 
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---

Thanks Yonghong and Quentin! I will fix these and send v3. 

Song
Song Liu March 3, 2020, 12:10 a.m. UTC | #10
> On Mar 1, 2020, at 8:24 PM, Yonghong Song <yhs@fb.com> wrote:
>> +	},
>> +	{
>> +		.name = "instructions",
>> +		.attr = {
>> +			.freq = 0,
>> +			.sample_period = SAMPLE_PERIOD,
>> +			.inherit = 0,
>> +			.type = PERF_TYPE_HARDWARE,
>> +			.read_format = 0,
>> +			.sample_type = 0,
>> +			.config = PERF_COUNT_HW_INSTRUCTIONS,
>> +		},
>> +		.ratio_metric = 1,
>> +		.ratio_mul = 1.0,
>> +		.ratio_desc = "insn per cycle",
>> +	},
>> +	{
>> +		.name = "l1d_loads",
>> +		.attr = {
>> +			.freq = 0,
>> +			.sample_period = SAMPLE_PERIOD,
>> +			.inherit = 0,
>> +			.type = PERF_TYPE_HW_CACHE,
>> +			.read_format = 0,
>> +			.sample_type = 0,
>> +			.config =
>> +				PERF_COUNT_HW_CACHE_L1D |
>> +				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
>> +				(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
>> +		},
> 
> why we do not have metric here?

This follows perf-stat design: some events have another event to compare 
against, like instructions per cycle, etc. 

> 
>> +	},
>> +	{
>> +		.name = "llc_misses",
>> +		.attr = {
>> +			.freq = 0,
>> +			.sample_period = SAMPLE_PERIOD,
>> +			.inherit = 0,
>> +			.type = PERF_TYPE_HW_CACHE,
>> +			.read_format = 0,
>> +			.sample_type = 0,
>> +			.config =
>> +				PERF_COUNT_HW_CACHE_LL |
>> +				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
>> +				(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
>> +		},
>> +		.ratio_metric = 2,
>> +		.ratio_mul = 1e6,
>> +		.ratio_desc = "LLC misses per million isns",
>> +	},
>> +};
> 
> icache miss and itlb miss might be useful as well as the code will jump
> to a different physical page. I think we should addd them. dtlb_miss
> probably not a big problem, but it would be good to be an option.

I plan to add more events later on. 

> 
> For ratio_metric, we explicitly assign a slot here. Any specific reason?
> We can just say this metric *permits* ratio_metric and then ratio_matric
> is assigned dynamically by the user command line options?
> 
> I am thinking how we could support *all* metrics the underlying system
> support based on `perf list`. This can be the future work though.

We are also thinking about adding similar functionality to perf-stat, 
which will be more flexible. 

> 
>> +
>> +u64 profile_total_count;
> 
[...]
>> +
>> +	reading_map_fd = bpf_map__fd(obj->maps.accum_readings);
>> +	count_map_fd = bpf_map__fd(obj->maps.counts);
>> +	if (reading_map_fd < 0 || count_map_fd < 0) {
>> +		p_err("failed to get fd for map");
>> +		return;
>> +	}
>> +
>> +	assert(bpf_map_lookup_elem(count_map_fd, &key, counts) == 0);
> 
> In the patch, I see sometime bpf_map_lookup_elem() result is checked
> with failure being handled. Sometimes, assert() is used. Maybe be
> consistent with checking result approach?

Will fix. 

[...]
> 
>> 
>> +}
>> +
>> +#define PROFILE_DEFAULT_LONG_DURATION (3600 * 24)
> 
> We need to let user know this value in "help" at least.
> In "man" page it may be get updated but I think we probably
> should add there as well.

I am planning to just use UINT_MAX. 

> 
>> +
[...]

>> +#define BPF_PROG(name, args...)						    \
>> +name(unsigned long long *ctx);						    \
>> +static __always_inline typeof(name(0))					    \
>> +____##name(unsigned long long *ctx, ##args);				    \
>> +typeof(name(0)) name(unsigned long long *ctx)				    \
>> +{									    \
>> +	_Pragma("GCC diagnostic push")					    \
>> +	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		    \
>> +	return ____##name(___bpf_ctx_cast(args));			    \
>> +	_Pragma("GCC diagnostic pop")					    \
>> +}									    \
>> +static __always_inline typeof(name(0))					    \
>> +____##name(unsigned long long *ctx, ##args)
> 
> I know it is internal. But all the above macros are not great in
> a bpf program. If we can reuse/amend current infrastructure.
> That will be great. It may benefit users writing a similar
> bpf program to here.

This is copied from tools/testing/selftests/bpf/bpf_trace_helpers.h. 
I think we will move them to libbpf later. Then we can use that 
version instead. 

Thanks,
Song
Arnaldo Carvalho de Melo March 3, 2020, 1:06 p.m. UTC | #11
Em Tue, Mar 03, 2020 at 12:10:50AM +0000, Song Liu escreveu:
> > On Mar 1, 2020, at 8:24 PM, Yonghong Song <yhs@fb.com> wrote:
> >> +	},
> >> +	{
> >> +		.name = "instructions",
> >> +		.attr = {
> >> +			.freq = 0,
> >> +			.sample_period = SAMPLE_PERIOD,
> >> +			.inherit = 0,
> >> +			.type = PERF_TYPE_HARDWARE,
> >> +			.read_format = 0,
> >> +			.sample_type = 0,
> >> +			.config = PERF_COUNT_HW_INSTRUCTIONS,
> >> +		},
> >> +		.ratio_metric = 1,
> >> +		.ratio_mul = 1.0,
> >> +		.ratio_desc = "insn per cycle",
> >> +	},
> >> +	{
> >> +		.name = "l1d_loads",
> >> +		.attr = {
> >> +			.freq = 0,
> >> +			.sample_period = SAMPLE_PERIOD,
> >> +			.inherit = 0,
> >> +			.type = PERF_TYPE_HW_CACHE,
> >> +			.read_format = 0,
> >> +			.sample_type = 0,
> >> +			.config =
> >> +				PERF_COUNT_HW_CACHE_L1D |
> >> +				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
> >> +				(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
> >> +		},
> > 
> > why we do not have metric here?
> 
> This follows perf-stat design: some events have another event to compare 
> against, like instructions per cycle, etc. 
> 
> > 
> >> +	},
> >> +	{
> >> +		.name = "llc_misses",
> >> +		.attr = {
> >> +			.freq = 0,
> >> +			.sample_period = SAMPLE_PERIOD,
> >> +			.inherit = 0,
> >> +			.type = PERF_TYPE_HW_CACHE,
> >> +			.read_format = 0,
> >> +			.sample_type = 0,
> >> +			.config =
> >> +				PERF_COUNT_HW_CACHE_LL |
> >> +				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
> >> +				(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
> >> +		},
> >> +		.ratio_metric = 2,
> >> +		.ratio_mul = 1e6,
> >> +		.ratio_desc = "LLC misses per million isns",
> >> +	},
> >> +};

> > icache miss and itlb miss might be useful as well as the code will jump
> > to a different physical page. I think we should addd them. dtlb_miss
> > probably not a big problem, but it would be good to be an option.

> I plan to add more events later on. 

> > For ratio_metric, we explicitly assign a slot here. Any specific reason?
> > We can just say this metric *permits* ratio_metric and then ratio_matric
> > is assigned dynamically by the user command line options?

> > I am thinking how we could support *all* metrics the underlying system
> > support based on `perf list`. This can be the future work though.
 
> We are also thinking about adding similar functionality to perf-stat, 
> which will be more flexible. 

Yeah, being able to count events bpf programs using the technique you're
using here but instead using 'perf stat' to set it up and then use what
is already in 'perf stat' would be really great, having the same
interface for BPF programs as we have for tid, pid, cgroups, system
wide, etc.

If you try it and find any problems with the codebase I'll be happy to
help as I think others working with 'perf stat' will too,

Cheers,

- Arnaldo
diff mbox series

Patch

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index c4e810335810..c035fc107027 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -59,6 +59,7 @@  LIBS = $(LIBBPF) -lelf -lz
 
 INSTALL ?= install
 RM ?= rm -f
+CLANG ?= clang
 
 FEATURE_USER = .bpftool
 FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
@@ -110,6 +111,22 @@  SRCS += $(BFD_SRCS)
 endif
 
 OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
+_OBJS = $(filter-out $(OUTPUT)prog.o,$(OBJS)) $(OUTPUT)_prog.o
+
+$(OUTPUT)_prog.o: prog.c
+	$(QUIET_CC)$(COMPILE.c) -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $<
+
+$(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF)
+	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(_OBJS) $(LIBS)
+
+skeleton/profiler.bpf.o: skeleton/profiler.bpf.c
+	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $< -o $@
+
+profiler.skel.h: $(OUTPUT)_bpftool skeleton/profiler.bpf.o
+	$(QUIET_GEN)$(OUTPUT)./_bpftool gen skeleton skeleton/profiler.bpf.o > $@
+
+$(OUTPUT)prog.o: prog.c profiler.skel.h
+	$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
 
 $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
 	$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
@@ -125,6 +142,7 @@  $(OUTPUT)%.o: %.c
 clean: $(LIBBPF)-clean
 	$(call QUIET_CLEAN, bpftool)
 	$(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+	$(Q)$(RM) -- $(OUTPUT)_bpftool profiler.skel.h skeleton/profiler.bpf.o
 	$(Q)$(RM) -r -- $(OUTPUT)libbpf/
 	$(call QUIET_CLEAN, core-gen)
 	$(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 1996e67a2f00..39f0f14464ad 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -10,12 +10,16 @@ 
 #include <string.h>
 #include <time.h>
 #include <unistd.h>
+#include <signal.h>
 #include <net/if.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
 
 #include <linux/err.h>
 #include <linux/sizes.h>
+#include <linux/perf_event.h>
 
 #include <bpf/bpf.h>
 #include <bpf/btf.h>
@@ -1537,6 +1541,425 @@  static int do_loadall(int argc, char **argv)
 	return load_with_options(argc, argv, false);
 }
 
+#ifdef BPFTOOL_WITHOUT_SKELETONS
+
+static int do_profile(int argc, char **argv)
+{
+	return 0;
+}
+
+#else /* BPFTOOL_WITHOUT_SKELETONS */
+
+#include "profiler.skel.h"
+
+#define SAMPLE_PERIOD  0x7fffffffffffffffULL
+struct profile_metric {
+	const char *name;
+	struct perf_event_attr attr;
+	bool selected;
+	struct bpf_perf_event_value val;
+
+	/* calculate ratios like instructions per cycle */
+	const int ratio_metric; /* 0 for N/A, 1 for index 0 (cycles)  */
+	const float ratio_mul;
+	const char *ratio_desc;
+} metrics[] = {
+	{
+		.name = "cycles",
+		.attr = {
+			.freq = 0,
+			.sample_period = SAMPLE_PERIOD,
+			.inherit = 0,
+			.type = PERF_TYPE_HARDWARE,
+			.read_format = 0,
+			.sample_type = 0,
+			.config = PERF_COUNT_HW_CPU_CYCLES,
+		},
+	},
+	{
+		.name = "instructions",
+		.attr = {
+			.freq = 0,
+			.sample_period = SAMPLE_PERIOD,
+			.inherit = 0,
+			.type = PERF_TYPE_HARDWARE,
+			.read_format = 0,
+			.sample_type = 0,
+			.config = PERF_COUNT_HW_INSTRUCTIONS,
+		},
+		.ratio_metric = 1,
+		.ratio_mul = 1.0,
+		.ratio_desc = "insn per cycle",
+	},
+	{
+		.name = "l1d_loads",
+		.attr = {
+			.freq = 0,
+			.sample_period = SAMPLE_PERIOD,
+			.inherit = 0,
+			.type = PERF_TYPE_HW_CACHE,
+			.read_format = 0,
+			.sample_type = 0,
+			.config =
+				PERF_COUNT_HW_CACHE_L1D |
+				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
+				(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
+		},
+	},
+	{
+		.name = "llc_misses",
+		.attr = {
+			.freq = 0,
+			.sample_period = SAMPLE_PERIOD,
+			.inherit = 0,
+			.type = PERF_TYPE_HW_CACHE,
+			.read_format = 0,
+			.sample_type = 0,
+			.config =
+				PERF_COUNT_HW_CACHE_LL |
+				(PERF_COUNT_HW_CACHE_OP_READ << 8) |
+				(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
+		},
+		.ratio_metric = 2,
+		.ratio_mul = 1e6,
+		.ratio_desc = "LLC misses per million isns",
+	},
+};
+
+u64 profile_total_count;
+
+#define MAX_NUM_PROFILE_METRICS 4
+
+static int profile_parse_metrics(int argc, char **argv)
+{
+	unsigned int metric_cnt;
+	int selected_cnt = 0;
+	unsigned int i;
+
+	metric_cnt = sizeof(metrics) / sizeof(struct profile_metric);
+
+	while (argc > 0) {
+		for (i = 0; i < metric_cnt; i++) {
+			if (strcmp(argv[0], metrics[i].name) == 0) {
+				if (!metrics[i].selected)
+					selected_cnt++;
+				metrics[i].selected = true;
+				break;
+			}
+		}
+		if (i == metric_cnt) {
+			p_err("unknown metric %s", argv[0]);
+			return -1;
+		}
+		NEXT_ARG();
+	}
+	if (selected_cnt > MAX_NUM_PROFILE_METRICS) {
+		p_err("too many (%d) metrics, please specify no more than %d metrics at at time",
+		      selected_cnt, MAX_NUM_PROFILE_METRICS);
+		return -1;
+	}
+	return selected_cnt;
+}
+
+static void profile_read_values(struct profiler_bpf *obj)
+{
+	u32 m, cpu, num_cpu = obj->rodata->num_cpu;
+	u64 counts[num_cpu];
+	int reading_map_fd, count_map_fd;
+	u32 key = 0;
+
+	reading_map_fd = bpf_map__fd(obj->maps.accum_readings);
+	count_map_fd = bpf_map__fd(obj->maps.counts);
+	if (reading_map_fd < 0 || count_map_fd < 0) {
+		p_err("failed to get fd for map");
+		return;
+	}
+
+	assert(bpf_map_lookup_elem(count_map_fd, &key, counts) == 0);
+	profile_total_count = 0;
+	for (cpu = 0; cpu < num_cpu; cpu++)
+		profile_total_count += counts[cpu];
+
+	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
+		struct bpf_perf_event_value values[obj->rodata->num_cpu];
+
+		if (!metrics[m].selected)
+			continue;
+
+		assert(bpf_map_lookup_elem(reading_map_fd, &key, values) == 0);
+		for (cpu = 0; cpu < num_cpu; cpu++) {
+			metrics[m].val.counter += values[cpu].counter;
+			metrics[m].val.enabled += values[cpu].enabled;
+			metrics[m].val.running += values[cpu].running;
+		}
+		key++;
+	}
+}
+
+static void profile_print_readings_json(void)
+{
+	u32 m;
+
+	jsonw_start_array(json_wtr);
+	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
+		if (!metrics[m].selected)
+			continue;
+		jsonw_start_object(json_wtr);
+		jsonw_string_field(json_wtr, "metric", metrics[m].name);
+		jsonw_lluint_field(json_wtr, "run_cnt", profile_total_count);
+		jsonw_lluint_field(json_wtr, "value", metrics[m].val.counter);
+		jsonw_lluint_field(json_wtr, "enabled", metrics[m].val.enabled);
+		jsonw_lluint_field(json_wtr, "running", metrics[m].val.running);
+
+		jsonw_end_object(json_wtr);
+	}
+	jsonw_end_array(json_wtr);
+}
+
+static void profile_print_readings_plain(void)
+{
+	u32 m;
+
+	printf("\n%18lu %-20s\n", profile_total_count, "run_cnt");
+	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
+		struct bpf_perf_event_value *val = &metrics[m].val;
+		int r;
+
+		if (!metrics[m].selected)
+			continue;
+		printf("%18llu %-20s", val->counter, metrics[m].name);
+
+		r = metrics[m].ratio_metric - 1;
+		if (r >= 0 && metrics[r].selected) {
+			printf("# %8.2f %-30s",
+			       val->counter * metrics[m].ratio_mul /
+			       metrics[r].val.counter,
+			       metrics[m].ratio_desc);
+		} else {
+			printf("%-41s", "");
+		}
+
+		if (val->enabled > val->running)
+			printf("(%4.2f%%)",
+			       val->running * 100.0 / val->enabled);
+		printf("\n");
+	}
+}
+
+static void profile_print_readings(void)
+{
+	if (json_output)
+		profile_print_readings_json();
+	else
+		profile_print_readings_plain();
+}
+
+static void profile_close_perf_events(struct profiler_bpf *obj)
+{
+	int map_fd, pmu_fd;
+	u32 i;
+
+	map_fd = bpf_map__fd(obj->maps.events);
+	if (map_fd < 0) {
+		p_err("failed to get fd for events map");
+		return;
+	}
+
+	for (i = 0; i < obj->rodata->num_cpu * obj->rodata->num_metric; i++) {
+		bpf_map_lookup_elem(map_fd, &i, &pmu_fd);
+		close(pmu_fd);
+	}
+}
+
+static int profile_open_perf_events(struct profiler_bpf *obj)
+{
+	int map_fd, pmu_fd, i, key = 0;
+	unsigned int cpu, m;
+
+	map_fd = bpf_map__fd(obj->maps.events);
+	if (map_fd < 0) {
+		p_err("failed to get fd for events map");
+		return -1;
+	}
+
+	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
+		if (!metrics[m].selected)
+			continue;
+		for (cpu = 0; cpu < obj->rodata->num_cpu; cpu++) {
+			pmu_fd = syscall(__NR_perf_event_open, &metrics[m].attr,
+					 -1/*pid*/, cpu, -1/*group_fd*/, 0);
+			if (pmu_fd < 0 ||
+			    bpf_map_update_elem(map_fd, &key,
+						&pmu_fd, BPF_ANY) ||
+			    ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) {
+				p_err("failed to create event %s on cpu %d",
+				      metrics[m].name, cpu);
+				goto err;
+			}
+			key++;
+		}
+	}
+	return 0;
+err:
+	for (i = key - 1; i >= 0; i--) {
+		bpf_map_lookup_elem(map_fd, &i, &pmu_fd);
+		close(pmu_fd);
+	}
+	return -1;
+}
+
+static char *profile_target_name(int tgt_fd)
+{
+	struct bpf_prog_info_linear *info_linear;
+	struct bpf_func_info *func_info;
+	const struct btf_type *t;
+	char *name = NULL;
+	struct btf *btf;
+
+	info_linear = bpf_program__get_prog_info_linear(
+		tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO);
+	if (IS_ERR_OR_NULL(info_linear)) {
+		p_err("failed to get info_linear for prog FD %d", tgt_fd);
+		return NULL;
+	}
+
+	if (info_linear->info.btf_id == 0 ||
+	    btf__get_from_id(info_linear->info.btf_id, &btf)) {
+		p_err("prog FD %d doesn't have valid btf", tgt_fd);
+		goto out;
+	}
+
+	func_info = (struct bpf_func_info *)(info_linear->info.func_info);
+	t = btf__type_by_id(btf, func_info[0].type_id);
+	if (!t) {
+		p_err("btf %d doesn't have type %d",
+		      info_linear->info.btf_id, func_info[0].type_id);
+		goto out;
+	}
+	name = strdup(btf__name_by_offset(btf, t->name_off));
+out:
+	free(info_linear);
+	return name;
+}
+
+struct profiler_bpf *profile_obj;
+int profile_tgt_fd = -1;
+char *profile_tgt_name;
+
+static void profile_print_and_cleanup(void)
+{
+	profile_close_perf_events(profile_obj);
+	profiler_bpf__detach(profile_obj);
+	profile_read_values(profile_obj);
+	profiler_bpf__destroy(profile_obj);
+	profile_print_readings();
+
+	if (profile_tgt_fd >= 0)
+		close(profile_tgt_fd);
+	free(profile_tgt_name);
+}
+
+#define PROFILE_DEFAULT_LONG_DURATION (3600 * 24)
+
+static void int_exit(int signo)
+{
+	profile_print_and_cleanup();
+	exit(0);
+}
+
+static int do_profile(int argc, char **argv)
+{
+	int num_metric, num_cpu, err = -1;
+	struct bpf_program *prog;
+	unsigned long duration;
+	char *endptr;
+
+	/* we at least need: <duration>, "id", <id>, <metric> */
+	if (argc < 4)
+		usage();
+
+	/* parse profiling duration */
+	duration = strtoul(*argv, &endptr, 0);
+	if (*endptr)
+		duration = PROFILE_DEFAULT_LONG_DURATION;
+	else
+		NEXT_ARG();
+
+	/* parse target fd */
+	profile_tgt_fd = prog_parse_fd(&argc, &argv);
+	if (profile_tgt_fd < 0) {
+		p_err("failed to parse fd");
+		return -1;
+	}
+
+	num_metric = profile_parse_metrics(argc, argv);
+	if (num_metric <= 0)
+		goto out;
+
+	num_cpu = libbpf_num_possible_cpus();
+	if (num_cpu <= 0) {
+		p_err("failed to identify number of CPUs");
+		goto out;
+	}
+
+	profile_obj = profiler_bpf__open();
+	if (!profile_obj) {
+		p_err("failed to open and/or load BPF object");
+		goto out;
+	}
+
+	profile_obj->rodata->num_cpu = num_cpu;
+	profile_obj->rodata->num_metric = num_metric;
+
+	/* adjust map sizes */
+	bpf_map__resize(profile_obj->maps.events, num_metric * num_cpu);
+	bpf_map__resize(profile_obj->maps.fentry_readings, num_metric);
+	bpf_map__resize(profile_obj->maps.accum_readings, num_metric);
+	bpf_map__resize(profile_obj->maps.counts, 1);
+
+	/* change target name */
+	profile_tgt_name = profile_target_name(profile_tgt_fd);
+	if (!profile_tgt_name) {
+		p_err("failed to load target function name");
+		goto out;
+	}
+
+	bpf_object__for_each_program(prog, profile_obj->obj) {
+		err = bpf_program__set_attach_target(prog, profile_tgt_fd,
+						     profile_tgt_name);
+		if (err) {
+			p_err("failed to set attach target\n");
+			goto out;
+		}
+	}
+
+	set_max_rlimit();
+	err = profiler_bpf__load(profile_obj);
+	if (err) {
+		p_err("failed to load profile_obj");
+		goto out;
+	}
+
+	profile_open_perf_events(profile_obj);
+
+	err = profiler_bpf__attach(profile_obj);
+	if (err) {
+		p_err("failed to attach profile_obj");
+		goto out;
+	}
+	signal(SIGINT, int_exit);
+
+	sleep(duration);
+	profile_print_and_cleanup();
+	return 0;
+out:
+	close(profile_tgt_fd);
+	free(profile_tgt_name);
+	return err;
+}
+
+#endif /* BPFTOOL_WITHOUT_SKELETONS */
+
 static int do_help(int argc, char **argv)
 {
 	if (json_output) {
@@ -1560,6 +1983,7 @@  static int do_help(int argc, char **argv)
 		"                         [data_out FILE [data_size_out L]] \\\n"
 		"                         [ctx_in FILE [ctx_out FILE [ctx_size_out M]]] \\\n"
 		"                         [repeat N]\n"
+		"       %s %s profile [DURATION] PROG METRICs\n"
 		"       %s %s tracelog\n"
 		"       %s %s help\n"
 		"\n"
@@ -1578,11 +2002,12 @@  static int do_help(int argc, char **argv)
 		"       ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
 		"                        flow_dissector }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
+		"       METRIC := { cycles | instructions | l1d_loads | llc_misses }\n"
 		"",
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
-		bin_name, argv[-2]);
+		bin_name, argv[-2], bin_name, argv[-2]);
 
 	return 0;
 }
@@ -1599,6 +2024,7 @@  static const struct cmd cmds[] = {
 	{ "detach",	do_detach },
 	{ "tracelog",	do_tracelog },
 	{ "run",	do_run },
+	{ "profile",	do_profile },
 	{ 0 }
 };
 
diff --git a/tools/bpf/bpftool/skeleton/profiler.bpf.c b/tools/bpf/bpftool/skeleton/profiler.bpf.c
new file mode 100644
index 000000000000..abd3a7aacc1f
--- /dev/null
+++ b/tools/bpf/bpftool/skeleton/profiler.bpf.c
@@ -0,0 +1,171 @@ 
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include "profiler.h"
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define ___bpf_concat(a, b) a ## b
+#define ___bpf_apply(fn, n) ___bpf_concat(fn, n)
+#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N
+#define ___bpf_narg(...) \
+	___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#define ___bpf_empty(...) \
+	___bpf_nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0)
+
+#define ___bpf_ctx_cast0() ctx
+#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0]
+#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1]
+#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2]
+#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3]
+#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4]
+#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5]
+#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6]
+#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7]
+#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8]
+#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9]
+#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10]
+#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11]
+#define ___bpf_ctx_cast(args...) \
+	___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args)
+
+/*
+ * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and
+ * similar kinds of BPF programs, that accept input arguments as a single
+ * pointer to untyped u64 array, where each u64 can actually be a typed
+ * pointer or integer of different size. Instead of requring user to write
+ * manual casts and work with array elements by index, BPF_PROG macro
+ * allows user to declare a list of named and typed input arguments in the
+ * same syntax as for normal C function. All the casting is hidden and
+ * performed transparently, while user code can just assume working with
+ * function arguments of specified type and name.
+ *
+ * Original raw context argument is preserved as well as 'ctx' argument.
+ * This is useful when using BPF helpers that expect original context
+ * as one of the parameters (e.g., for bpf_perf_event_output()).
+ */
+#define BPF_PROG(name, args...)						    \
+name(unsigned long long *ctx);						    \
+static __always_inline typeof(name(0))					    \
+____##name(unsigned long long *ctx, ##args);				    \
+typeof(name(0)) name(unsigned long long *ctx)				    \
+{									    \
+	_Pragma("GCC diagnostic push")					    \
+	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		    \
+	return ____##name(___bpf_ctx_cast(args));			    \
+	_Pragma("GCC diagnostic pop")					    \
+}									    \
+static __always_inline typeof(name(0))					    \
+____##name(unsigned long long *ctx, ##args)
+
+/* map of perf event fds, num_cpu * num_metric entries */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(u32));
+	__uint(value_size, sizeof(int));
+} events SEC(".maps");
+
+/* readings at fentry */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(u32));
+	__uint(value_size, sizeof(struct bpf_perf_event_value));
+} fentry_readings SEC(".maps");
+
+/* accumulated readings */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(u32));
+	__uint(value_size, sizeof(struct bpf_perf_event_value));
+} accum_readings SEC(".maps");
+
+/* sample counts, one per cpu */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(u32));
+	__uint(value_size, sizeof(u64));
+} counts SEC(".maps");
+
+const volatile __u32 num_cpu = 1;
+const volatile __u32 num_metric = 1;
+#define MAX_NUM_MATRICS 4
+
+SEC("fentry/XXX")
+int BPF_PROG(fentry_XXX)
+{
+	struct bpf_perf_event_value *ptrs[MAX_NUM_MATRICS];
+	u32 key = bpf_get_smp_processor_id();
+	u32 i;
+
+	/* look up before reading, to reduce error */
+	for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) {
+		u32 flag = i;
+
+		ptrs[i] = bpf_map_lookup_elem(&fentry_readings, &flag);
+		if (!ptrs[i])
+			return 0;
+	}
+
+	for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) {
+		struct bpf_perf_event_value reading;
+		int err;
+
+		err = bpf_perf_event_read_value(&events, key, &reading,
+						sizeof(reading));
+		if (err)
+			return 0;
+		*(ptrs[i]) = reading;
+		key += num_cpu;
+	}
+
+	return 0;
+}
+
+static inline void
+fexit_update_maps(u32 id, struct bpf_perf_event_value *after)
+{
+	struct bpf_perf_event_value *before, diff, *accum;
+
+	before = bpf_map_lookup_elem(&fentry_readings, &id);
+	/* only account samples with a valid fentry_reading */
+	if (before && before->counter) {
+		struct bpf_perf_event_value *accum;
+
+		diff.counter = after->counter - before->counter;
+		diff.enabled = after->enabled - before->enabled;
+		diff.running = after->running - before->running;
+
+		accum = bpf_map_lookup_elem(&accum_readings, &id);
+		if (accum) {
+			accum->counter += diff.counter;
+			accum->enabled += diff.enabled;
+			accum->running += diff.running;
+		}
+	}
+}
+
+SEC("fexit/XXX")
+int BPF_PROG(fexit_XXX)
+{
+	struct bpf_perf_event_value readings[MAX_NUM_MATRICS];
+	u32 cpu = bpf_get_smp_processor_id();
+	u32 i, one = 1, zero = 0;
+	int err;
+	u64 *count;
+
+	/* read all events before updating the maps, to reduce error */
+	for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) {
+		err = bpf_perf_event_read_value(&events, cpu + i * num_cpu,
+						readings + i, sizeof(*readings));
+		if (err)
+			return 0;
+	}
+	count = bpf_map_lookup_elem(&counts, &zero);
+	if (count) {
+		*count += 1;
+		for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++)
+			fexit_update_maps(i, &readings[i]);
+	}
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/tools/bpf/bpftool/skeleton/profiler.h b/tools/bpf/bpftool/skeleton/profiler.h
new file mode 100644
index 000000000000..ae15cb0c4d43
--- /dev/null
+++ b/tools/bpf/bpftool/skeleton/profiler.h
@@ -0,0 +1,47 @@ 
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __PROFILER_H
+#define __PROFILER_H
+
+/* useful typedefs from vimlinux.h */
+
+typedef signed char __s8;
+typedef unsigned char __u8;
+typedef short int __s16;
+typedef short unsigned int __u16;
+typedef int __s32;
+typedef unsigned int __u32;
+typedef long long int __s64;
+typedef long long unsigned int __u64;
+
+typedef __s8 s8;
+typedef __u8 u8;
+typedef __s16 s16;
+typedef __u16 u16;
+typedef __s32 s32;
+typedef __u32 u32;
+typedef __s64 s64;
+typedef __u64 u64;
+
+enum {
+	false = 0,
+	true = 1,
+};
+
+#ifdef __CHECKER__
+#define __bitwise__ __attribute__((bitwise))
+#else
+#define __bitwise__
+#endif
+#define __bitwise __bitwise__
+
+typedef __u16 __bitwise __le16;
+typedef __u16 __bitwise __be16;
+typedef __u32 __bitwise __le32;
+typedef __u32 __bitwise __be32;
+typedef __u64 __bitwise __le64;
+typedef __u64 __bitwise __be64;
+
+typedef __u16 __bitwise __sum16;
+typedef __u32 __bitwise __wsum;
+
+#endif /* __PROFILER_H */
diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
index ded7a950dc40..59f31f01cb93 100644
--- a/tools/scripts/Makefile.include
+++ b/tools/scripts/Makefile.include
@@ -106,6 +106,7 @@  ifneq ($(silent),1)
   ifneq ($(V),1)
 	QUIET_CC       = @echo '  CC       '$@;
 	QUIET_CC_FPIC  = @echo '  CC FPIC  '$@;
+	QUIET_CLANG    = @echo '  CLANG    '$@;
 	QUIET_AR       = @echo '  AR       '$@;
 	QUIET_LINK     = @echo '  LINK     '$@;
 	QUIET_MKDIR    = @echo '  MKDIR    '$@;