[bpf-next,02/10] libbpf: implement BPF CO-RE offset relocation algorithm

Message ID: 20190724192742.1419254-3-andriin@fb.com
State: Changes Requested
Delegated to: BPF Maintainers
Series: CO-RE offset relocations

Commit Message

Andrii Nakryiko July 24, 2019, 7:27 p.m. UTC
This patch implements the core logic for BPF CO-RE offsets relocations.
All the details are described in code comments.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 tools/lib/bpf/libbpf.c | 866 ++++++++++++++++++++++++++++++++++++++++-
 tools/lib/bpf/libbpf.h |   1 +
 2 files changed, 861 insertions(+), 6 deletions(-)

Comments

Song Liu July 25, 2019, 7:32 p.m. UTC | #1
> On Jul 24, 2019, at 12:27 PM, Andrii Nakryiko <andriin@fb.com> wrote:
> 
> This patch implements the core logic for BPF CO-RE offsets relocations.
> All the details are described in code comments.

Some description in the change log is still useful. Please at least 
copy-paste key comments here. 

And, this is looooong. I think it is totally possible to split it into
multiple smaller patches. 

I haven't finished all of it. Please see my comments below on the
parts I have covered.

Thanks,
Song

> 
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> ---
> tools/lib/bpf/libbpf.c | 866 ++++++++++++++++++++++++++++++++++++++++-
> tools/lib/bpf/libbpf.h |   1 +
> 2 files changed, 861 insertions(+), 6 deletions(-)
> 
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 8741c39adb1c..86d87bf10d46 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -38,6 +38,7 @@
> #include <sys/stat.h>
> #include <sys/types.h>
> #include <sys/vfs.h>
> +#include <sys/utsname.h>
> #include <tools/libc_compat.h>
> #include <libelf.h>
> #include <gelf.h>
> @@ -47,6 +48,7 @@
> #include "btf.h"
> #include "str_error.h"
> #include "libbpf_internal.h"
> +#include "hashmap.h"
> 
> #ifndef EM_BPF
> #define EM_BPF 247
> @@ -1013,16 +1015,22 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict)
> }
> 
> static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
> -						     __u32 id)
> +						     __u32 id,
> +						     __u32 *res_id)
> {
> 	const struct btf_type *t = btf__type_by_id(btf, id);

Maybe have a local "__u32 rid;" 

> 
> +	if (res_id)
> +		*res_id = id;
> +

and do "rid = id;" here

> 	while (true) {
> 		switch (BTF_INFO_KIND(t->info)) {
> 		case BTF_KIND_VOLATILE:
> 		case BTF_KIND_CONST:
> 		case BTF_KIND_RESTRICT:
> 		case BTF_KIND_TYPEDEF:
> +			if (res_id)
> +				*res_id = t->type;
and here

> 			t = btf__type_by_id(btf, t->type);
> 			break;
> 		default:
and "*res_id = rid;" right before return?

> @@ -1041,7 +1049,7 @@ static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
> static bool get_map_field_int(const char *map_name, const struct btf *btf,
> 			      const struct btf_type *def,
> 			      const struct btf_member *m, __u32 *res) {
> -	const struct btf_type *t = skip_mods_and_typedefs(btf, m->type);
> +	const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL);
> 	const char *name = btf__name_by_offset(btf, m->name_off);
> 	const struct btf_array *arr_info;
> 	const struct btf_type *arr_t;
> @@ -1107,7 +1115,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
> 		return -EOPNOTSUPP;
> 	}
> 
> -	def = skip_mods_and_typedefs(obj->btf, var->type);
> +	def = skip_mods_and_typedefs(obj->btf, var->type, NULL);
> 	if (BTF_INFO_KIND(def->info) != BTF_KIND_STRUCT) {
> 		pr_warning("map '%s': unexpected def kind %u.\n",
> 			   map_name, BTF_INFO_KIND(var->info));
> @@ -2289,6 +2297,845 @@ bpf_program_reloc_btf_ext(struct bpf_program *prog, struct bpf_object *obj,
> 	return 0;
> }
> 
> +#define BPF_CORE_SPEC_MAX_LEN 64
> +
> +/* represents BPF CO-RE field or array element accessor */
> +struct bpf_core_accessor {
> +	__u32 type_id;		/* struct/union type or array element type */
> +	__u32 idx;		/* field index or array index */
> +	const char *name;	/* field name or NULL for array accessor */
> +};
> +
> +struct bpf_core_spec {
> +	const struct btf *btf;
> +	/* high-level spec: named fields and array indicies only */

typo: indices

> +	struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN];
> +	/* high-level spec length */
> +	int len;
> +	/* raw, low-level spec: 1-to-1 with accessor spec string */
> +	int raw_spec[BPF_CORE_SPEC_MAX_LEN];
> +	/* raw spec length */
> +	int raw_len;
> +	/* field byte offset represented by spec */
> +	__u32 offset;
> +};
> +
> +static bool str_is_empty(const char *s)
> +{
> +	return !s || !s[0];
> +}
> +
> +static int btf_kind(const struct btf_type *t)
> +{
> +	return BTF_INFO_KIND(t->info);
> +}
> +
> +static bool btf_is_composite(const struct btf_type *t)
> +{
> +	int kind = btf_kind(t);
> +
> +	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
> +}
> +
> +static bool btf_is_array(const struct btf_type *t)
> +{
> +	return btf_kind(t) == BTF_KIND_ARRAY;
> +}
> +
> +/* 
> + * Turn bpf_offset_reloc into a low- and high-level spec representation,
> + * validating correctness along the way, as well as calculating resulting
> + * field offset (in bytes), specified by accessor string. Low-level spec
> + * captures every single level of nestedness, including traversing anonymous
> + * struct/union members. High-level one only captures semantically meaningful
> + * "turning points": named fields and array indicies.
> + * E.g., for this case:
> + *
> + *   struct sample {
> + *       int __unimportant;
> + *       struct {
> + *           int __1;
> + *           int __2;
> + *           int a[7];
> + *       };
> + *   };
> + *
> + *   struct sample *s = ...;
> + *
> + *   int x = &s->a[3]; // access string = '0:1:2:3'
> + *
> + * Low-level spec has 1:1 mapping with each element of access string (it's
> + * just a parsed access string representation): [0, 1, 2, 3].
> + *
> + * High-level spec will capture only 3 points:
> + *   - intial zero-index access by pointer (&s->... is the same as &s[0]...);
> + *   - field 'a' access (corresponds to '2' in low-level spec);
> + *   - array element #3 access (corresponds to '3' in low-level spec).
> + *
> + */

IIUC, high-level points are a subset of low-level points. How about we
introduce "anonymous" high-level points, so that high-level points and
low-level points are in a 1:1 mapping?

> +static int bpf_core_spec_parse(const struct btf *btf,
> +			       __u32 type_id,
> +			       const char *spec_str,
> +			       struct bpf_core_spec *spec)
> +{
> +	int access_idx, parsed_len, i;
> +	const struct btf_type *t;
> +	__u32 id = type_id;
> +	const char *name;
> +	__s64 sz;
> +
> +	if (str_is_empty(spec_str) || *spec_str == ':')
> +		return -EINVAL;
> +
> +	memset(spec, 0, sizeof(*spec));
> +	spec->btf = btf;
> +
> +	/* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */
> +	while (*spec_str) {
> +		if (*spec_str == ':')
> +			++spec_str;
> +		if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1)
> +			return -EINVAL;
> +		if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
> +			return -E2BIG;
> +		spec_str += parsed_len;
> +		spec->raw_spec[spec->raw_len++] = access_idx;
> +	}
> +
> +	if (spec->raw_len == 0)
> +		return -EINVAL;
> +
> +	for (i = 0; i < spec->raw_len; i++) {
> +		t = skip_mods_and_typedefs(btf, id, &id);
> +		if (!t)
> +			return -EINVAL;
> +
> +		access_idx = spec->raw_spec[i];
> +
> +		if (i == 0) {
> +			/* first spec value is always reloc type array index */
> +			spec->spec[spec->len].type_id = id;
> +			spec->spec[spec->len].idx = access_idx;
> +			spec->len++;
> +
> +			sz = btf__resolve_size(btf, id);
> +			if (sz < 0)
> +				return sz;
> +			spec->offset += access_idx * sz;
          spec->offset = access_idx * sz;  should be enough

> +			continue;
> +		}

Maybe pull i == 0 case out of the for loop? 

> +
> +		if (btf_is_composite(t)) {
> +			const struct btf_member *m = (void *)(t + 1);
> +			__u32 offset;
> +
> +			if (access_idx >= BTF_INFO_VLEN(t->info))
> +				return -EINVAL;
> +
> +			m = &m[access_idx];
> +
> +			if (BTF_INFO_KFLAG(t->info)) {
> +				if (BTF_MEMBER_BITFIELD_SIZE(m->offset))
> +					return -EINVAL;
> +				offset = BTF_MEMBER_BIT_OFFSET(m->offset);
> +			} else {
> +				offset = m->offset;
> +			}
> +			if (m->offset % 8)
> +				return -EINVAL;
> +			spec->offset += offset / 8;
> +
> +			if (m->name_off) {
> +				name = btf__name_by_offset(btf, m->name_off);
> +				if (str_is_empty(name))
> +					return -EINVAL;
> +
> +				spec->spec[spec->len].type_id = id;
> +				spec->spec[spec->len].idx = access_idx;
> +				spec->spec[spec->len].name = name;
> +				spec->len++;
> +			}
> +
> +			id = m->type;
> +		} else if (btf_is_array(t)) {
> +			const struct btf_array *a = (void *)(t + 1);
> +
> +			t = skip_mods_and_typedefs(btf, a->type, &id);
> +			if (!t || access_idx >= a->nelems)
> +				return -EINVAL;
> +
> +			spec->spec[spec->len].type_id = id;
> +			spec->spec[spec->len].idx = access_idx;
> +			spec->len++;
> +
> +			sz = btf__resolve_size(btf, id);
> +			if (sz < 0)
> +				return sz;
> +			spec->offset += access_idx * sz;
> +		} else {
> +			pr_warning("relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %d\n",
> +				   type_id, spec_str, i, id, btf_kind(t));
> +			return -EINVAL;
> +		}
> +	}
> +
> +	if (spec->len == 0)
> +		return -EINVAL;

Can this ever happen? 

> +
> +	return 0;
> +}
> +
> +/* Given 'some_struct_name___with_flavor' return the length of a name prefix
> + * before last triple underscore. Struct name part after last triple
> + * underscore is ignored by BPF CO-RE relocation during relocation matching.
> + */
> +static size_t bpf_core_essential_name_len(const char *name)
> +{
> +	size_t n = strlen(name);
> +	int i = n - 3;
> +
> +	while (i > 0) {
> +		if (name[i] == '_' && name[i + 1] == '_' && name[i + 2] == '_')
> +			return i;
> +		i--;
> +	}
> +	return n;
> +}
> +
> +/* dynamically sized list of type IDs */
> +struct ids_vec {
> +	__u32 *data;
> +	int len;
> +};
> +
> +static void bpf_core_free_cands(struct ids_vec *cand_ids)
> +{
> +	free(cand_ids->data);
> +	free(cand_ids);
> +}
> +
> +static struct ids_vec *bpf_core_find_cands(const struct btf *local_btf,
> +					   __u32 local_type_id,
> +					   const struct btf *targ_btf)
> +{
> +	size_t local_essent_len, targ_essent_len;
> +	const char *local_name, *targ_name;
> +	const struct btf_type *t;
> +	struct ids_vec *cand_ids;
> +	__u32 *new_ids;
> +	int i, err, n;
> +
> +	t = btf__type_by_id(local_btf, local_type_id);
> +	if (!t)
> +		return ERR_PTR(-EINVAL);
> +
> +	local_name = btf__name_by_offset(local_btf, t->name_off);
> +	if (str_is_empty(local_name))
> +		return ERR_PTR(-EINVAL);
> +	local_essent_len = bpf_core_essential_name_len(local_name);
> +
> +	cand_ids = calloc(1, sizeof(*cand_ids));
> +	if (!cand_ids)
> +		return ERR_PTR(-ENOMEM);
> +
> +	n = btf__get_nr_types(targ_btf);
> +	for (i = 1; i <= n; i++) {
> +		t = btf__type_by_id(targ_btf, i);
> +		targ_name = btf__name_by_offset(targ_btf, t->name_off);
> +		if (str_is_empty(targ_name))
> +			continue;
> +
> +		targ_essent_len = bpf_core_essential_name_len(targ_name);
> +		if (targ_essent_len != local_essent_len)
> +			continue;
> +
> +		if (strncmp(local_name, targ_name, local_essent_len) == 0) {
> +			pr_debug("[%d] (%s): found candidate [%d] (%s)\n",
> +				 local_type_id, local_name, i, targ_name);
> +			new_ids = realloc(cand_ids->data, cand_ids->len + 1);
> +			if (!new_ids) {
> +				err = -ENOMEM;
> +				goto err_out;
> +			}
> +			cand_ids->data = new_ids;
> +			cand_ids->data[cand_ids->len++] = i;
> +		}
> +	}
> +	return cand_ids;
> +err_out:
> +	bpf_core_free_cands(cand_ids);
> +	return ERR_PTR(err);
> +}
> +
> +/* Check two types for compatibility, skipping const/volatile/restrict and
> + * typedefs, to ensure we are relocating offset to the compatible entities:
> + *   - any two STRUCTs/UNIONs are compatible and can be mixed;
> + *   - any two FWDs are compatible;
> + *   - any two PTRs are always compatible;
> + *   - for ENUMs, check sizes, names are ignored;
> + *   - for INT, size and bitness should match, signedness is ignored;
> + *   - for ARRAY, dimensionality is ignored, element types are checked for
> + *     compatibility recursively;
> + *   - everything else shouldn't be ever a target of relocation.
> + * These rules are not set in stone and probably will be adjusted as we get
> + * more experience with using BPF CO-RE relocations.
> + */
> +static int bpf_core_fields_are_compat(const struct btf *local_btf,
> +				      __u32 local_id,
> +				      const struct btf *targ_btf,
> +				      __u32 targ_id)
> +{
> +	const struct btf_type *local_type, *targ_type;
> +	__u16 kind;
> +
> +recur:
> +	local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id);
> +	targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id);
> +	if (!local_type || !targ_type)
> +		return -EINVAL;
> +
> +	if (btf_is_composite(local_type) && btf_is_composite(targ_type))
> +		return 1;
> +	if (BTF_INFO_KIND(local_type->info) != BTF_INFO_KIND(targ_type->info))
> +		return 0;
> +
> +	kind = BTF_INFO_KIND(local_type->info);
> +	switch (kind) {
> +	case BTF_KIND_FWD:
> +	case BTF_KIND_PTR:
> +		return 1;
> +	case BTF_KIND_ENUM:
> +		return local_type->size == targ_type->size;
> +	case BTF_KIND_INT: {
> +		__u32 loc_int = *(__u32 *)(local_type + 1);
> +		__u32 targ_int = *(__u32 *)(targ_type + 1);
> +
> +		return BTF_INT_OFFSET(loc_int) == 0 &&
> +		       BTF_INT_OFFSET(targ_int) == 0 &&
> +		       local_type->size == targ_type->size &&
> +		       BTF_INT_BITS(loc_int) == BTF_INT_BITS(targ_int);
> +	}
> +	case BTF_KIND_ARRAY: {
> +		const struct btf_array *loc_a, *targ_a;
> +
> +		loc_a = (void *)(local_type + 1);
> +		targ_a = (void *)(targ_type + 1);
> +		local_id = loc_a->type;
> +		targ_id = targ_a->type;
> +		goto recur;
> +	}
> +	default:
> +		pr_warning("unexpected kind %d relocated, local [%d], target [%d]\n",
> +			   kind, local_id, targ_id);
> +		return 0;
> +	}
> +}
> +
> +/* 
> + * Given single high-level accessor (either named field or array index) in
> + * local type, find corresponding high-level accessor for a target type. Along
> + * the way, maintain low-level spec for target as well. Also keep updating
> + * target offset.
> + */

Please describe the recursive algorithm here. I am kinda lost. 
Also, please document the meaning of zero, positive, negative return values.

> +static int bpf_core_match_member(const struct btf *local_btf,
> +				 const struct bpf_core_accessor *local_acc,
> +				 const struct btf *targ_btf,
> +				 __u32 targ_id,
> +				 struct bpf_core_spec *spec,
> +				 __u32 *next_targ_id)
> +{
> +	const struct btf_type *local_type, *targ_type;
> +	const struct btf_member *local_member, *m;
> +	const char *local_name, *targ_name;
> +	__u32 local_id;
> +	int i, n, found;
> +
> +	targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id);
> +	if (!targ_type)
> +		return -EINVAL;
> +	if (!btf_is_composite(targ_type))
> +		return 0;
> +
> +	local_id = local_acc->type_id;
> +	local_type = btf__type_by_id(local_btf, local_id);
> +	local_member = (void *)(local_type + 1);
> +	local_member += local_acc->idx;
> +	local_name = btf__name_by_offset(local_btf, local_member->name_off);
> +
> +	n = BTF_INFO_VLEN(targ_type->info);
> +	m = (void *)(targ_type + 1);
> +	for (i = 0; i < n; i++, m++) {
> +		__u32 offset;
> +
> +		/* bitfield relocations not supported */
> +		if (BTF_INFO_KFLAG(targ_type->info)) {
> +			if (BTF_MEMBER_BITFIELD_SIZE(m->offset))
> +				continue;
> +			offset = BTF_MEMBER_BIT_OFFSET(m->offset);
> +		} else {
> +			offset = m->offset;
> +		}
> +		if (offset % 8)
> +			continue;
> +
> +		/* too deep struct/union/array nesting */
> +		if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
> +			return -E2BIG;
> +
> +		/* speculate this member will be the good one */
> +		spec->offset += offset / 8;
> +		spec->raw_spec[spec->raw_len++] = i;
> +
> +		targ_name = btf__name_by_offset(targ_btf, m->name_off);
> +		if (str_is_empty(targ_name)) {
> +			/* embedded struct/union, we need to go deeper */
> +			found = bpf_core_match_member(local_btf, local_acc,
> +						      targ_btf, m->type,
> +						      spec, next_targ_id);
> +			if (found) /* either found or error */
> +				return found;
> +		} else if (strcmp(local_name, targ_name) == 0) {
> +			/* matching named field */
> +			struct bpf_core_accessor *targ_acc;
> +
> +			targ_acc = &spec->spec[spec->len++];
> +			targ_acc->type_id = targ_id;
> +			targ_acc->idx = i;
> +			targ_acc->name = targ_name;
> +
> +			*next_targ_id = m->type;
> +			found = bpf_core_fields_are_compat(local_btf,
> +							   local_member->type,
> +							   targ_btf, m->type);
> +			if (!found)
> +				spec->len--; /* pop accessor */
> +			return found;
> +		}
> +		/* member turned out to be not we looked for */
> +		spec->offset -= offset / 8;
> +		spec->raw_len--;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Try to match local spec to a target type and, if successful, produce full
> + * target spec (high-level, low-level + offset).
> + */
> +static int bpf_core_spec_match(struct bpf_core_spec *local_spec,
> +			       const struct btf *targ_btf, __u32 targ_id,
> +			       struct bpf_core_spec *targ_spec)
> +{
> +	const struct btf_type *targ_type;
> +	const struct bpf_core_accessor *local_acc;
> +	struct bpf_core_accessor *targ_acc;
> +	int i, sz, matched;
> +
> +	memset(targ_spec, 0, sizeof(*targ_spec));
> +	targ_spec->btf = targ_btf;
> +
> +	local_acc = &local_spec->spec[0];
> +	targ_acc = &targ_spec->spec[0];
> +
> +	for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) {
> +		targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id,
> +						   &targ_id);
> +		if (!targ_type)
> +			return -EINVAL;
> +
> +		if (local_acc->name) {
> +			if (!btf_is_composite(targ_type))
> +				return 0;
> +
> +			matched = bpf_core_match_member(local_spec->btf,
> +							local_acc,
> +							targ_btf, targ_id,
> +							targ_spec, &targ_id);
> +			if (matched <= 0)
> +				return matched;
> +		} else {
> +			/* for i=0, targ_id is already treated as array element
> +			 * type (because it's the original struct), for others
> +			 * we should find array element type first
> +			 */
> +			if (i > 0) {

i == 0 case would go into "if (local_acc->name)" branch, no? 

> +				const struct btf_array *a;
> +
> +				if (!btf_is_array(targ_type))
> +					return 0;
> +
> +				a = (void *)(targ_type + 1);
> +				if (local_acc->idx >= a->nelems)
> +					return 0;
> +				if (!skip_mods_and_typedefs(targ_btf, a->type,
> +							    &targ_id))
> +					return -EINVAL;
> +			}
> +
> +			/* too deep struct/union/array nesting */
> +			if (targ_spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
> +				return -E2BIG;
> +
> +			targ_acc->type_id = targ_id;
> +			targ_acc->idx = local_acc->idx;
> +			targ_acc->name = NULL;
> +			targ_spec->len++;
> +			targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx;
> +			targ_spec->raw_len++;
> +
> +			sz = btf__resolve_size(targ_btf, targ_id);
> +			if (sz < 0)
> +				return sz;
> +			targ_spec->offset += local_acc->idx * sz;
> +		}
> +	}
> +
> +	return 1;
> +}
> +
> +/*
> + * Patch relocatable BPF instruction.
> + * Expected insn->imm value is provided for validation, as well as the new
> + * relocated value.
> + *
> + * Currently three kinds of BPF instructions are supported:
> + * 1. rX = <imm> (assignment with immediate operand);
> + * 2. rX += <imm> (arithmetic operations with immediate operand);
> + * 3. *(rX) = <imm> (indirect memory assignment with immediate operand).
> + *
> + * If actual insn->imm value is wrong, bail out.
> + */
> +static int bpf_core_reloc_insn(struct bpf_program *prog, int insn_off,
> +			       __u32 orig_off, __u32 new_off)
> +{
> +	struct bpf_insn *insn;
> +	int insn_idx;
> +	__u8 class;
> +
> +	if (insn_off % sizeof(struct bpf_insn))
> +		return -EINVAL;
> +	insn_idx = insn_off / sizeof(struct bpf_insn);
> +
> +	insn = &prog->insns[insn_idx];
> +	class = BPF_CLASS(insn->code);
> +
> +	if (class == BPF_ALU || class == BPF_ALU64) {
> +		if (BPF_SRC(insn->code) != BPF_K)
> +			return -EINVAL;
> +		if (insn->imm != orig_off)
> +			return -EINVAL;
> +		insn->imm = new_off;
> +		pr_debug("prog '%s': patched insn #%d (ALU/ALU64) imm %d -> %d\n",
> +			 bpf_program__title(prog, false),
> +			 insn_idx, orig_off, new_off);
> +	} else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
> +		if (insn->imm != orig_off)
> +			return -EINVAL;
> +		insn->imm = new_off;
> +		pr_debug("prog '%s': patched insn #%d (ST | MEM) imm %d -> %d\n",
> +			 bpf_program__title(prog, false),
> +			 insn_idx, orig_off, new_off);
> +	} else {
> +		pr_warning("prog '%s': trying to relocate unrecognized insn #%d, code:%x, src:%x, dst:%x, off:%x, imm:%x\n",
> +			   bpf_program__title(prog, false),
> +			   insn_idx, insn->code, insn->src_reg, insn->dst_reg,
> +			   insn->off, insn->imm);
> +		return -EINVAL;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * Probe few well-known locations for vmlinux kernel image and try to load BTF
> + * data out of it to use for target BTF.
> + */
> +static struct btf *bpf_core_find_kernel_btf(void)
> +{
> +	const char *locations[] = {
> +		"/lib/modules/%1$s/vmlinux-%1$s",
> +		"/usr/lib/modules/%1$s/kernel/vmlinux",
> +	};
> +	char path[PATH_MAX + 1];
> +	struct utsname buf;
> +	struct btf *btf;
> +	int i, err;
> +
> +	err = uname(&buf);
> +	if (err) {
> +		pr_warning("failed to uname(): %d\n", err);
> +		return ERR_PTR(err);
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(locations); i++) {
> +		snprintf(path, PATH_MAX, locations[i], buf.release);
> +		pr_debug("attempting to load kernel BTF from '%s'\n", path);
> +
> +		if (access(path, R_OK))
> +			continue;
> +
> +		btf = btf__parse_elf(path, NULL);
> +		if (IS_ERR(btf))
> +			continue;
> +
> +		pr_debug("successfully loaded kernel BTF from '%s'\n", path);
> +		return btf;
> +	}
> +
> +	pr_warning("failed to find valid kernel BTF\n");
> +	return ERR_PTR(-ESRCH);
> +}
> +
> +static size_t bpf_core_hash_fn(const void *key, void *ctx)
> +{
> +	return (size_t)key;
> +}
> +
> +static bool bpf_core_equal_fn(const void *k1, const void *k2, void *ctx)
> +{
> +	return k1 == k2;
> +}
> +
> +static void *u32_to_ptr(__u32 x)
> +{
> +	return (void *)(uintptr_t)x;
> +}
> +
> +/* 
> + * CO-RE relocate single instruction.
> + *
> + * The outline and important points of the algorithm:
> + * 1. For given local type, find corresponding candidate target types.
> + *    Candidate type is a type with the same "essential" name, ignoring
> + *    everything after last triple underscore (___). E.g., `sample`,
> + *    `sample___flavor_one`, `sample___flavor_another_one`, are all candidates
> + *    for each other. Names with triple underscore are referred to as
> + *    "flavors" and are useful, among other things, to allow to
> + *    specify/support incompatible variations of the same kernel struct, which
> + *    might differ between different kernel versions and/or build
> + *    configurations.
> + * 2. For each candidate type, try to match local specification to this
> + *    candidate target type. Matching involves finding corresponding
> + *    high-level spec accessors, meaning that all named fields should match,
> + *    as well as all array accesses should be within the actual bounds. Also,
> + *    types should be compatible (see bpf_core_fields_are_compat for details).
> + * 3. It is supported and expected that there might be multiple flavors
> + *    matching the spec. As long as all the specs resolve to the same set of
> + *    offsets across all candidates, there is not error. If there is any
> + *    ambiguity, CO-RE relocation will fail. This is necessary to accomodate
> + *    imprefection of BTF deduplication, which can cause slight duplication of
> + *    the same BTF type, if some directly or indirectly referenced (by
> + *    pointer) type gets resolved to different actual types in different
> + *    object files. If such situation occurs, deduplicated BTF will end up
> + *    with two (or more) structurally identical types, which differ only in
> + *    types they refer to through pointer. This should be OK in most cases and
> + *    is not an error.
> + * 4. Candidate types search is performed by linearly scanning through all
> + *    types in target BTF. It is anticipated that this is overall more
> + *    efficient memory-wise and not significantly worse (if not better)
> + *    CPU-wise compared to prebuilding a map from all local type names to
> + *    a list of candidate type names. It's also sped up by caching resolved
> + *    list of matching candidates per each local "root" type ID, that has at
> + *    least one bpf_offset_reloc associated with it. This list is shared
> + *    between multiple relocations for the same type ID and is updated as some
> + *    of the candidates are pruned due to structural incompatibility.
> + */
> +static int bpf_core_reloc_offset(struct bpf_program *prog,
> +				 const struct bpf_offset_reloc *relo,
> +				 int relo_idx,
> +				 const struct btf *local_btf,
> +				 const struct btf *targ_btf,
> +				 struct hashmap *cand_cache)
> +{
> +	const char *prog_name = bpf_program__title(prog, false);
> +	struct bpf_core_spec local_spec, cand_spec, targ_spec;
> +	const void *type_key = u32_to_ptr(relo->type_id);
> +	const struct btf_type *local_type, *cand_type;
> +	const char *local_name, *cand_name;
> +	struct ids_vec *cand_ids;
> +	__u32 local_id, cand_id;
> +	const char *spec_str;
> +	int i, j, err;
> +
> +	local_id = relo->type_id;
> +	local_type = btf__type_by_id(local_btf, local_id);
> +	if (!local_type)
> +		return -EINVAL;
> +
> +	local_name = btf__name_by_offset(local_btf, local_type->name_off);
> +	if (str_is_empty(local_name))
> +		return -EINVAL;
> +
> +	spec_str = btf__name_by_offset(local_btf, relo->access_str_off);
> +	if (str_is_empty(spec_str))
> +		return -EINVAL;
> +
> +	pr_debug("prog '%s': relo #%d: insn_off=%d, [%d] (%s) + %s\n",
> +		 prog_name, relo_idx, relo->insn_off,
> +		 local_id, local_name, spec_str);
> +
> +	err = bpf_core_spec_parse(local_btf, local_id, spec_str, &local_spec);
> +	if (err) {
> +		pr_warning("prog '%s': relo #%d: parsing [%d] (%s) + %s failed: %d\n",
> +			   prog_name, relo_idx, local_id, local_name, spec_str,
> +			   err);
> +		return -EINVAL;
> +	}
> +	pr_debug("prog '%s': relo #%d: [%d] (%s) + %s is off %u, len %d, raw_len %d\n",
> +		 prog_name, relo_idx, local_id, local_name, spec_str,
> +		 local_spec.offset, local_spec.len, local_spec.raw_len);
> +
> +	if (!hashmap__find(cand_cache, type_key, (void **)&cand_ids)) {
> +		cand_ids = bpf_core_find_cands(local_btf, local_id, targ_btf);
> +		if (IS_ERR(cand_ids)) {
> +			pr_warning("prog '%s': relo #%d: target candidate search failed for [%d] (%s) + %s: %ld\n",
> +				   prog_name, relo_idx, local_id, local_name,
> +				   spec_str, PTR_ERR(cand_ids));
> +			return PTR_ERR(cand_ids);
> +		}
> +		err = hashmap__set(cand_cache, type_key, cand_ids, NULL, NULL);
> +		if (err) {
> +			bpf_core_free_cands(cand_ids);
> +			return err;
> +		}
> +	}
> +
> +	for (i = 0, j = 0; i < cand_ids->len; i++) {
> +		cand_id = cand_ids->data[j];
> +		cand_type = btf__type_by_id(targ_btf, cand_id);
> +		cand_name = btf__name_by_offset(targ_btf, cand_type->name_off);
> +
> +		err = bpf_core_spec_match(&local_spec, targ_btf,
> +					  cand_id, &cand_spec);
> +		if (err < 0) {
> +			pr_warning("prog '%s': relo #%d: failed to match spec [%d] (%s) + %s to candidate #%d [%d] (%s): %d\n",
> +				   prog_name, relo_idx, local_id, local_name,
> +				   spec_str, i, cand_id, cand_name, err);
> +			return err;
> +		}
> +		if (err == 0) {
> +			pr_debug("prog '%s': relo #%d: candidate #%d [%d] (%s) doesn't match spec\n",
> +				 prog_name, relo_idx, i, cand_id, cand_name);
> +			continue;
> +		}
> +
> +		pr_debug("prog '%s': relo #%d: candidate #%d ([%d] %s) is off %u, len %d, raw_len %d\n",
> +			 prog_name, relo_idx, i, cand_id, cand_name,
> +			 cand_spec.offset, cand_spec.len, cand_spec.raw_len);
> +
> +		if (j == 0) {
> +			targ_spec = cand_spec;
> +		} else if (cand_spec.offset != targ_spec.offset) {
> +			/* if there are many candidates, they should all
> +			 * resolve to the same offset
> +			 */
> +			pr_warning("prog '%s': relo #%d: candidate #%d ([%d] %s): conflicting offset found (%u != %u)\n",
> +				   prog_name, relo_idx, i, cand_id, cand_name,
> +				   cand_spec.offset, targ_spec.offset);
> +			return -EINVAL;
> +		}
> +
> +		cand_ids->data[j++] = cand_spec.spec[0].type_id;
> +	}
> +
> +	cand_ids->len = j;
> +	if (cand_ids->len == 0) {
> +		pr_warning("prog '%s': relo #%d: no matching targets found for [%d] (%s) + %s\n",
> +			   prog_name, relo_idx, local_id, local_name, spec_str);
> +		return -ESRCH;
> +	}
> +
> +	err = bpf_core_reloc_insn(prog, relo->insn_off,
> +				  local_spec.offset, targ_spec.offset);
> +	if (err) {
> +		pr_warning("prog '%s': relo #%d: failed to patch insn at offset %d: %d\n",
> +			   prog_name, relo_idx, relo->insn_off, err);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +bpf_core_reloc_offsets(struct bpf_object *obj, const char *targ_btf_path)
> +{
> +	const struct btf_ext_info_sec *sec;
> +	const struct bpf_offset_reloc *rec;
> +	const struct btf_ext_info *seg;
> +	struct hashmap_entry *entry;
> +	struct hashmap *cand_cache = NULL;
> +	struct bpf_program *prog;
> +	struct btf *targ_btf;
> +	const char *sec_name;
> +	int i, err = 0;
> +
> +	if (targ_btf_path)
> +		targ_btf = btf__parse_elf(targ_btf_path, NULL);
> +	else
> +		targ_btf = bpf_core_find_kernel_btf();
> +	if (IS_ERR(targ_btf)) {
> +		pr_warning("failed to get target BTF: %ld\n",
> +			   PTR_ERR(targ_btf));
> +		return PTR_ERR(targ_btf);
> +	}
> +
> +	cand_cache = hashmap__new(bpf_core_hash_fn, bpf_core_equal_fn, NULL);
> +	if (IS_ERR(cand_cache)) {
> +		err = PTR_ERR(cand_cache);
> +		goto out;
> +	}
> +
> +	seg = &obj->btf_ext->offset_reloc_info;
> +	for_each_btf_ext_sec(seg, sec) {
> +		sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off);
> +		if (str_is_empty(sec_name)) {
> +			err = -EINVAL;
> +			goto out;
> +		}
> +		prog = bpf_object__find_program_by_title(obj, sec_name);
> +		if (!prog) {
> +			pr_warning("failed to find program '%s' for CO-RE offset relocation\n",
> +				   sec_name);
> +			err = -EINVAL;
> +			goto out;
> +		}
> +
> +		pr_debug("prog '%s': performing %d CO-RE offset relocs\n",
> +			 sec_name, sec->num_info);
> +
> +		for_each_btf_ext_rec(seg, sec, i, rec) {
> +			err = bpf_core_reloc_offset(prog, rec, i, obj->btf,
> +						    targ_btf, cand_cache);
> +			if (err) {
> +				pr_warning("prog '%s': relo #%d: failed to relocate: %d\n",
> +					   sec_name, i, err);
> +				goto out;
> +			}
> +		}
> +	}
> +
> +out:
> +	btf__free(targ_btf);
> +	if (!IS_ERR_OR_NULL(cand_cache)) {
> +		hashmap__for_each_entry(cand_cache, entry, i) {
> +			bpf_core_free_cands(entry->value);
> +		}
> +		hashmap__free(cand_cache);
> +	}
> +	return err;
> +}
> +
> +static int
> +bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)
> +{
> +	int err = 0;
> +
> +	if (obj->btf_ext->offset_reloc_info.len)
> +		err = bpf_core_reloc_offsets(obj, targ_btf_path);
> +
> +	return err;
> +}
> +
> static int
> bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj,
> 			struct reloc_desc *relo)
> @@ -2396,14 +3243,21 @@ bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj)
> 	return 0;
> }
> 
> -
> static int
> -bpf_object__relocate(struct bpf_object *obj)
> +bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path)
> {
> 	struct bpf_program *prog;
> 	size_t i;
> 	int err;
> 
> +	if (obj->btf_ext) {
> +		err = bpf_object__relocate_core(obj, targ_btf_path);
> +		if (err) {
> +			pr_warning("failed to perform CO-RE relocations: %d\n",
> +				   err);
> +			return err;
> +		}
> +	}
> 	for (i = 0; i < obj->nr_programs; i++) {
> 		prog = &obj->programs[i];
> 
> @@ -2804,7 +3658,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
> 	obj->loaded = true;
> 
> 	CHECK_ERR(bpf_object__create_maps(obj), err, out);
> -	CHECK_ERR(bpf_object__relocate(obj), err, out);
> +	CHECK_ERR(bpf_object__relocate(obj, attr->target_btf_path), err, out);
> 	CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out);
> 
> 	return 0;
> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> index 5cbf459ece0b..6004bfe1ebf0 100644
> --- a/tools/lib/bpf/libbpf.h
> +++ b/tools/lib/bpf/libbpf.h
> @@ -92,6 +92,7 @@ LIBBPF_API void bpf_object__close(struct bpf_object *object);
> struct bpf_object_load_attr {
> 	struct bpf_object *obj;
> 	int log_level;
> +	const char *target_btf_path;
> };
> 
> /* Load/unload object into/from kernel */
> -- 
> 2.17.1
>
Alexei Starovoitov July 25, 2019, 11:18 p.m. UTC | #2
On Wed, Jul 24, 2019 at 12:27:34PM -0700, Andrii Nakryiko wrote:
> This patch implements the core logic for BPF CO-RE offsets relocations.
> All the details are described in code comments.
> 
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> ---
>  tools/lib/bpf/libbpf.c | 866 ++++++++++++++++++++++++++++++++++++++++-
>  tools/lib/bpf/libbpf.h |   1 +
>  2 files changed, 861 insertions(+), 6 deletions(-)
> 
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 8741c39adb1c..86d87bf10d46 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -38,6 +38,7 @@
>  #include <sys/stat.h>
>  #include <sys/types.h>
>  #include <sys/vfs.h>
> +#include <sys/utsname.h>
>  #include <tools/libc_compat.h>
>  #include <libelf.h>
>  #include <gelf.h>
> @@ -47,6 +48,7 @@
>  #include "btf.h"
>  #include "str_error.h"
>  #include "libbpf_internal.h"
> +#include "hashmap.h"
>  
>  #ifndef EM_BPF
>  #define EM_BPF 247
> @@ -1013,16 +1015,22 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict)
>  }
>  
>  static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
> -						     __u32 id)
> +						     __u32 id,
> +						     __u32 *res_id)

I think it would be more readable to format it like:
static const struct btf_type *
skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id)

> +	} else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
> +		if (insn->imm != orig_off)
> +			return -EINVAL;
> +		insn->imm = new_off;
> +		pr_debug("prog '%s': patched insn #%d (ST | MEM) imm %d -> %d\n",
> +			 bpf_program__title(prog, false),
> +			 insn_idx, orig_off, new_off);

I'm pretty sure llvm was not capable of emitting BPF_ST insn.
When did that change?

> +/* 
> + * CO-RE relocate single instruction.
> + *
> + * The outline and important points of the algorithm:
> + * 1. For given local type, find corresponding candidate target types.
> + *    Candidate type is a type with the same "essential" name, ignoring
> + *    everything after last triple underscore (___). E.g., `sample`,
> + *    `sample___flavor_one`, `sample___flavor_another_one`, are all candidates
> + *    for each other. Names with triple underscore are referred to as
> + *    "flavors" and are useful, among other things, to allow to
> + *    specify/support incompatible variations of the same kernel struct, which
> + *    might differ between different kernel versions and/or build
> + *    configurations.

"flavors" is a convention of bpftool btf2c converter, right?
May be mention it here with pointer to the code?

> +	pr_debug("prog '%s': relo #%d: insn_off=%d, [%d] (%s) + %s\n",
> +		 prog_name, relo_idx, relo->insn_off,
> +		 local_id, local_name, spec_str);
> +
> +	err = bpf_core_spec_parse(local_btf, local_id, spec_str, &local_spec);
> +	if (err) {
> +		pr_warning("prog '%s': relo #%d: parsing [%d] (%s) + %s failed: %d\n",
> +			   prog_name, relo_idx, local_id, local_name, spec_str,
> +			   err);
> +		return -EINVAL;
> +	}
> +	pr_debug("prog '%s': relo #%d: [%d] (%s) + %s is off %u, len %d, raw_len %d\n",
> +		 prog_name, relo_idx, local_id, local_name, spec_str,
> +		 local_spec.offset, local_spec.len, local_spec.raw_len);

One warning and two debug messages that print more or less the same info seem like overkill.

> +	for (i = 0, j = 0; i < cand_ids->len; i++) {
> +		cand_id = cand_ids->data[j];
> +		cand_type = btf__type_by_id(targ_btf, cand_id);
> +		cand_name = btf__name_by_offset(targ_btf, cand_type->name_off);
> +
> +		err = bpf_core_spec_match(&local_spec, targ_btf,
> +					  cand_id, &cand_spec);
> +		if (err < 0) {
> +			pr_warning("prog '%s': relo #%d: failed to match spec [%d] (%s) + %s to candidate #%d [%d] (%s): %d\n",
> +				   prog_name, relo_idx, local_id, local_name,
> +				   spec_str, i, cand_id, cand_name, err);
> +			return err;
> +		}
> +		if (err == 0) {
> +			pr_debug("prog '%s': relo #%d: candidate #%d [%d] (%s) doesn't match spec\n",
> +				 prog_name, relo_idx, i, cand_id, cand_name);
> +			continue;
> +		}
> +
> +		pr_debug("prog '%s': relo #%d: candidate #%d ([%d] %s) is off %u, len %d, raw_len %d\n",
> +			 prog_name, relo_idx, i, cand_id, cand_name,
> +			 cand_spec.offset, cand_spec.len, cand_spec.raw_len);

I have the same feeling about the 3 printfs above.
Andrii Nakryiko July 27, 2019, 6:11 a.m. UTC | #3
On Thu, Jul 25, 2019 at 12:32 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Jul 24, 2019, at 12:27 PM, Andrii Nakryiko <andriin@fb.com> wrote:
> >
> > This patch implements the core logic for BPF CO-RE offsets relocations.
> > All the details are described in code comments.
>
> Some description in the change log is still useful. Please at least
> copy-paste key comments here.

OK, will add some more.

>
> And, this is looooong. I think it is totally possible to split it into
> multiple smaller patches.

I don't really know how to split it further without hurting the review
by artificially splitting related code into separate patches. Remove
any single function and the algorithm will be incomplete.

Let me give you a high-level overview of how the pieces are put
together. There are 9 non-trivial functions; let's go over their
purpose in the order in which they are defined in the file:

1. bpf_core_spec_parse()

This one takes bpf_offset_reloc's type_id and accessor string
("0:1:2:3") and parses it into the more convenient bpf_core_spec
data structure, which has the calculated offset and high-level spec
"steps": either named field or array access.

2. bpf_core_find_cands()

Given a local type name, finds all possible target BTF types with the
same name (modulo "flavor" differences; the ___flavor suffix is just
ignored).
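
A few hedged examples of how that essential-name matching plays out:

/* local "sample"       matches targets "sample" and "sample___flavor"
 * local "sample___v2"  has essential name "sample", so same candidates
 * local "sample2"      does NOT match "sample" (essential names differ)
 */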

3. bpf_core_fields_are_compat()

Given a local and target field match, checks that their types are
compatible (so that we don't accidentally match, e.g., an int against
a struct).
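
A sketch of a few cases following the rules documented at
bpf_core_fields_are_compat():

/* int     vs unsigned int  -> compatible (signedness is ignored)
 * int     vs long          -> not compatible (4 vs 8 bytes on BPF)
 * int *   vs struct foo *  -> compatible (any two pointers match)
 * int[5]  vs int[10]       -> compatible (dimensionality is ignored,
 *                             element types checked recursively)
 */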

4. bpf_core_match_member()

Given a named local field, finds the corresponding field in the target
struct. To understand why it's not trivial, here's an example:

Local type:

struct s___local {
  int a;
};

Target type:

struct s___target {
  struct {
    union {
      int a;
    };
  };
};

In both cases you can access a as s.a, but in the local case field a is
immediately inside s___local, while for s___target you have to traverse
two levels of anonymous fields to get to the `a` inside the anonymous
union.

So this function finds that `a` by doing an exhaustive search across
all named fields and anonymous structs/unions. Otherwise it's a pretty
straightforward recursive function.
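
A sketch of the resulting specs for that example:

/* local  s___local.a:   raw spec "0:0"     -> offset 0
 * target s___target.a:  raw spec "0:0:0:0" -> offset 0
 * high-level spec is [s[0], field 'a'] in both cases
 */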

5. bpf_core_spec_match()

Just goes over the high-level spec steps in the local spec and tries to
figure out both high-level and low-level steps for the target type.
Consider the above example. For both structs, accessing s.a is one
high-level step, but for s___local it's a single low-level step (just
another :0 in the spec string), while for s___target it's three
low-level steps: ":0:0:0", one step for each BTF type we need to
traverse.

Array access is simpler: it's always one high-level and one low-level step.

6. bpf_core_reloc_insn()

Once we match local and target specs and have local and target offsets,
do the relocation: check that the instruction has the expected local
offset and replace it with the target offset.
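
A minimal sketch of that patching (hypothetical offsets, using the UAPI
struct bpf_insn from <linux/bpf.h>):

#include <linux/bpf.h>

int main(void)
{
	struct bpf_insn insn = {
		.code = BPF_ALU64 | BPF_ADD | BPF_K,	/* rX += imm */
		.imm = 24,	/* compiled-in local field offset */
	};

	/* bpf_core_reloc_insn() validates insn.imm == 24 (the local
	 * offset), then writes the target offset in place:
	 */
	insn.imm = 32;
	return 0;
}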

7. bpf_core_find_kernel_btf()

This is the only function that could be moved into a separate patch,
but it's also very simple. It just iterates over a few known possible
locations for the vmlinux image and, once one is found, tries to parse
.BTF out of it, to be used as the target BTF.

8. bpf_core_reloc_offset()

It combines all the above functions to perform a single relocation.
Parse the spec, get candidates, and for each candidate try to find a
matching target spec. All candidates that matched are cached for the
given local root type.

9. bpf_core_reloc_offsets()

High-level coordination. Iterate over all per-program .BTF.ext offset
reloc sections and each relocation within them. Find the corresponding
program and try to apply the relocations one by one.


I think the only non-obvious part here is understanding that the
relocation records the local raw spec with every single anonymous type
traversal, which is not that useful when we try to match it against a
target type, which can have a very different composition but still the
same field access pattern from the C language standpoint (C hides all
those anonymous type traversals from the programmer).

But it should be pretty clear now. Also check the tests; they have
lots of cases showing what's compatible and what's not.


>
> I haven't finished all of it. Please see my comments below on the
> parts I have covered.
>
> Thanks,
> Song
>
> >
> > Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> > ---
> > tools/lib/bpf/libbpf.c | 866 ++++++++++++++++++++++++++++++++++++++++-
> > tools/lib/bpf/libbpf.h |   1 +
> > 2 files changed, 861 insertions(+), 6 deletions(-)
> >
> > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> > index 8741c39adb1c..86d87bf10d46 100644
> > --- a/tools/lib/bpf/libbpf.c
> > +++ b/tools/lib/bpf/libbpf.c
> > @@ -38,6 +38,7 @@
> > #include <sys/stat.h>
> > #include <sys/types.h>
> > #include <sys/vfs.h>
> > +#include <sys/utsname.h>
> > #include <tools/libc_compat.h>
> > #include <libelf.h>
> > #include <gelf.h>
> > @@ -47,6 +48,7 @@
> > #include "btf.h"
> > #include "str_error.h"
> > #include "libbpf_internal.h"
> > +#include "hashmap.h"
> >
> > #ifndef EM_BPF
> > #define EM_BPF 247
> > @@ -1013,16 +1015,22 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict)
> > }
> >
> > static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
> > -                                                  __u32 id)
> > +                                                  __u32 id,
> > +                                                  __u32 *res_id)
> > {
> >       const struct btf_type *t = btf__type_by_id(btf, id);
>
> Maybe have a local "__u32 rid;"
>
> >
> > +     if (res_id)
> > +             *res_id = id;
> > +
>
> and do "rid = id;" here
>
> >       while (true) {
> >               switch (BTF_INFO_KIND(t->info)) {
> >               case BTF_KIND_VOLATILE:
> >               case BTF_KIND_CONST:
> >               case BTF_KIND_RESTRICT:
> >               case BTF_KIND_TYPEDEF:
> > +                     if (res_id)
> > +                             *res_id = t->type;
> and here
>
> >                       t = btf__type_by_id(btf, t->type);
> >                       break;
> >               default:
> and "*res_id = rid;" right before return?

Sure, but why?

>
> > @@ -1041,7 +1049,7 @@ static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
> > static bool get_map_field_int(const char *map_name, const struct btf *btf,
> >                             const struct btf_type *def,
> >                             const struct btf_member *m, __u32 *res) {

[...]

> > +struct bpf_core_spec {
> > +     const struct btf *btf;
> > +     /* high-level spec: named fields and array indicies only */
>
> typo: indices

thanks!

>
> > +     struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN];
> > +     /* high-level spec length */
> > +     int len;
> > +     /* raw, low-level spec: 1-to-1 with accessor spec string */
> > +     int raw_spec[BPF_CORE_SPEC_MAX_LEN];
> > +     /* raw spec length */
> > +     int raw_len;
> > +     /* field byte offset represented by spec */
> > +     __u32 offset;
> > +};

[...]

> > + *
> > + *   int x = &s->a[3]; // access string = '0:1:2:3'
> > + *
> > + * Low-level spec has 1:1 mapping with each element of access string (it's
> > + * just a parsed access string representation): [0, 1, 2, 3].
> > + *
> > + * High-level spec will capture only 3 points:
> > + *   - intial zero-index access by pointer (&s->... is the same as &s[0]...);
> > + *   - field 'a' access (corresponds to '2' in low-level spec);
> > + *   - array element #3 access (corresponds to '3' in low-level spec).
> > + *
> > + */
>
> IIUC, high-level points are a subset of low-level points. How about we
> introduce "anonymous" high-level points, so that high-level points and
> low-level points are in a 1:1 mapping?

No, that will just hurt and complicate things. See the explanation
above about why we need high-level points (they capture what you as a
C programmer try to achieve, while the low-level spec is what the C
language does in reality, with all the anonymous struct/union
traversals).

What's wrong with this separation? Think about it as recording "intent"
(high-level spec) vs "mechanics" (low-level spec: how exactly to
achieve that intent, in excruciating detail).

>
> > +static int bpf_core_spec_parse(const struct btf *btf,
> > +                            __u32 type_id,
> > +                            const char *spec_str,
> > +                            struct bpf_core_spec *spec)
> > +{
> > +     int access_idx, parsed_len, i;
> > +     const struct btf_type *t;
> > +     __u32 id = type_id;
> > +     const char *name;
> > +     __s64 sz;
> > +
> > +     if (str_is_empty(spec_str) || *spec_str == ':')
> > +             return -EINVAL;
> > +
> > +     memset(spec, 0, sizeof(*spec));
> > +     spec->btf = btf;
> > +
> > +     /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */
> > +     while (*spec_str) {
> > +             if (*spec_str == ':')
> > +                     ++spec_str;
> > +             if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1)
> > +                     return -EINVAL;
> > +             if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
> > +                     return -E2BIG;
> > +             spec_str += parsed_len;
> > +             spec->raw_spec[spec->raw_len++] = access_idx;
> > +     }
> > +
> > +     if (spec->raw_len == 0)
> > +             return -EINVAL;
> > +
> > +     for (i = 0; i < spec->raw_len; i++) {
> > +             t = skip_mods_and_typedefs(btf, id, &id);
> > +             if (!t)
> > +                     return -EINVAL;
> > +
> > +             access_idx = spec->raw_spec[i];
> > +
> > +             if (i == 0) {
> > +                     /* first spec value is always reloc type array index */
> > +                     spec->spec[spec->len].type_id = id;
> > +                     spec->spec[spec->len].idx = access_idx;
> > +                     spec->len++;
> > +
> > +                     sz = btf__resolve_size(btf, id);
> > +                     if (sz < 0)
> > +                             return sz;
> > +                     spec->offset += access_idx * sz;
>           spec->offset = access_idx * sz;  should be enough

No. spec->offset is carefully maintained across multiple low-level
steps, as we traverse down embedded structs/unions.

Think about, e.g.:

struct s {
    int a;
    struct {
        int b;
    };
};

Imagine you are trying to match s.b access. With what you propose
you'll end up with offset 0, but it should be 4.
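
To make it concrete, a hypothetical trace of spec->offset for that s.b
access (access string "0:1:0"):

/* i=0: offset += 0 * sizeof(struct s)  -> 0  (initial array index)
 * i=1: offset += 4                     -> 4  (anon struct at byte 4)
 * i=2: offset += 0                     -> 4  (b at byte 0 within it)
 */

The final value is accumulated across all the steps.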

>
> > +                     continue;
> > +             }
>
> Maybe pull i == 0 case out of the for loop?
>
> > +
> > +             if (btf_is_composite(t)) {

[...]

> > +
> > +     if (spec->len == 0)
> > +             return -EINVAL;
>
> Can this ever happen?

Not really, because I already check raw_len == 0 and exit with error.
I'll remove.

>
> > +
> > +     return 0;
> > +}
> > +

[...]

> > +
> > +/*
> > + * Given single high-level accessor (either named field or array index) in
> > + * local type, find corresponding high-level accessor for a target type. Along
> > + * the way, maintain low-level spec for target as well. Also keep updating
> > + * target offset.
> > + */
>
> Please describe the recursive algorithm here. I am kinda lost.

Explained above. I'll extend the description a bit. But it's just a
recursive exhaustive search:
1. If a struct field is anonymous and is a struct/union, go one level
deeper and try to find a field with the given name inside it.
2. If a field has a name and it matches what we are searching for,
check type compatibility. It has to be compatible, so if it's not,
then it's not a match.

> Also, please document the meaning of zero, positive, negative return values.

Ok. It's the standard convention: <0 - error, 0 - false, 1 - true.

>
> > +static int bpf_core_match_member(const struct btf *local_btf,
> > +                              const struct bpf_core_accessor *local_acc,
> > +                              const struct btf *targ_btf,
> > +                              __u32 targ_id,
> > +                              struct bpf_core_spec *spec,
> > +                              __u32 *next_targ_id)
> > +{

[...]

> > +
> > +/*
> > + * Try to match local spec to a target type and, if successful, produce full
> > + * target spec (high-level, low-level + offset).
> > + */
> > +static int bpf_core_spec_match(struct bpf_core_spec *local_spec,
> > +                            const struct btf *targ_btf, __u32 targ_id,
> > +                            struct bpf_core_spec *targ_spec)
> > +{
> > +     const struct btf_type *targ_type;
> > +     const struct bpf_core_accessor *local_acc;
> > +     struct bpf_core_accessor *targ_acc;
> > +     int i, sz, matched;
> > +
> > +     memset(targ_spec, 0, sizeof(*targ_spec));
> > +     targ_spec->btf = targ_btf;
> > +
> > +     local_acc = &local_spec->spec[0];
> > +     targ_acc = &targ_spec->spec[0];
> > +
> > +     for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) {
> > +             targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id,
> > +                                                &targ_id);
> > +             if (!targ_type)
> > +                     return -EINVAL;
> > +
> > +             if (local_acc->name) {
> > +                     if (!btf_is_composite(targ_type))
> > +                             return 0;
> > +
> > +                     matched = bpf_core_match_member(local_spec->btf,
> > +                                                     local_acc,
> > +                                                     targ_btf, targ_id,
> > +                                                     targ_spec, &targ_id);
> > +                     if (matched <= 0)
> > +                             return matched;
> > +             } else {
> > +                     /* for i=0, targ_id is already treated as array element
> > +                      * type (because it's the original struct), for others
> > +                      * we should find array element type first
> > +                      */
> > +                     if (i > 0) {
>
> i == 0 case would go into "if (local_acc->name)" branch, no?

No, i == 0 is always an array access. s->a.b.c is the same as
s[0].a.b.c, so the relocation's first spec element is always either
zero for pointer access or a non-negative index for array access. But
it is always an array access.
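
A quick hypothetical illustration:

struct s { int a; };
struct s arr[4];

int x = arr[2].a;   /* access string "2:0": array index 2 first,
		     * then field a at member index 0 */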

>
> > +                             const struct btf_array *a;
> > +
> > +                             if (!btf_is_array(targ_type))
> > +                                     return 0;
> > +
> > +                             a = (void *)(targ_type + 1);

[...]
Andrii Nakryiko July 27, 2019, 6:25 a.m. UTC | #4
On Thu, Jul 25, 2019 at 4:18 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Jul 24, 2019 at 12:27:34PM -0700, Andrii Nakryiko wrote:
> > This patch implements the core logic for BPF CO-RE offsets relocations.
> > All the details are described in code comments.
> >
> > Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> > ---
> >  tools/lib/bpf/libbpf.c | 866 ++++++++++++++++++++++++++++++++++++++++-
> >  tools/lib/bpf/libbpf.h |   1 +
> >  2 files changed, 861 insertions(+), 6 deletions(-)
> >
> > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> > index 8741c39adb1c..86d87bf10d46 100644
> > --- a/tools/lib/bpf/libbpf.c
> > +++ b/tools/lib/bpf/libbpf.c
> > @@ -38,6 +38,7 @@
> >  #include <sys/stat.h>
> >  #include <sys/types.h>
> >  #include <sys/vfs.h>
> > +#include <sys/utsname.h>
> >  #include <tools/libc_compat.h>
> >  #include <libelf.h>
> >  #include <gelf.h>
> > @@ -47,6 +48,7 @@
> >  #include "btf.h"
> >  #include "str_error.h"
> >  #include "libbpf_internal.h"
> > +#include "hashmap.h"
> >
> >  #ifndef EM_BPF
> >  #define EM_BPF 247
> > @@ -1013,16 +1015,22 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict)
> >  }
> >
> >  static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
> > -                                                  __u32 id)
> > +                                                  __u32 id,
> > +                                                  __u32 *res_id)
>
> I think it would be more readable to format it like:
> static const struct btf_type *
> skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id)

Ok.

>
> > +     } else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
> > +             if (insn->imm != orig_off)
> > +                     return -EINVAL;
> > +             insn->imm = new_off;
> > +             pr_debug("prog '%s': patched insn #%d (ST | MEM) imm %d -> %d\n",
> > +                      bpf_program__title(prog, false),
> > +                      insn_idx, orig_off, new_off);
>
> I'm pretty sure llvm was not capable of emitting BPF_ST insn.
> When did that change?

I just looked at possible instructions that could have a 32-bit
immediate value. This is `*(rX) = offsetof(struct s, field)`, which I
thought is conceivable. Do you think I should drop it?

>
> > +/*
> > + * CO-RE relocate single instruction.
> > + *
> > + * The outline and important points of the algorithm:
> > + * 1. For given local type, find corresponding candidate target types.
> > + *    Candidate type is a type with the same "essential" name, ignoring
> > + *    everything after last triple underscore (___). E.g., `sample`,
> > + *    `sample___flavor_one`, `sample___flavor_another_one`, are all candidates
> > + *    for each other. Names with triple underscore are referred to as
> > + *    "flavors" and are useful, among other things, to allow to
> > + *    specify/support incompatible variations of the same kernel struct, which
> > + *    might differ between different kernel versions and/or build
> > + *    configurations.
>
> "flavors" is a convention of bpftool btf2c converter, right?
> May be mention it here with pointer to the code?

Yes, the btf2c converter generates "flavors" on type name conflicts
(adding ___2, ___3), but that's not the only use case. It's a general
way to have independent incompatible definitions for the same target
type. E.g., locally in your BPF program you can define two
thread_structs to accommodate a field rename across kernel versions:

struct thread_struct___before_47 {
    long fs;
};

struct thread_struct___after_47 {
    long fsbase;
};

Then with conditional relocations you'll use one of them to "extract"
it from the real kernel's thread_struct:

void *fsbase;

if (LINUX_VERSION < 407)
    BPF_CORE_READ(&fsbase, sizeof(fsbase),
                  &((struct thread_struct___before_47 *)&thread)->fs);
else
    BPF_CORE_READ(&fsbase, sizeof(fsbase),
                  &((struct thread_struct___after_47 *)&thread)->fsbase);

So it works both ways (for local and target types) by design. I can
mention that the btf2c converter uses this convention for types with
conflicting names, but btf2c is not the definition of what a flavor is.

>
> > +     pr_debug("prog '%s': relo #%d: insn_off=%d, [%d] (%s) + %s\n",
> > +              prog_name, relo_idx, relo->insn_off,
> > +              local_id, local_name, spec_str);
> > +
> > +     err = bpf_core_spec_parse(local_btf, local_id, spec_str, &local_spec);
> > +     if (err) {
> > +             pr_warning("prog '%s': relo #%d: parsing [%d] (%s) + %s failed: %d\n",
> > +                        prog_name, relo_idx, local_id, local_name, spec_str,
> > +                        err);
> > +             return -EINVAL;
> > +     }
> > +     pr_debug("prog '%s': relo #%d: [%d] (%s) + %s is off %u, len %d, raw_len %d\n",
> > +              prog_name, relo_idx, local_id, local_name, spec_str,
> > +              local_spec.offset, local_spec.len, local_spec.raw_len);
>
> One warning and two debug messages that print more or less the same info seem like overkill.

Only one of them will ever be emitted, though. And this information is
and will be invaluable for debugging issues and explaining behavior in
the future once adoption starts. So I'm inclined to keep them, at least
for now. But I think I'll extract spec formatting into a separate
reusable function, which will make this significantly less verbose.

>
> > +     for (i = 0, j = 0; i < cand_ids->len; i++) {
> > +             cand_id = cand_ids->data[j];
> > +             cand_type = btf__type_by_id(targ_btf, cand_id);
> > +             cand_name = btf__name_by_offset(targ_btf, cand_type->name_off);
> > +
> > +             err = bpf_core_spec_match(&local_spec, targ_btf,
> > +                                       cand_id, &cand_spec);
> > +             if (err < 0) {
> > +                     pr_warning("prog '%s': relo #%d: failed to match spec [%d] (%s) + %s to candidate #%d [%d] (%s): %d\n",
> > +                                prog_name, relo_idx, local_id, local_name,
> > +                                spec_str, i, cand_id, cand_name, err);
> > +                     return err;
> > +             }
> > +             if (err == 0) {
> > +                     pr_debug("prog '%s': relo #%d: candidate #%d [%d] (%s) doesn't match spec\n",
> > +                              prog_name, relo_idx, i, cand_id, cand_name);
> > +                     continue;
> > +             }
> > +
> > +             pr_debug("prog '%s': relo #%d: candidate #%d ([%d] %s) is off %u, len %d, raw_len %d\n",
> > +                      prog_name, relo_idx, i, cand_id, cand_name,
> > +                      cand_spec.offset, cand_spec.len, cand_spec.raw_len);
>
> have the same feeling about 3 printfs above.
>

Same as above.
Alexei Starovoitov July 27, 2019, 5 p.m. UTC | #5
On 7/26/19 11:25 PM, Andrii Nakryiko wrote:
>>> +     } else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
>>> +             if (insn->imm != orig_off)
>>> +                     return -EINVAL;
>>> +             insn->imm = new_off;
>>> +             pr_debug("prog '%s': patched insn #%d (ST | MEM) imm %d -> %d\n",
>>> +                      bpf_program__title(prog, false),
>>> +                      insn_idx, orig_off, new_off);
>> I'm pretty sure llvm was not capable of emitting BPF_ST insn.
>> When did that change?
> I just looked at possible instructions that could have 32-bit
> immediate value. This is `*(rX) = offsetof(struct s, field)`, which I
> thought is conceivable. Do you think I should drop it?

Just trying to point out that since it's not emitted by llvm
this code is likely untested?
Or you've created a bpf asm test for this?
Andrii Nakryiko July 27, 2019, 6:24 p.m. UTC | #6
On Sat, Jul 27, 2019 at 10:00 AM Alexei Starovoitov <ast@fb.com> wrote:
>
> On 7/26/19 11:25 PM, Andrii Nakryiko wrote:
> >>> +     } else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
> >>> +             if (insn->imm != orig_off)
> >>> +                     return -EINVAL;
> >>> +             insn->imm = new_off;
> >>> +             pr_debug("prog '%s': patched insn #%d (ST | MEM) imm %d -> %d\n",
> >>> +                      bpf_program__title(prog, false),
> >>> +                      insn_idx, orig_off, new_off);
> >> I'm pretty sure llvm was not capable of emitting BPF_ST insn.
> >> When did that change?
> > I just looked at possible instructions that could have 32-bit
> > immediate value. This is `*(rX) = offsetof(struct s, field)`, which I
> > thought is conceivable. Do you think I should drop it?
>
> Just trying to point out that since it's not emitted by llvm
> this code is likely untested?
> Or you've created a bpf asm test for this?


Yeah, it's untested right now. Let me try to come up with LLVM
assembly + relocation (not yet sure how/whether the builtin works with
inline assembly). If that works out, I'll leave this; if not, I'll
drop the BPF_ST | BPF_MEM part.

Song Liu July 27, 2019, 6:59 p.m. UTC | #7
> On Jul 26, 2019, at 11:11 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> 
> On Thu, Jul 25, 2019 at 12:32 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Jul 24, 2019, at 12:27 PM, Andrii Nakryiko <andriin@fb.com> wrote:
>>> 
>>> This patch implements the core logic for BPF CO-RE offsets relocations.
>>> All the details are described in code comments.
>> 
>> Some description in the change log is still useful. Please at least
>> copy-paste key comments here.
> 
> OK, will add some more.
> 
>> 
>> And, this is looooong. I think it is totally possible to split it into
>> multiple smaller patches.
> 
> I don't really know how to split it further without hurting reviewing
> by artificially splitting related code into separate patches. Remove
> any single function and algorithm will be incomplete.
> 
> Let me give you some high-level overview of how pieces are put
> together. There are 9 non-trivial functions, let's go over their
> purpose in the order in which they are defined in the file:
> 
> 1. bpf_core_spec_parse()
> 
> This one takes bpf_offset_reloc's type_id and accessor string
> ("0:1:2:3") and parses it into the more convenient bpf_core_spec
> data structure, which has a calculated offset and high-level spec
> "steps": either named field or array access.
> 
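As a side note, the accessor string parsing itself is compact; a minimal
sketch condensed from the patch (error handling trimmed):

	/* parse spec_str="0:1:2:3" into raw_spec=[0, 1, 2, 3] */
	while (*spec_str) {
		if (*spec_str == ':')
			++spec_str;
		if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1)
			return -EINVAL;
		spec_str += parsed_len;
		spec->raw_spec[spec->raw_len++] = access_idx;
	}
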
> 2. bpf_core_find_cands()
> 
> Given a local type name, finds all possible target BTF types with the
> same name (modulo "flavor" differences; the ___flavor suffix is just
> ignored).
> 
> 3. bpf_core_fields_are_compat()
> 
> Given a local and target field match, checks that their types are
> compatible (so that we don't accidentally match, e.g., an int against
> a struct).
> 
> 4. bpf_core_match_member()
> 
> Given a named local field, find the corresponding field in the target
> struct. To understand why it's not trivial, here's an example:
> 
> Local type:
> 
> struct s___local {
>  int a;
> };
> 
> Target type:
> 
> struct s___target {
>  struct {
>    union {
>      int a;
>    };
>  };
> };
> 
> For both cases you can access a as s.a, but in the local case, field a
> is immediately inside s___local, while for s___target, you have to
> traverse two levels deeper into anonymous fields to get to an `a`
> inside the anonymous union.
>
> So this function finds that `a` by doing an exhaustive search across
> all named fields and anonymous structs/unions. But otherwise it's a
> pretty straightforward recursive function.
> 
> 5. bpf_core_spec_match()
>
> Just goes over the high-level spec steps in the local spec and tries
> to figure out both high-level and low-level steps for the target type.
> Consider the above example. For both structs accessing s.a is one
> high-level step, but for s___local it's a single low-level step (just
> another :0 in the spec string), while for s___target it's three
> low-level steps: ":0:0:0", one step for each BTF type we need to
> traverse.
>
> Array access is simpler; it's always one high-level and one low-level
> step.
> 
> 6. bpf_core_reloc_insn()
>
> Once we match local and target specs and have local and target
> offsets, do the relocation - check that the instruction has the
> expected local offset and replace it with the target offset.
> 
> 7. bpf_core_find_kernel_btf()
>
> This is the only function that can be moved into a separate patch, but
> it's also very simple. It just iterates over a few known possible
> locations for the vmlinux image and, once one is found, tries to parse
> .BTF out of it, to be used as the target BTF.
> 
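A rough sketch of how that probing could look (the path below is
illustrative, not copied from the patch):

	struct utsname buf;
	char path[PATH_MAX];
	struct btf *btf;

	uname(&buf);
	/* try a well-known location for a BTF-enabled vmlinux image */
	snprintf(path, sizeof(path), "/boot/vmlinux-%s", buf.release);
	btf = btf__parse_elf(path, NULL);
	if (!IS_ERR(btf))
		return btf;
	/* ...fall back to other locations in the same way... */
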
> 8. bpf_core_reloc_offset()
>
> It combines all the above functions to perform a single relocation:
> parse the spec, get candidates, and for each candidate try to find a
> matching target spec. All candidates that matched are cached for the
> given local root type.
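
In pseudocode, the per-relocation flow described above is roughly (a
sketch, not the literal code; argument lists are abbreviated):

	err = bpf_core_spec_parse(local_btf, local_id, spec_str, &local_spec);
	cand_ids = bpf_core_find_cands(local_btf, local_id, targ_btf);
	for each cand_id in cand_ids:
		if (bpf_core_spec_match(&local_spec, targ_btf, cand_id,
					&cand_spec) == 1)
			/* remember matching candidate and its target spec */
	err = bpf_core_reloc_insn(prog, insn_off, local_spec.offset,
				  cand_spec.offset);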

Thanks for these explanations. They are really helpful.

I think an example explaining each step of bpf_core_reloc_offset()
would be very helpful. Something like:

Example:

struct s {
	int a;
	struct {
		int b;
		bool c;
	};
};

To get offset for c, we do:

bpf_core_reloc_offset() {
	
	/* input data: xxx */

	/* first step: bpf_core_spec_parse() */

	/* data after first step */

	/* second step: bpf_core_find_cands() */

	/* candidate A and B after second step */

	...
}

Well, it requires quite some work to document this way. Please let me 
know if you feel this is an overkill. 

> 
> 9. bpf_core_reloc_offsets()
>
> High-level coordination. Iterate over all per-program .BTF.ext offset
> reloc sections and each relocation within them. Find the corresponding
> program and try to apply the relocations one by one.
> 
> 
> I think the only non-obvious part here is to understand that the
> relocation records the local raw spec with every single anonymous type
> traversal, which is not that useful when we try to match it against
> the target type, which can have a very different composition but still
> the same field access pattern from the C language standpoint (which
> hides all those anonymous type traversals from the programmer).
> 
> But it should be pretty clear now; also check the tests, they have
> lots of cases showing what's compatible and what's not.

I see. I will review the tests. 

>>> 
>>> static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
>>> -                                                  __u32 id)
>>> +                                                  __u32 id,
>>> +                                                  __u32 *res_id)
>>> {
>>>      const struct btf_type *t = btf__type_by_id(btf, id);
>> 
>> Maybe have a local "__u32 rid;"
>> 
>>> 
>>> +     if (res_id)
>>> +             *res_id = id;
>>> +
>> 
>> and do "rid = id;" here
>> 
>>>      while (true) {
>>>              switch (BTF_INFO_KIND(t->info)) {
>>>              case BTF_KIND_VOLATILE:
>>>              case BTF_KIND_CONST:
>>>              case BTF_KIND_RESTRICT:
>>>              case BTF_KIND_TYPEDEF:
>>> +                     if (res_id)
>>> +                             *res_id = t->type;
>> and here
>> 
>>>                      t = btf__type_by_id(btf, t->type);
>>>                      break;
>>>              default:
>> and "*res_id = rid;" right before return?
> 
> Sure, but why?

I think it is cleaner that way. But feel free to ignore if you
think otherwise. 

> 
>> 
>>> @@ -1041,7 +1049,7 @@ static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
>>> static bool get_map_field_int(const char *map_name, const struct btf *btf,
>>>                            const struct btf_type *def,
>>>                            const struct btf_member *m, __u32 *res) {
> 
> [...]
> 
>>> +struct bpf_core_spec {
>>> +     const struct btf *btf;
>>> +     /* high-level spec: named fields and array indicies only */
>> 
>> typo: indices
> 
> thanks!
> 
>> 
>>> +     struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN];
>>> +     /* high-level spec length */
>>> +     int len;
>>> +     /* raw, low-level spec: 1-to-1 with accessor spec string */
>>> +     int raw_spec[BPF_CORE_SPEC_MAX_LEN];
>>> +     /* raw spec length */
>>> +     int raw_len;
>>> +     /* field byte offset represented by spec */
>>> +     __u32 offset;
>>> +};
> 
> [...]
> 
>>> + *
>>> + *   int x = &s->a[3]; // access string = '0:1:2:3'
>>> + *
>>> + * Low-level spec has 1:1 mapping with each element of access string (it's
>>> + * just a parsed access string representation): [0, 1, 2, 3].
>>> + *
>>> + * High-level spec will capture only 3 points:
>>> + *   - intial zero-index access by pointer (&s->... is the same as &s[0]...);
>>> + *   - field 'a' access (corresponds to '2' in low-level spec);
>>> + *   - array element #3 access (corresponds to '3' in low-level spec).
>>> + *
>>> + */
>> 
>> IIUC, high-level points are a subset of low-level points. How about
>> we introduce "anonymous" high-level points, so that high-level points
>> and low-level points are in 1:1 mapping?
> 
> No, that will just hurt and complicate things. See the above
> explanation about why we need high-level points (it's what you as a C
> programmer try to achieve, vs the low-level spec, which is what the C
> language does in reality, with all the anonymous struct/union
> traversal).
>
> What's wrong with this separation? Think about it as recording
> "intent" (high-level spec) vs "mechanics" (low-level spec, how exactly
> to achieve that intent, in excruciating detail).

There is nothing wrong with separation. I just personally think it is 
cleaner the other way. That's why I raised the question. 

I will go with your assessment, as you looked into this much more than 
I did. :-)

[...]

>>> +
>>> +     memset(spec, 0, sizeof(*spec));
>>> +     spec->btf = btf;
>>> +
>>> +     /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */
>>> +     while (*spec_str) {
>>> +             if (*spec_str == ':')
>>> +                     ++spec_str;
>>> +             if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1)
>>> +                     return -EINVAL;
>>> +             if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
>>> +                     return -E2BIG;
>>> +             spec_str += parsed_len;
>>> +             spec->raw_spec[spec->raw_len++] = access_idx;
>>> +     }
>>> +
>>> +     if (spec->raw_len == 0)
>>> +             return -EINVAL;
>>> +
>>> +     for (i = 0; i < spec->raw_len; i++) {
>>> +             t = skip_mods_and_typedefs(btf, id, &id);
>>> +             if (!t)
>>> +                     return -EINVAL;
>>> +
>>> +             access_idx = spec->raw_spec[i];
>>> +
>>> +             if (i == 0) {
>>> +                     /* first spec value is always reloc type array index */
>>> +                     spec->spec[spec->len].type_id = id;
>>> +                     spec->spec[spec->len].idx = access_idx;
>>> +                     spec->len++;
>>> +
>>> +                     sz = btf__resolve_size(btf, id);
>>> +                     if (sz < 0)
>>> +                             return sz;
>>> +                     spec->offset += access_idx * sz;
>>          spec->offset = access_idx * sz;  should be enough
> 
> No. spec->offset is carefully maintained across multiple low-level
> steps, as we traverse down embedded structs/unions.
> 
> Think about, e.g.:
> 
> struct s {
>    int a;
>    struct {
>        int b;
>    };
> };
> 
> Imagine you are trying to match s.b access. With what you propose
> you'll end up with offset 0, but it should be 4.

Hmm... this is just for i == 0, right? Which line updated spec->offset
after "memset(spec, 0, sizeof(*spec));"?

> 
>> 
>>> +                     continue;
>>> +             }
>> 
>> Maybe pull i == 0 case out of the for loop?
>> 
>>> +
>>> +             if (btf_is_composite(t)) {
> 
> [...]
> 
>>> +
>>> +     if (spec->len == 0)
>>> +             return -EINVAL;
>> 
>> Can this ever happen?
> 
> Not really, because I already check raw_len == 0 and exit with error.
> I'll remove.
> 
>> 
>>> +
>>> +     return 0;
>>> +}
>>> +
> 
> [...]
> 
>>> +
>>> +/*
>>> + * Given single high-level accessor (either named field or array index) in
>>> + * local type, find corresponding high-level accessor for a target type. Along
>>> + * the way, maintain low-level spec for target as well. Also keep updating
>>> + * target offset.
>>> + */
>> 
>> Please describe the recursive algorithm here. I am kinda lost.
> 
> Explained above. I'll extend the description a bit. But it's just a
> recursive exhaustive search:
> 1. if a struct field is anonymous and is a struct/union, go one level
> deeper and try to find a field with the given name inside it.
> 2. if a field has a name and it matches what we are searching for -
> check type compatibility. It has to be compatible, so if it's not,
> then it's not a match.
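
In rough pseudocode, that search could look like this (a sketch of the
algorithm just described, not the patch's exact code):

	for each member m of the target struct/union:
		if m is an anonymous struct/union:
			if bpf_core_match_member(..., m->type, ...) == 1:
				return 1; /* found deeper down */
		else if m's name matches the local field's name:
			return bpf_core_fields_are_compat(local_btf, ...,
							  targ_btf, m->type);
	return 0; /* no match in this subtree */
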
> 
>> Also, please document the meaning of zero, positive, negative return values.
> 
> Ok. It's standard <0 - error, 0 - false, 1 - true.
> 
>> 
>>> +static int bpf_core_match_member(const struct btf *local_btf,
>>> +                              const struct bpf_core_accessor *local_acc,
>>> +                              const struct btf *targ_btf,
>>> +                              __u32 targ_id,
>>> +                              struct bpf_core_spec *spec,
>>> +                              __u32 *next_targ_id)
>>> +{
> 
> [...]
> 
>>> +             if (local_acc->name) {
>>> +                     if (!btf_is_composite(targ_type))
>>> +                             return 0;
>>> +
>>> +                     matched = bpf_core_match_member(local_spec->btf,
>>> +                                                     local_acc,
>>> +                                                     targ_btf, targ_id,
>>> +                                                     targ_spec, &targ_id);
>>> +                     if (matched <= 0)
>>> +                             return matched;
>>> +             } else {
>>> +                     /* for i=0, targ_id is already treated as array element
>>> +                      * type (because it's the original struct), for others
>>> +                      * we should find array element type first
>>> +                      */
>>> +                     if (i > 0) {
>> 
>> i == 0 case would go into "if (local_acc->name)" branch, no?
> 
> No, i == 0 is always an array access. s->a.b.c is the same as
> s[0].a.b.c, so the relocation's first spec element is always either
> zero for pointer access or some non-negative index for array access.
> But it is always an array access.
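
A tiny illustration (hypothetical accesses; the rest of each access
string depends on the struct layout):

	struct s *s = ...;
	... = s->a;   /* same as s[0].a; access string starts with "0:" */
	... = s[3].a; /* array access proper; string starts with "3:" */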

I see. Thanks for the explanation.

Song
Andrii Nakryiko July 27, 2019, 7:09 p.m. UTC | #8
On Sat, Jul 27, 2019 at 11:59 AM Song Liu <songliubraving@fb.com> wrote:
>
> [...]
>
> Well, it requires quite some work to document this way. Please let me
> know if you feel this is an overkill.

Yeah :) And it's not just work; I think it's bad if comments become too
specific and document very low-level steps, because code might evolve
and comments can quickly get out of sync and just add to confusion.
Which is why I tried to document high-level ideas, leaving it up to the
source code to be the ultimate reference for the minute details.

>
> [...]
> >> IIUC, high-level points are a subset of low-level points. How about
> >> we introduce "anonymous" high-level points, so that high-level points
> >> and low-level points are in 1:1 mapping?
> >
> > No, that will just hurt and complicate things. See the above
> > explanation about why we need high-level points (it's what you as a C
> > programmer try to achieve, vs the low-level spec, which is what the C
> > language does in reality, with all the anonymous struct/union
> > traversal).
> >
> > What's wrong with this separation? Think about it as recording
> > "intent" (high-level spec) vs "mechanics" (low-level spec, how exactly
> > to achieve that intent, in excruciating detail).
>
> There is nothing wrong with separation. I just personally think it is
> cleaner the other way. That's why I raised the question.
>
> I will go with your assessment, as you looked into this much more than
> I did. :-)

For me it's a machine view of the problem (raw spec) vs human view of
the problem (high-level spec, which resembles how you think about this
in C code). I'll keep it separate unless it proves to be problematic
going forward.

>
> [...]
>
> >>> +
> >>> +     memset(spec, 0, sizeof(*spec));
> >>> +     spec->btf = btf;
> >>> +
> >>> +     /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */
> >>> +     while (*spec_str) {
> >>> +             if (*spec_str == ':')
> >>> +                     ++spec_str;
> >>> +             if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1)
> >>> +                     return -EINVAL;
> >>> +             if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
> >>> +                     return -E2BIG;
> >>> +             spec_str += parsed_len;
> >>> +             spec->raw_spec[spec->raw_len++] = access_idx;
> >>> +     }
> >>> +
> >>> +     if (spec->raw_len == 0)
> >>> +             return -EINVAL;
> >>> +
> >>> +     for (i = 0; i < spec->raw_len; i++) {
> >>> +             t = skip_mods_and_typedefs(btf, id, &id);
> >>> +             if (!t)
> >>> +                     return -EINVAL;
> >>> +
> >>> +             access_idx = spec->raw_spec[i];
> >>> +
> >>> +             if (i == 0) {
> >>> +                     /* first spec value is always reloc type array index */
> >>> +                     spec->spec[spec->len].type_id = id;
> >>> +                     spec->spec[spec->len].idx = access_idx;
> >>> +                     spec->len++;
> >>> +
> >>> +                     sz = btf__resolve_size(btf, id);
> >>> +                     if (sz < 0)
> >>> +                             return sz;
> >>> +                     spec->offset += access_idx * sz;
> >>          spec->offset = access_idx * sz;  should be enough
> >
> > No. spec->offset is carefully maintained across multiple low-level
> > steps, as we traverse down embedded structs/unions.
> >
> > Think about, e.g.:
> >
> > struct s {
> >    int a;
> >    struct {
> >        int b;
> >    };
> > };
> >
> > Imagine you are trying to match s.b access. With what you propose
> > you'll end up with offset 0, but it should be 4.
>
> Hmm... this is just for i == 0, right? Which line updated spec->offset
> after "memset(spec, 0, sizeof(*spec));"?

Ah, I missed that you are referring to the special i == 0 case. I can
do assignment, yes, you are right. I'll probably also extract it out
of the loop to make it less confusing.
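
Something along these lines (a sketch of that restructuring, assuming
the rest of the loop body stays as in the patch):

	t = skip_mods_and_typedefs(btf, id, &id);
	if (!t)
		return -EINVAL;

	/* first spec value is always the reloc type's array index */
	access_idx = spec->raw_spec[0];
	spec->spec[spec->len].type_id = id;
	spec->spec[spec->len].idx = access_idx;
	spec->len++;

	sz = btf__resolve_size(btf, id);
	if (sz < 0)
		return sz;
	spec->offset = access_idx * sz; /* plain assignment suffices */

	for (i = 1; i < spec->raw_len; i++) {
		/* ...composite/array handling as in the patch... */
	}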

> [...]
Yonghong Song July 27, 2019, 9:29 p.m. UTC | #9
On 7/27/19 11:24 AM, Andrii Nakryiko wrote:
> On Sat, Jul 27, 2019 at 10:00 AM Alexei Starovoitov <ast@fb.com> wrote:
>>
>> On 7/26/19 11:25 PM, Andrii Nakryiko wrote:
>>>>> +     } else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
>>>>> +             if (insn->imm != orig_off)
>>>>> +                     return -EINVAL;
>>>>> +             insn->imm = new_off;
>>>>> +             pr_debug("prog '%s': patched insn #%d (ST | MEM) imm %d -> %d\n",
>>>>> +                      bpf_program__title(prog, false),
>>>>> +                      insn_idx, orig_off, new_off);
>>>> I'm pretty sure llvm was not capable of emitting BPF_ST insn.
>>>> When did that change?
>>> I just looked at possible instructions that could have 32-bit
>>> immediate value. This is `*(rX) = offsetof(struct s, field)`, which I
>>> thought is conceivable. Do you think I should drop it?
>>
>> Just trying to point out that since it's not emitted by llvm
>> this code is likely untested?
>> Or you've created a bpf asm test for this?
> 
> 
> Yeah, it's untested right now. Let me try to come up with LLVM
> assembly + relocation (not yet sure how/whether the builtin works with
> inline assembly). If that works out, I'll leave this; if not, I'll
> drop the BPF_ST | BPF_MEM part.

FYI. LLVM does not have an assembly code format for BPF_ST instructions
as it does not generate code for them. So inline asm through llvm won't
work. The llvm disassembler won't be able to decode BPF_ST either.

Andrii Nakryiko July 27, 2019, 9:36 p.m. UTC | #10
On Sat, Jul 27, 2019 at 2:29 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 7/27/19 11:24 AM, Andrii Nakryiko wrote:
> > On Sat, Jul 27, 2019 at 10:00 AM Alexei Starovoitov <ast@fb.com> wrote:
> >>
> >> On 7/26/19 11:25 PM, Andrii Nakryiko wrote:
> >>>>> +     } else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
> >>>>> +             if (insn->imm != orig_off)
> >>>>> +                     return -EINVAL;
> >>>>> +             insn->imm = new_off;
> >>>>> +             pr_debug("prog '%s': patched insn #%d (ST | MEM) imm %d -> %d\n",
> >>>>> +                      bpf_program__title(prog, false),
> >>>>> +                      insn_idx, orig_off, new_off);
> >>>> I'm pretty sure llvm was not capable of emitting BPF_ST insn.
> >>>> When did that change?
> >>> I just looked at possible instructions that could have 32-bit
> >>> immediate value. This is `*(rX) = offsetof(struct s, field)`, which I
> >>> thought is conceivable. Do you think I should drop it?
> >>
> >> Just trying to point out that since it's not emitted by llvm
> >> this code is likely untested?
> >> Or you've created a bpf asm test for this?
> >
> >
> > Yeah, it's untested right now. Let me try to come up with LLVM
> > assembly + relocation (not yet sure how/whether the builtin works with
> > inline assembly). If that works out, I'll leave this; if not, I'll
> > drop the BPF_ST | BPF_MEM part.
>
> FYI. LLVM does not have an assembly code format for BPF_ST instructions
> as it does not generate code for them. So inline asm through llvm won't
> work. The llvm disassembler won't be able to decode BPF_ST either.

Well then, I'll just drop it for now. Thanks!

Song Liu July 28, 2019, 12:24 a.m. UTC | #11
> On Jul 27, 2019, at 12:09 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> 
> On Sat, Jul 27, 2019 at 11:59 AM Song Liu <songliubraving@fb.com> wrote:
>>
>> [...]
>>
>> Well, it requires quite some work to document this way. Please let me
>> know if you feel this is an overkill.
> 
> Yeah :) And it's not just work; I think it's bad if comments become
> too specific and document very low-level steps, because code might
> evolve and comments can quickly get out of sync and just add to
> confusion. Which is why I tried to document high-level ideas, leaving
> it up to the source code to be the ultimate reference for the minute
> details.

Fair enough. 

[...]

>>> 
>>> No. spec->offset is carefully maintained across multiple low-level
>>> steps, as we traverse down embedded structs/unions.
>>> 
>>> Think about, e.g.:
>>> 
>>> struct s {
>>>   int a;
>>>   struct {
>>>       int b;
>>>   };
>>> };
>>> 
>>> Imagine you are trying to match s.b access. With what you propose
>>> you'll end up with offset 0, but it should be 4.
>> 
>> Hmm... this is just for i == 0, right? Which line updated spec->offset
>> after "memset(spec, 0, sizeof(*spec));"?
> 
> Ah, I missed that you are referring to the special i == 0 case. I can
> do assignment, yes, you are right. I'll probably also extract it out
> of the loop to make it less confusing.

Yes, please. 

> 
>>>>> +                     continue;
>>>>> +             }
>>>> 
>>>> Maybe pull i == 0 case out of the for loop?

As I mentioned earlier. ;-)

[...]
Song Liu July 29, 2019, 7:56 p.m. UTC | #12
> On Jul 24, 2019, at 12:27 PM, Andrii Nakryiko <andriin@fb.com> wrote:
> 
> This patch implements the core logic for BPF CO-RE offsets relocations.
> All the details are described in code comments.
> 
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> ---
> tools/lib/bpf/libbpf.c | 866 ++++++++++++++++++++++++++++++++++++++++-
> tools/lib/bpf/libbpf.h |   1 +
> 2 files changed, 861 insertions(+), 6 deletions(-)
> 
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 8741c39adb1c..86d87bf10d46 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -38,6 +38,7 @@
> #include <sys/stat.h>
> #include <sys/types.h>
> #include <sys/vfs.h>
> +#include <sys/utsname.h>
> #include <tools/libc_compat.h>
> #include <libelf.h>
> #include <gelf.h>
> @@ -47,6 +48,7 @@
> #include "btf.h"
> #include "str_error.h"
> #include "libbpf_internal.h"
> +#include "hashmap.h"
> 
> #ifndef EM_BPF
> #define EM_BPF 247
> @@ -1013,16 +1015,22 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict)
> }
> 
> static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
> -						     __u32 id)
> +						     __u32 id,
> +						     __u32 *res_id)
> {
> 	const struct btf_type *t = btf__type_by_id(btf, id);
> 
> +	if (res_id)
> +		*res_id = id;
> +
> 	while (true) {
> 		switch (BTF_INFO_KIND(t->info)) {
> 		case BTF_KIND_VOLATILE:
> 		case BTF_KIND_CONST:
> 		case BTF_KIND_RESTRICT:
> 		case BTF_KIND_TYPEDEF:
> +			if (res_id)
> +				*res_id = t->type;
> 			t = btf__type_by_id(btf, t->type);
> 			break;
> 		default:
> @@ -1041,7 +1049,7 @@ static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
> static bool get_map_field_int(const char *map_name, const struct btf *btf,
> 			      const struct btf_type *def,
> 			      const struct btf_member *m, __u32 *res) {
> -	const struct btf_type *t = skip_mods_and_typedefs(btf, m->type);
> +	const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL);
> 	const char *name = btf__name_by_offset(btf, m->name_off);
> 	const struct btf_array *arr_info;
> 	const struct btf_type *arr_t;
> @@ -1107,7 +1115,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
> 		return -EOPNOTSUPP;
> 	}
> 
> -	def = skip_mods_and_typedefs(obj->btf, var->type);
> +	def = skip_mods_and_typedefs(obj->btf, var->type, NULL);
> 	if (BTF_INFO_KIND(def->info) != BTF_KIND_STRUCT) {
> 		pr_warning("map '%s': unexpected def kind %u.\n",
> 			   map_name, BTF_INFO_KIND(var->info));
> @@ -2289,6 +2297,845 @@ bpf_program_reloc_btf_ext(struct bpf_program *prog, struct bpf_object *obj,
> 	return 0;
> }
> 
> +#define BPF_CORE_SPEC_MAX_LEN 64
> +
> +/* represents BPF CO-RE field or array element accessor */
> +struct bpf_core_accessor {
> +	__u32 type_id;		/* struct/union type or array element type */
> +	__u32 idx;		/* field index or array index */
> +	const char *name;	/* field name or NULL for array accessor */
> +};
> +
> +struct bpf_core_spec {
> +	const struct btf *btf;
> +	/* high-level spec: named fields and array indicies only */
> +	struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN];
> +	/* high-level spec length */
> +	int len;
> +	/* raw, low-level spec: 1-to-1 with accessor spec string */
> +	int raw_spec[BPF_CORE_SPEC_MAX_LEN];
> +	/* raw spec length */
> +	int raw_len;
> +	/* field byte offset represented by spec */
> +	__u32 offset;
> +};
> +
> +static bool str_is_empty(const char *s)
> +{
> +	return !s || !s[0];
> +}
> +
> +static int btf_kind(const struct btf_type *t)
> +{
> +	return BTF_INFO_KIND(t->info);
> +}
> +
> +static bool btf_is_composite(const struct btf_type *t)
> +{
> +	int kind = btf_kind(t);
> +
> +	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
> +}
> +
> +static bool btf_is_array(const struct btf_type *t)
> +{
> +	return btf_kind(t) == BTF_KIND_ARRAY;
> +}
> +
> +/* 
> + * Turn bpf_offset_reloc into a low- and high-level spec representation,
> + * validating correctness along the way, as well as calculating resulting
> + * field offset (in bytes), specified by accessor string. Low-level spec
> + * captures every single level of nestedness, including traversing anonymous
> + * struct/union members. High-level one only captures semantically meaningful
> + * "turning points": named fields and array indicies.
> + * E.g., for this case:
> + *
> + *   struct sample {
> + *       int __unimportant;
> + *       struct {
> + *           int __1;
> + *           int __2;
> + *           int a[7];
> + *       };
> + *   };
> + *
> + *   struct sample *s = ...;
> + *
> + *   int x = &s->a[3]; // access string = '0:1:2:3'
> + *
> + * Low-level spec has 1:1 mapping with each element of access string (it's
> + * just a parsed access string representation): [0, 1, 2, 3].
> + *
> + * High-level spec will capture only 3 points:
> + *   - intial zero-index access by pointer (&s->... is the same as &s[0]...);
> + *   - field 'a' access (corresponds to '2' in low-level spec);
> + *   - array element #3 access (corresponds to '3' in low-level spec).
> + *
> + */
> +static int bpf_core_spec_parse(const struct btf *btf,
> +			       __u32 type_id,
> +			       const char *spec_str,
> +			       struct bpf_core_spec *spec)
> +{
> +	int access_idx, parsed_len, i;
> +	const struct btf_type *t;
> +	__u32 id = type_id;
> +	const char *name;
> +	__s64 sz;
> +
> +	if (str_is_empty(spec_str) || *spec_str == ':')
> +		return -EINVAL;
> +
> +	memset(spec, 0, sizeof(*spec));
> +	spec->btf = btf;
> +
> +	/* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */
> +	while (*spec_str) {
> +		if (*spec_str == ':')
> +			++spec_str;
> +		if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1)
> +			return -EINVAL;
> +		if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
> +			return -E2BIG;
> +		spec_str += parsed_len;
> +		spec->raw_spec[spec->raw_len++] = access_idx;
> +	}
> +
> +	if (spec->raw_len == 0)
> +		return -EINVAL;
> +
> +	for (i = 0; i < spec->raw_len; i++) {
> +		t = skip_mods_and_typedefs(btf, id, &id);
> +		if (!t)
> +			return -EINVAL;
> +
> +		access_idx = spec->raw_spec[i];
> +
> +		if (i == 0) {
> +			/* first spec value is always reloc type array index */
> +			spec->spec[spec->len].type_id = id;
> +			spec->spec[spec->len].idx = access_idx;
> +			spec->len++;
> +
> +			sz = btf__resolve_size(btf, id);
> +			if (sz < 0)
> +				return sz;
> +			spec->offset += access_idx * sz;
> +			continue;
> +		}
> +
> +		if (btf_is_composite(t)) {
> +			const struct btf_member *m = (void *)(t + 1);
> +			__u32 offset;
> +
> +			if (access_idx >= BTF_INFO_VLEN(t->info))
> +				return -EINVAL;
> +
> +			m = &m[access_idx];
> +
> +			if (BTF_INFO_KFLAG(t->info)) {
> +				if (BTF_MEMBER_BITFIELD_SIZE(m->offset))
> +					return -EINVAL;
> +				offset = BTF_MEMBER_BIT_OFFSET(m->offset);
> +			} else {
> +				offset = m->offset;
> +			}
> +			if (m->offset % 8)
> +				return -EINVAL;
> +			spec->offset += offset / 8;
> +
> +			if (m->name_off) {
> +				name = btf__name_by_offset(btf, m->name_off);
> +				if (str_is_empty(name))
> +					return -EINVAL;
> +
> +				spec->spec[spec->len].type_id = id;
> +				spec->spec[spec->len].idx = access_idx;
> +				spec->spec[spec->len].name = name;
> +				spec->len++;
> +			}
> +
> +			id = m->type;
> +		} else if (btf_is_array(t)) {
> +			const struct btf_array *a = (void *)(t + 1);
> +
> +			t = skip_mods_and_typedefs(btf, a->type, &id);
> +			if (!t || access_idx >= a->nelems)
> +				return -EINVAL;
> +
> +			spec->spec[spec->len].type_id = id;
> +			spec->spec[spec->len].idx = access_idx;
> +			spec->len++;
> +
> +			sz = btf__resolve_size(btf, id);
> +			if (sz < 0)
> +				return sz;
> +			spec->offset += access_idx * sz;
> +		} else {
> +			pr_warning("relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %d\n",
> +				   type_id, spec_str, i, id, btf_kind(t));
> +			return -EINVAL;
> +		}
> +	}
> +
> +	if (spec->len == 0)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +/* Given 'some_struct_name___with_flavor' return the length of a name prefix
> + * before last triple underscore. Struct name part after last triple
> + * underscore is ignored by BPF CO-RE relocation during relocation matching.
> + */
> +static size_t bpf_core_essential_name_len(const char *name)
> +{
> +	size_t n = strlen(name);
> +	int i = n - 3;
> +
> +	while (i > 0) {
> +		if (name[i] == '_' && name[i + 1] == '_' && name[i + 2] == '_')
> +			return i;
> +		i--;
> +	}
> +	return n;
> +}
> +
> +/* dynamically sized list of type IDs */
> +struct ids_vec {
> +	__u32 *data;
> +	int len;
> +};
> +
> +static void bpf_core_free_cands(struct ids_vec *cand_ids)
> +{
> +	free(cand_ids->data);
> +	free(cand_ids);
> +}
> +
> +static struct ids_vec *bpf_core_find_cands(const struct btf *local_btf,
> +					   __u32 local_type_id,
> +					   const struct btf *targ_btf)
> +{
> +	size_t local_essent_len, targ_essent_len;
> +	const char *local_name, *targ_name;
> +	const struct btf_type *t;
> +	struct ids_vec *cand_ids;
> +	__u32 *new_ids;
> +	int i, err, n;
> +
> +	t = btf__type_by_id(local_btf, local_type_id);
> +	if (!t)
> +		return ERR_PTR(-EINVAL);
> +
> +	local_name = btf__name_by_offset(local_btf, t->name_off);
> +	if (str_is_empty(local_name))
> +		return ERR_PTR(-EINVAL);
> +	local_essent_len = bpf_core_essential_name_len(local_name);
> +
> +	cand_ids = calloc(1, sizeof(*cand_ids));
> +	if (!cand_ids)
> +		return ERR_PTR(-ENOMEM);
> +
> +	n = btf__get_nr_types(targ_btf);
> +	for (i = 1; i <= n; i++) {
> +		t = btf__type_by_id(targ_btf, i);
> +		targ_name = btf__name_by_offset(targ_btf, t->name_off);
> +		if (str_is_empty(targ_name))
> +			continue;
> +
> +		targ_essent_len = bpf_core_essential_name_len(targ_name);
> +		if (targ_essent_len != local_essent_len)
> +			continue;
> +
> +		if (strncmp(local_name, targ_name, local_essent_len) == 0) {
> +			pr_debug("[%d] (%s): found candidate [%d] (%s)\n",
> +				 local_type_id, local_name, i, targ_name);
> +			new_ids = realloc(cand_ids->data, cand_ids->len + 1);
> +			if (!new_ids) {
> +				err = -ENOMEM;
> +				goto err_out;
> +			}
> +			cand_ids->data = new_ids;
> +			cand_ids->data[cand_ids->len++] = i;
> +		}
> +	}
> +	return cand_ids;
> +err_out:
> +	bpf_core_free_cands(cand_ids);
> +	return ERR_PTR(err);
> +}
> +
> +/* Check two types for compatibility, skipping const/volatile/restrict and
> + * typedefs, to ensure we are relocating offsets between compatible entities:
> + *   - any two STRUCTs/UNIONs are compatible and can be mixed;
> + *   - any two FWDs are compatible;
> + *   - any two PTRs are always compatible;
> + *   - for ENUMs, check sizes, names are ignored;
> + *   - for INT, size and bitness should match, signedness is ignored;
> + *   - for ARRAY, dimensionality is ignored, element types are checked for
> + *     compatibility recursively;
> + *   - everything else should never be a target of relocation.
> + * These rules are not set in stone and probably will be adjusted as we get
> + * more experience with using BPF CO-RE relocations.
> + */
> +static int bpf_core_fields_are_compat(const struct btf *local_btf,
> +				      __u32 local_id,
> +				      const struct btf *targ_btf,
> +				      __u32 targ_id)
> +{
> +	const struct btf_type *local_type, *targ_type;
> +	__u16 kind;
> +
> +recur:
> +	local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id);
> +	targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id);
> +	if (!local_type || !targ_type)
> +		return -EINVAL;
> +
> +	if (btf_is_composite(local_type) && btf_is_composite(targ_type))
> +		return 1;
> +	if (BTF_INFO_KIND(local_type->info) != BTF_INFO_KIND(targ_type->info))
> +		return 0;
> +
> +	kind = BTF_INFO_KIND(local_type->info);
> +	switch (kind) {
> +	case BTF_KIND_FWD:
> +	case BTF_KIND_PTR:
> +		return 1;
> +	case BTF_KIND_ENUM:
> +		return local_type->size == targ_type->size;
> +	case BTF_KIND_INT: {
> +		__u32 loc_int = *(__u32 *)(local_type + 1);
> +		__u32 targ_int = *(__u32 *)(targ_type + 1);
> +
> +		return BTF_INT_OFFSET(loc_int) == 0 &&
> +		       BTF_INT_OFFSET(targ_int) == 0 &&
> +		       local_type->size == targ_type->size &&
> +		       BTF_INT_BITS(loc_int) == BTF_INT_BITS(targ_int);
> +	}
> +	case BTF_KIND_ARRAY: {
> +		const struct btf_array *loc_a, *targ_a;
> +
> +		loc_a = (void *)(local_type + 1);
> +		targ_a = (void *)(targ_type + 1);
> +		local_id = loc_a->type;
> +		targ_id = targ_a->type;
> +		goto recur;
> +	}
> +	default:
> +		pr_warning("unexpected kind %d relocated, local [%d], target [%d]\n",
> +			   kind, local_id, targ_id);
> +		return 0;
> +	}
> +}
> +
> +/* 
> + * Given single high-level accessor (either named field or array index) in
> + * local type, find corresponding high-level accessor for a target type. Along
> + * the way, maintain low-level spec for target as well. Also keep updating
> + * target offset.
> + */
> +static int bpf_core_match_member(const struct btf *local_btf,
> +				 const struct bpf_core_accessor *local_acc,
> +				 const struct btf *targ_btf,
> +				 __u32 targ_id,
> +				 struct bpf_core_spec *spec,
> +				 __u32 *next_targ_id)
> +{
> +	const struct btf_type *local_type, *targ_type;
> +	const struct btf_member *local_member, *m;
> +	const char *local_name, *targ_name;
> +	__u32 local_id;
> +	int i, n, found;
> +
> +	targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id);
> +	if (!targ_type)
> +		return -EINVAL;
> +	if (!btf_is_composite(targ_type))
> +		return 0;
> +
> +	local_id = local_acc->type_id;
> +	local_type = btf__type_by_id(local_btf, local_id);
> +	local_member = (void *)(local_type + 1);
> +	local_member += local_acc->idx;
> +	local_name = btf__name_by_offset(local_btf, local_member->name_off);
> +
> +	n = BTF_INFO_VLEN(targ_type->info);
> +	m = (void *)(targ_type + 1);
> +	for (i = 0; i < n; i++, m++) {
> +		__u32 offset;
> +
> +		/* bitfield relocations not supported */
> +		if (BTF_INFO_KFLAG(targ_type->info)) {
> +			if (BTF_MEMBER_BITFIELD_SIZE(m->offset))
> +				continue;
> +			offset = BTF_MEMBER_BIT_OFFSET(m->offset);
> +		} else {
> +			offset = m->offset;
> +		}
> +		if (offset % 8)
> +			continue;
> +
> +		/* too deep struct/union/array nesting */
> +		if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
> +			return -E2BIG;
> +
> +		/* speculate this member will be the good one */
> +		spec->offset += offset / 8;
> +		spec->raw_spec[spec->raw_len++] = i;
> +
> +		targ_name = btf__name_by_offset(targ_btf, m->name_off);
> +		if (str_is_empty(targ_name)) {
> +			/* embedded struct/union, we need to go deeper */
> +			found = bpf_core_match_member(local_btf, local_acc,
> +						      targ_btf, m->type,
> +						      spec, next_targ_id);
> +			if (found) /* either found or error */
> +				return found;
> +		} else if (strcmp(local_name, targ_name) == 0) {
> +			/* matching named field */
> +			struct bpf_core_accessor *targ_acc;
> +
> +			targ_acc = &spec->spec[spec->len++];
> +			targ_acc->type_id = targ_id;
> +			targ_acc->idx = i;
> +			targ_acc->name = targ_name;
> +
> +			*next_targ_id = m->type;
> +			found = bpf_core_fields_are_compat(local_btf,
> +							   local_member->type,
> +							   targ_btf, m->type);
> +			if (!found)
> +				spec->len--; /* pop accessor */
> +			return found;
> +		}
> +		/* member turned out to be not we looked for */

	/* member turned out not to be what we looked for */ 

Or something similar. 

The rest of this patch looks good to me. 

Thanks,
Song

Patch

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 8741c39adb1c..86d87bf10d46 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -38,6 +38,7 @@ 
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
+#include <sys/utsname.h>
 #include <tools/libc_compat.h>
 #include <libelf.h>
 #include <gelf.h>
@@ -47,6 +48,7 @@ 
 #include "btf.h"
 #include "str_error.h"
 #include "libbpf_internal.h"
+#include "hashmap.h"
 
 #ifndef EM_BPF
 #define EM_BPF 247
@@ -1013,16 +1015,22 @@  static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict)
 }
 
 static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
-						     __u32 id)
+						     __u32 id,
+						     __u32 *res_id)
 {
 	const struct btf_type *t = btf__type_by_id(btf, id);
 
+	if (res_id)
+		*res_id = id;
+
 	while (true) {
 		switch (BTF_INFO_KIND(t->info)) {
 		case BTF_KIND_VOLATILE:
 		case BTF_KIND_CONST:
 		case BTF_KIND_RESTRICT:
 		case BTF_KIND_TYPEDEF:
+			if (res_id)
+				*res_id = t->type;
 			t = btf__type_by_id(btf, t->type);
 			break;
 		default:
@@ -1041,7 +1049,7 @@  static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf,
 static bool get_map_field_int(const char *map_name, const struct btf *btf,
 			      const struct btf_type *def,
 			      const struct btf_member *m, __u32 *res) {
-	const struct btf_type *t = skip_mods_and_typedefs(btf, m->type);
+	const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL);
 	const char *name = btf__name_by_offset(btf, m->name_off);
 	const struct btf_array *arr_info;
 	const struct btf_type *arr_t;
@@ -1107,7 +1115,7 @@  static int bpf_object__init_user_btf_map(struct bpf_object *obj,
 		return -EOPNOTSUPP;
 	}
 
-	def = skip_mods_and_typedefs(obj->btf, var->type);
+	def = skip_mods_and_typedefs(obj->btf, var->type, NULL);
 	if (BTF_INFO_KIND(def->info) != BTF_KIND_STRUCT) {
 		pr_warning("map '%s': unexpected def kind %u.\n",
 			   map_name, BTF_INFO_KIND(var->info));
@@ -2289,6 +2297,845 @@  bpf_program_reloc_btf_ext(struct bpf_program *prog, struct bpf_object *obj,
 	return 0;
 }
 
+#define BPF_CORE_SPEC_MAX_LEN 64
+
+/* represents BPF CO-RE field or array element accessor */
+struct bpf_core_accessor {
+	__u32 type_id;		/* struct/union type or array element type */
+	__u32 idx;		/* field index or array index */
+	const char *name;	/* field name or NULL for array accessor */
+};
+
+struct bpf_core_spec {
+	const struct btf *btf;
+	/* high-level spec: named fields and array indices only */
+	struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN];
+	/* high-level spec length */
+	int len;
+	/* raw, low-level spec: 1-to-1 with accessor spec string */
+	int raw_spec[BPF_CORE_SPEC_MAX_LEN];
+	/* raw spec length */
+	int raw_len;
+	/* field byte offset represented by spec */
+	__u32 offset;
+};
+
+static bool str_is_empty(const char *s)
+{
+	return !s || !s[0];
+}
+
+static int btf_kind(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info);
+}
+
+static bool btf_is_composite(const struct btf_type *t)
+{
+	int kind = btf_kind(t);
+
+	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+}
+
+static bool btf_is_array(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_ARRAY;
+}
+
+/* 
+ * Turn bpf_offset_reloc into a low- and high-level spec representation,
+ * validating correctness along the way, as well as calculating resulting
+ * field offset (in bytes), specified by accessor string. Low-level spec
+ * captures every single level of nestedness, including traversing anonymous
+ * struct/union members. High-level one only captures semantically meaningful
+ * "turning points": named fields and array indicies.
+ * E.g., for this case:
+ *
+ *   struct sample {
+ *       int __unimportant;
+ *       struct {
+ *           int __1;
+ *           int __2;
+ *           int a[7];
+ *       };
+ *   };
+ *
+ *   struct sample *s = ...;
+ *
+ *   int *x = &s->a[3]; // access string = '0:1:2:3'
+ *
+ * Low-level spec has 1:1 mapping with each element of access string (it's
+ * just a parsed access string representation): [0, 1, 2, 3].
+ *
+ * High-level spec will capture only 3 points:
+ *   - initial zero-index access by pointer (&s->... is the same as &s[0]...);
+ *   - field 'a' access (corresponds to '2' in low-level spec);
+ *   - array element #3 access (corresponds to '3' in low-level spec).
+ *
+ */
+static int bpf_core_spec_parse(const struct btf *btf,
+			       __u32 type_id,
+			       const char *spec_str,
+			       struct bpf_core_spec *spec)
+{
+	int access_idx, parsed_len, i;
+	const struct btf_type *t;
+	__u32 id = type_id;
+	const char *name;
+	__s64 sz;
+
+	if (str_is_empty(spec_str) || *spec_str == ':')
+		return -EINVAL;
+
+	memset(spec, 0, sizeof(*spec));
+	spec->btf = btf;
+
+	/* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */
+	while (*spec_str) {
+		if (*spec_str == ':')
+			++spec_str;
+		if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1)
+			return -EINVAL;
+		if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
+			return -E2BIG;
+		spec_str += parsed_len;
+		spec->raw_spec[spec->raw_len++] = access_idx;
+	}
+
+	if (spec->raw_len == 0)
+		return -EINVAL;
+
+	for (i = 0; i < spec->raw_len; i++) {
+		t = skip_mods_and_typedefs(btf, id, &id);
+		if (!t)
+			return -EINVAL;
+
+		access_idx = spec->raw_spec[i];
+
+		if (i == 0) {
+			/* first spec value is always reloc type array index */
+			spec->spec[spec->len].type_id = id;
+			spec->spec[spec->len].idx = access_idx;
+			spec->len++;
+
+			sz = btf__resolve_size(btf, id);
+			if (sz < 0)
+				return sz;
+			spec->offset += access_idx * sz;
+			continue;
+		}
+
+		if (btf_is_composite(t)) {
+			const struct btf_member *m = (void *)(t + 1);
+			__u32 offset;
+
+			if (access_idx >= BTF_INFO_VLEN(t->info))
+				return -EINVAL;
+
+			m = &m[access_idx];
+
+			if (BTF_INFO_KFLAG(t->info)) {
+				if (BTF_MEMBER_BITFIELD_SIZE(m->offset))
+					return -EINVAL;
+				offset = BTF_MEMBER_BIT_OFFSET(m->offset);
+			} else {
+				offset = m->offset;
+			}
+			if (offset % 8)
+				return -EINVAL;
+			spec->offset += offset / 8;
+
+			if (m->name_off) {
+				name = btf__name_by_offset(btf, m->name_off);
+				if (str_is_empty(name))
+					return -EINVAL;
+
+				spec->spec[spec->len].type_id = id;
+				spec->spec[spec->len].idx = access_idx;
+				spec->spec[spec->len].name = name;
+				spec->len++;
+			}
+
+			id = m->type;
+		} else if (btf_is_array(t)) {
+			const struct btf_array *a = (void *)(t + 1);
+
+			t = skip_mods_and_typedefs(btf, a->type, &id);
+			if (!t || access_idx >= a->nelems)
+				return -EINVAL;
+
+			spec->spec[spec->len].type_id = id;
+			spec->spec[spec->len].idx = access_idx;
+			spec->len++;
+
+			sz = btf__resolve_size(btf, id);
+			if (sz < 0)
+				return sz;
+			spec->offset += access_idx * sz;
+		} else {
+			pr_warning("relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %d\n",
+				   type_id, spec_str, i, id, btf_kind(t));
+			return -EINVAL;
+		}
+	}
+
+	if (spec->len == 0)
+		return -EINVAL;
+
+	return 0;
+}
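
To make the access-string semantics above concrete, here is a minimal
standalone sketch (plain C, independent of libbpf and not part of the
patch) that computes the same byte offset the parser derives for the
example spec "0:1:2:3":

#include <stdio.h>
#include <stddef.h>

struct sample {
	int __unimportant;
	struct {
		int __1;
		int __2;
		int a[7];
	};
};

int main(void)
{
	/* spec "0:1:2:3": s[0] (+0), anon struct = member #1 (+4),
	 * field 'a' = member #2 (+8), array index 3 (+12)
	 */
	size_t off = offsetof(struct sample, a) + 3 * sizeof(int);

	printf("byte offset for \"0:1:2:3\" = %zu\n", off); /* 24 */
	return 0;
}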
+
+/* Given 'some_struct_name___with_flavor' return the length of a name prefix
+ * before the last triple underscore. The struct name part after the last
+ * triple underscore is ignored during BPF CO-RE relocation matching.
+ */
+static size_t bpf_core_essential_name_len(const char *name)
+{
+	size_t n = strlen(name);
+	int i = n - 3;
+
+	while (i > 0) {
+		if (name[i] == '_' && name[i + 1] == '_' && name[i + 2] == '_')
+			return i;
+		i--;
+	}
+	return n;
+}
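
For illustration, a tiny self-contained program (with made-up type
names) mirroring the helper above; flavored and plain names share the
same "essential" prefix:

#include <stdio.h>
#include <string.h>

/* same logic as bpf_core_essential_name_len() above */
static size_t essential_name_len(const char *name)
{
	size_t n = strlen(name);
	int i = n - 3;

	while (i > 0) {
		if (name[i] == '_' && name[i + 1] == '_' && name[i + 2] == '_')
			return i;
		i--;
	}
	return n;
}

int main(void)
{
	printf("%zu\n", essential_name_len("sample___flavor_one")); /* 6 */
	printf("%zu\n", essential_name_len("sample"));              /* 6 */
	return 0;
}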
+
+/* dynamically sized list of type IDs */
+struct ids_vec {
+	__u32 *data;
+	int len;
+};
+
+static void bpf_core_free_cands(struct ids_vec *cand_ids)
+{
+	free(cand_ids->data);
+	free(cand_ids);
+}
+
+static struct ids_vec *bpf_core_find_cands(const struct btf *local_btf,
+					   __u32 local_type_id,
+					   const struct btf *targ_btf)
+{
+	size_t local_essent_len, targ_essent_len;
+	const char *local_name, *targ_name;
+	const struct btf_type *t;
+	struct ids_vec *cand_ids;
+	__u32 *new_ids;
+	int i, err, n;
+
+	t = btf__type_by_id(local_btf, local_type_id);
+	if (!t)
+		return ERR_PTR(-EINVAL);
+
+	local_name = btf__name_by_offset(local_btf, t->name_off);
+	if (str_is_empty(local_name))
+		return ERR_PTR(-EINVAL);
+	local_essent_len = bpf_core_essential_name_len(local_name);
+
+	cand_ids = calloc(1, sizeof(*cand_ids));
+	if (!cand_ids)
+		return ERR_PTR(-ENOMEM);
+
+	n = btf__get_nr_types(targ_btf);
+	for (i = 1; i <= n; i++) {
+		t = btf__type_by_id(targ_btf, i);
+		targ_name = btf__name_by_offset(targ_btf, t->name_off);
+		if (str_is_empty(targ_name))
+			continue;
+
+		targ_essent_len = bpf_core_essential_name_len(targ_name);
+		if (targ_essent_len != local_essent_len)
+			continue;
+
+		if (strncmp(local_name, targ_name, local_essent_len) == 0) {
+			pr_debug("[%d] (%s): found candidate [%d] (%s)\n",
+				 local_type_id, local_name, i, targ_name);
+			new_ids = realloc(cand_ids->data,
+					  sizeof(*new_ids) * (cand_ids->len + 1));
+			if (!new_ids) {
+				err = -ENOMEM;
+				goto err_out;
+			}
+			cand_ids->data = new_ids;
+			cand_ids->data[cand_ids->len++] = i;
+		}
+	}
+	return cand_ids;
+err_out:
+	bpf_core_free_cands(cand_ids);
+	return ERR_PTR(err);
+}
+
+/* Check two types for compatibility, skipping const/volatile/restrict and
+ * typedefs, to ensure we are relocating offsets between compatible entities:
+ *   - any two STRUCTs/UNIONs are compatible and can be mixed;
+ *   - any two FWDs are compatible;
+ *   - any two PTRs are always compatible;
+ *   - for ENUMs, check sizes, names are ignored;
+ *   - for INT, size and bitness should match, signedness is ignored;
+ *   - for ARRAY, dimensionality is ignored, element types are checked for
+ *     compatibility recursively;
+ *   - everything else should never be a target of relocation.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
+static int bpf_core_fields_are_compat(const struct btf *local_btf,
+				      __u32 local_id,
+				      const struct btf *targ_btf,
+				      __u32 targ_id)
+{
+	const struct btf_type *local_type, *targ_type;
+	__u16 kind;
+
+recur:
+	local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id);
+	targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id);
+	if (!local_type || !targ_type)
+		return -EINVAL;
+
+	if (btf_is_composite(local_type) && btf_is_composite(targ_type))
+		return 1;
+	if (BTF_INFO_KIND(local_type->info) != BTF_INFO_KIND(targ_type->info))
+		return 0;
+
+	kind = BTF_INFO_KIND(local_type->info);
+	switch (kind) {
+	case BTF_KIND_FWD:
+	case BTF_KIND_PTR:
+		return 1;
+	case BTF_KIND_ENUM:
+		return local_type->size == targ_type->size;
+	case BTF_KIND_INT: {
+		__u32 loc_int = *(__u32 *)(local_type + 1);
+		__u32 targ_int = *(__u32 *)(targ_type + 1);
+
+		return BTF_INT_OFFSET(loc_int) == 0 &&
+		       BTF_INT_OFFSET(targ_int) == 0 &&
+		       local_type->size == targ_type->size &&
+		       BTF_INT_BITS(loc_int) == BTF_INT_BITS(targ_int);
+	}
+	case BTF_KIND_ARRAY: {
+		const struct btf_array *loc_a, *targ_a;
+
+		loc_a = (void *)(local_type + 1);
+		targ_a = (void *)(targ_type + 1);
+		local_id = loc_a->type;
+		targ_id = targ_a->type;
+		goto recur;
+	}
+	default:
+		pr_warning("unexpected kind %d relocated, local [%d], target [%d]\n",
+			   kind, local_id, targ_id);
+		return 0;
+	}
+}
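
A quick sanity-check sketch of the ENUM rule above, using two
hypothetical enums: only byte sizes are compared, enumerator names and
values are ignored:

#include <stdio.h>

enum local_state { LOC_RUNNING, LOC_STOPPED };
enum targ_state { TARG_NEW, TARG_RUNNING, TARG_DEAD };

int main(void)
{
	/* both enums are int-sized, so they would be deemed compatible */
	printf("compatible: %d\n",
	       sizeof(enum local_state) == sizeof(enum targ_state)); /* 1 */
	return 0;
}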
+
+/* 
+ * Given single high-level accessor (either named field or array index) in
+ * local type, find corresponding high-level accessor for a target type. Along
+ * the way, maintain low-level spec for target as well. Also keep updating
+ * target offset.
+ */
+static int bpf_core_match_member(const struct btf *local_btf,
+				 const struct bpf_core_accessor *local_acc,
+				 const struct btf *targ_btf,
+				 __u32 targ_id,
+				 struct bpf_core_spec *spec,
+				 __u32 *next_targ_id)
+{
+	const struct btf_type *local_type, *targ_type;
+	const struct btf_member *local_member, *m;
+	const char *local_name, *targ_name;
+	__u32 local_id;
+	int i, n, found;
+
+	targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id);
+	if (!targ_type)
+		return -EINVAL;
+	if (!btf_is_composite(targ_type))
+		return 0;
+
+	local_id = local_acc->type_id;
+	local_type = btf__type_by_id(local_btf, local_id);
+	local_member = (void *)(local_type + 1);
+	local_member += local_acc->idx;
+	local_name = btf__name_by_offset(local_btf, local_member->name_off);
+
+	n = BTF_INFO_VLEN(targ_type->info);
+	m = (void *)(targ_type + 1);
+	for (i = 0; i < n; i++, m++) {
+		__u32 offset;
+
+		/* bitfield relocations not supported */
+		if (BTF_INFO_KFLAG(targ_type->info)) {
+			if (BTF_MEMBER_BITFIELD_SIZE(m->offset))
+				continue;
+			offset = BTF_MEMBER_BIT_OFFSET(m->offset);
+		} else {
+			offset = m->offset;
+		}
+		if (offset % 8)
+			continue;
+
+		/* too deep struct/union/array nesting */
+		if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
+			return -E2BIG;
+
+		/* speculate this member will be the good one */
+		spec->offset += offset / 8;
+		spec->raw_spec[spec->raw_len++] = i;
+
+		targ_name = btf__name_by_offset(targ_btf, m->name_off);
+		if (str_is_empty(targ_name)) {
+			/* embedded struct/union, we need to go deeper */
+			found = bpf_core_match_member(local_btf, local_acc,
+						      targ_btf, m->type,
+						      spec, next_targ_id);
+			if (found) /* either found or error */
+				return found;
+		} else if (strcmp(local_name, targ_name) == 0) {
+			/* matching named field */
+			struct bpf_core_accessor *targ_acc;
+
+			targ_acc = &spec->spec[spec->len++];
+			targ_acc->type_id = targ_id;
+			targ_acc->idx = i;
+			targ_acc->name = targ_name;
+
+			*next_targ_id = m->type;
+			found = bpf_core_fields_are_compat(local_btf,
+							   local_member->type,
+							   targ_btf, m->type);
+			if (!found)
+				spec->len--; /* pop accessor */
+			return found;
+		}
+		/* member turned out to be not we looked for */
+		spec->offset -= offset / 8;
+		spec->raw_len--;
+	}
+
+	return 0;
+}
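
A minimal illustration (hypothetical types) of why the recursion into
unnamed members is needed: a named field may sit behind an anonymous
struct, yet still resolves to a fixed offset in the enclosing type:

#include <stdio.h>
#include <stddef.h>

struct target {
	int x;
	struct {	/* anonymous member: matching must recurse into it */
		int a;
	};
};

int main(void)
{
	printf("offsetof(a) = %zu\n", offsetof(struct target, a)); /* 4 */
	return 0;
}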
+
+/*
+ * Try to match local spec to a target type and, if successful, produce full
+ * target spec (high-level, low-level + offset).
+ */
+static int bpf_core_spec_match(struct bpf_core_spec *local_spec,
+			       const struct btf *targ_btf, __u32 targ_id,
+			       struct bpf_core_spec *targ_spec)
+{
+	const struct btf_type *targ_type;
+	const struct bpf_core_accessor *local_acc;
+	struct bpf_core_accessor *targ_acc;
+	int i, sz, matched;
+
+	memset(targ_spec, 0, sizeof(*targ_spec));
+	targ_spec->btf = targ_btf;
+
+	local_acc = &local_spec->spec[0];
+	targ_acc = &targ_spec->spec[0];
+
+	for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) {
+		targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id,
+						   &targ_id);
+		if (!targ_type)
+			return -EINVAL;
+
+		if (local_acc->name) {
+			if (!btf_is_composite(targ_type))
+				return 0;
+
+			matched = bpf_core_match_member(local_spec->btf,
+							local_acc,
+							targ_btf, targ_id,
+							targ_spec, &targ_id);
+			if (matched <= 0)
+				return matched;
+		} else {
+			/* for i=0, targ_id is already treated as an array element
+			 * type (because it's the original struct); for others,
+			 * we should find the array element type first
+			 */
+			if (i > 0) {
+				const struct btf_array *a;
+
+				if (!btf_is_array(targ_type))
+					return 0;
+
+				a = (void *)(targ_type + 1);
+				if (local_acc->idx >= a->nelems)
+					return 0;
+				if (!skip_mods_and_typedefs(targ_btf, a->type,
+							    &targ_id))
+					return -EINVAL;
+			}
+
+			/* too deep struct/union/array nesting */
+			if (targ_spec->raw_len == BPF_CORE_SPEC_MAX_LEN)
+				return -E2BIG;
+
+			targ_acc->type_id = targ_id;
+			targ_acc->idx = local_acc->idx;
+			targ_acc->name = NULL;
+			targ_spec->len++;
+			targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx;
+			targ_spec->raw_len++;
+
+			sz = btf__resolve_size(targ_btf, targ_id);
+			if (sz < 0)
+				return sz;
+			targ_spec->offset += local_acc->idx * sz;
+		}
+	}
+
+	return 1;
+}
+
+/*
+ * Patch relocatable BPF instruction.
+ * The expected insn->imm value is provided for validation, as well as the
+ * new relocated value.
+ *
+ * Currently three kinds of BPF instructions are supported:
+ * 1. rX = <imm> (assignment with immediate operand);
+ * 2. rX += <imm> (arithmetic operations with immediate operand);
+ * 3. *(rX) = <imm> (indirect memory assignment with immediate operand).
+ *
+ * If actual insn->imm value is wrong, bail out.
+ */
+static int bpf_core_reloc_insn(struct bpf_program *prog, int insn_off,
+			       __u32 orig_off, __u32 new_off)
+{
+	struct bpf_insn *insn;
+	int insn_idx;
+	__u8 class;
+
+	if (insn_off % sizeof(struct bpf_insn))
+		return -EINVAL;
+	insn_idx = insn_off / sizeof(struct bpf_insn);
+
+	insn = &prog->insns[insn_idx];
+	class = BPF_CLASS(insn->code);
+
+	if (class == BPF_ALU || class == BPF_ALU64) {
+		if (BPF_SRC(insn->code) != BPF_K)
+			return -EINVAL;
+		if (insn->imm != orig_off)
+			return -EINVAL;
+		insn->imm = new_off;
+		pr_debug("prog '%s': patched insn #%d (ALU/ALU64) imm %d -> %d\n",
+			 bpf_program__title(prog, false),
+			 insn_idx, orig_off, new_off);
+	} else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
+		if (insn->imm != orig_off)
+			return -EINVAL;
+		insn->imm = new_off;
+		pr_debug("prog '%s': patched insn #%d (ST | MEM) imm %d -> %d\n",
+			 bpf_program__title(prog, false),
+			 insn_idx, orig_off, new_off);
+	} else {
+		pr_warning("prog '%s': trying to relocate unrecognized insn #%d, code:%x, src:%x, dst:%x, off:%x, imm:%x\n",
+			   bpf_program__title(prog, false),
+			   insn_idx, insn->code, insn->src_reg, insn->dst_reg,
+			   insn->off, insn->imm);
+		return -EINVAL;
+	}
+	return 0;
+}
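
A minimal userspace sketch of case 1 above (rX = <imm>), assuming the
Linux UAPI headers are available; it mimics the validate-then-patch
step on a hand-built instruction:

#include <stdio.h>
#include <linux/bpf.h>

int main(void)
{
	/* r1 = 8, i.e. BPF_ALU64 | BPF_MOV | BPF_K with imm == local offset */
	struct bpf_insn insn = {
		.code = BPF_ALU64 | BPF_MOV | BPF_K,
		.dst_reg = 1,
		.imm = 8,
	};

	/* validate the expected imm, then patch in the relocated offset */
	if (BPF_CLASS(insn.code) == BPF_ALU64 &&
	    BPF_SRC(insn.code) == BPF_K && insn.imm == 8)
		insn.imm = 16;

	printf("imm after patching: %d\n", insn.imm); /* 16 */
	return 0;
}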
+
+/*
+ * Probe a few well-known locations for the vmlinux kernel image and try to
+ * load BTF data out of it to use as the target BTF.
+ */
+static struct btf *bpf_core_find_kernel_btf(void)
+{
+	const char *locations[] = {
+		"/lib/modules/%1$s/vmlinux-%1$s",
+		"/usr/lib/modules/%1$s/kernel/vmlinux",
+	};
+	char path[PATH_MAX + 1];
+	struct utsname buf;
+	struct btf *btf;
+	int i, err;
+
+	err = uname(&buf);
+	if (err) {
+		pr_warning("failed to uname(): %d\n", err);
+		return ERR_PTR(err);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(locations); i++) {
+		snprintf(path, PATH_MAX, locations[i], buf.release);
+		pr_debug("attempting to load kernel BTF from '%s'\n", path);
+
+		if (access(path, R_OK))
+			continue;
+
+		btf = btf__parse_elf(path, NULL);
+		if (IS_ERR(btf))
+			continue;
+
+		pr_debug("successfully loaded kernel BTF from '%s'\n", path);
+		return btf;
+	}
+
+	pr_warning("failed to find valid kernel BTF\n");
+	return ERR_PTR(-ESRCH);
+}
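
The "%1$s" positional conversion in the location templates substitutes
the kernel release into the string twice; a standalone sketch of the
path expansion:

#include <stdio.h>
#include <sys/utsname.h>

int main(void)
{
	struct utsname buf;
	char path[512];

	if (uname(&buf))
		return 1;
	/* e.g. "/lib/modules/5.2.0/vmlinux-5.2.0" for release "5.2.0" */
	snprintf(path, sizeof(path), "/lib/modules/%1$s/vmlinux-%1$s",
		 buf.release);
	printf("%s\n", path);
	return 0;
}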
+
+static size_t bpf_core_hash_fn(const void *key, void *ctx)
+{
+	return (size_t)key;
+}
+
+static bool bpf_core_equal_fn(const void *k1, const void *k2, void *ctx)
+{
+	return k1 == k2;
+}
+
+static void *u32_to_ptr(__u32 x)
+{
+	return (void *)(uintptr_t)x;
+}
+
+/* 
+ * CO-RE relocate single instruction.
+ *
+ * The outline and important points of the algorithm:
+ * 1. For given local type, find corresponding candidate target types.
+ *    Candidate type is a type with the same "essential" name, ignoring
+ *    everything after last triple underscore (___). E.g., `sample`,
+ *    `sample___flavor_one`, `sample___flavor_another_one`, are all candidates
+ *    for each other. Names with triple underscore are referred to as
+ *    "flavors" and are useful, among other things, to allow to
+ *    specify/support incompatible variations of the same kernel struct, which
+ *    might differ between different kernel versions and/or build
+ *    configurations.
+ * 2. For each candidate type, try to match local specification to this
+ *    candidate target type. Matching involves finding corresponding
+ *    high-level spec accessors, meaning that all named fields should match,
+ *    as well as all array accesses should be within the actual bounds. Also,
+ *    types should be compatible (see bpf_core_fields_are_compat for details).
+ * 3. It is supported and expected that there might be multiple flavors
+ *    matching the spec. As long as all the specs resolve to the same set of
+ *    offsets across all candidates, there is no error. If there is any
+ *    ambiguity, CO-RE relocation will fail. This is necessary to accommodate
+ *    imperfections of BTF deduplication, which can cause slight duplication of
+ *    the same BTF type, if some directly or indirectly referenced (by
+ *    pointer) type gets resolved to different actual types in different
+ *    object files. If such a situation occurs, deduplicated BTF will end up
+ *    with two (or more) structurally identical types, which differ only in
+ *    types they refer to through pointer. This should be OK in most cases and
+ *    is not an error.
+ * 4. Candidate types search is performed by linearly scanning through all
+ *    types in target BTF. It is anticipated that this is overall more
+ *    efficient memory-wise and not significantly worse (if not better)
+ *    CPU-wise compared to prebuilding a map from all local type names to
+ *    a list of candidate type names. It's also sped up by caching the
+ *    resolved list of matching candidates for each local "root" type ID that
+ *    has at least one bpf_offset_reloc associated with it. This list is shared
+ *    between multiple relocations for the same type ID and is updated as some
+ *    of the candidates are pruned due to structural incompatibility.
+ */
+static int bpf_core_reloc_offset(struct bpf_program *prog,
+				 const struct bpf_offset_reloc *relo,
+				 int relo_idx,
+				 const struct btf *local_btf,
+				 const struct btf *targ_btf,
+				 struct hashmap *cand_cache)
+{
+	const char *prog_name = bpf_program__title(prog, false);
+	struct bpf_core_spec local_spec, cand_spec, targ_spec;
+	const void *type_key = u32_to_ptr(relo->type_id);
+	const struct btf_type *local_type, *cand_type;
+	const char *local_name, *cand_name;
+	struct ids_vec *cand_ids;
+	__u32 local_id, cand_id;
+	const char *spec_str;
+	int i, j, err;
+
+	local_id = relo->type_id;
+	local_type = btf__type_by_id(local_btf, local_id);
+	if (!local_type)
+		return -EINVAL;
+
+	local_name = btf__name_by_offset(local_btf, local_type->name_off);
+	if (str_is_empty(local_name))
+		return -EINVAL;
+
+	spec_str = btf__name_by_offset(local_btf, relo->access_str_off);
+	if (str_is_empty(spec_str))
+		return -EINVAL;
+
+	pr_debug("prog '%s': relo #%d: insn_off=%d, [%d] (%s) + %s\n",
+		 prog_name, relo_idx, relo->insn_off,
+		 local_id, local_name, spec_str);
+
+	err = bpf_core_spec_parse(local_btf, local_id, spec_str, &local_spec);
+	if (err) {
+		pr_warning("prog '%s': relo #%d: parsing [%d] (%s) + %s failed: %d\n",
+			   prog_name, relo_idx, local_id, local_name, spec_str,
+			   err);
+		return -EINVAL;
+	}
+	pr_debug("prog '%s': relo #%d: [%d] (%s) + %s is off %u, len %d, raw_len %d\n",
+		 prog_name, relo_idx, local_id, local_name, spec_str,
+		 local_spec.offset, local_spec.len, local_spec.raw_len);
+
+	if (!hashmap__find(cand_cache, type_key, (void **)&cand_ids)) {
+		cand_ids = bpf_core_find_cands(local_btf, local_id, targ_btf);
+		if (IS_ERR(cand_ids)) {
+			pr_warning("prog '%s': relo #%d: target candidate search failed for [%d] (%s) + %s: %ld\n",
+				   prog_name, relo_idx, local_id, local_name,
+				   spec_str, PTR_ERR(cand_ids));
+			return PTR_ERR(cand_ids);
+		}
+		err = hashmap__set(cand_cache, type_key, cand_ids, NULL, NULL);
+		if (err) {
+			bpf_core_free_cands(cand_ids);
+			return err;
+		}
+	}
+
+	for (i = 0, j = 0; i < cand_ids->len; i++) {
+		cand_id = cand_ids->data[j];
+		cand_type = btf__type_by_id(targ_btf, cand_id);
+		cand_name = btf__name_by_offset(targ_btf, cand_type->name_off);
+
+		err = bpf_core_spec_match(&local_spec, targ_btf,
+					  cand_id, &cand_spec);
+		if (err < 0) {
+			pr_warning("prog '%s': relo #%d: failed to match spec [%d] (%s) + %s to candidate #%d [%d] (%s): %d\n",
+				   prog_name, relo_idx, local_id, local_name,
+				   spec_str, i, cand_id, cand_name, err);
+			return err;
+		}
+		if (err == 0) {
+			pr_debug("prog '%s': relo #%d: candidate #%d [%d] (%s) doesn't match spec\n",
+				 prog_name, relo_idx, i, cand_id, cand_name);
+			continue;
+		}
+
+		pr_debug("prog '%s': relo #%d: candidate #%d ([%d] %s) is off %u, len %d, raw_len %d\n",
+			 prog_name, relo_idx, i, cand_id, cand_name,
+			 cand_spec.offset, cand_spec.len, cand_spec.raw_len);
+
+		if (j == 0) {
+			targ_spec = cand_spec;
+		} else if (cand_spec.offset != targ_spec.offset) {
+			/* if there are many candidates, they should all
+			 * resolve to the same offset
+			 */
+			pr_warning("prog '%s': relo #%d: candidate #%d ([%d] %s): conflicting offset found (%u != %u)\n",
+				   prog_name, relo_idx, i, cand_id, cand_name,
+				   cand_spec.offset, targ_spec.offset);
+			return -EINVAL;
+		}
+
+		cand_ids->data[j++] = cand_spec.spec[0].type_id;
+	}
+
+	cand_ids->len = j;
+	if (cand_ids->len == 0) {
+		pr_warning("prog '%s': relo #%d: no matching targets found for [%d] (%s) + %s\n",
+			   prog_name, relo_idx, local_id, local_name, spec_str);
+		return -ESRCH;
+	}
+
+	err = bpf_core_reloc_insn(prog, relo->insn_off,
+				  local_spec.offset, targ_spec.offset);
+	if (err) {
+		pr_warning("prog '%s': relo #%d: failed to patch insn at offset %d: %d\n",
+			   prog_name, relo_idx, relo->insn_off, err);
+		return -EINVAL;
+	}
+
+	return 0;
+}
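
To illustrate point 3 of the algorithm comment (offset agreement across
candidates), consider two hypothetical target-side types with the same
essential name; relocating field 'a' succeeds because both candidates
resolve it to the same offset:

#include <stdio.h>
#include <stddef.h>

struct sample { int a; };			/* candidate #1 */
struct sample___other { int a; long b; };	/* candidate #2 */

int main(void)
{
	/* 0 == 0: no ambiguity, relocation would succeed */
	printf("%zu == %zu\n", offsetof(struct sample, a),
	       offsetof(struct sample___other, a));
	return 0;
}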
+
+static int
+bpf_core_reloc_offsets(struct bpf_object *obj, const char *targ_btf_path)
+{
+	const struct btf_ext_info_sec *sec;
+	const struct bpf_offset_reloc *rec;
+	const struct btf_ext_info *seg;
+	struct hashmap_entry *entry;
+	struct hashmap *cand_cache = NULL;
+	struct bpf_program *prog;
+	struct btf *targ_btf;
+	const char *sec_name;
+	int i, err = 0;
+
+	if (targ_btf_path)
+		targ_btf = btf__parse_elf(targ_btf_path, NULL);
+	else
+		targ_btf = bpf_core_find_kernel_btf();
+	if (IS_ERR(targ_btf)) {
+		pr_warning("failed to get target BTF: %ld\n",
+			   PTR_ERR(targ_btf));
+		return PTR_ERR(targ_btf);
+	}
+
+	cand_cache = hashmap__new(bpf_core_hash_fn, bpf_core_equal_fn, NULL);
+	if (IS_ERR(cand_cache)) {
+		err = PTR_ERR(cand_cache);
+		goto out;
+	}
+
+	seg = &obj->btf_ext->offset_reloc_info;
+	for_each_btf_ext_sec(seg, sec) {
+		sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off);
+		if (str_is_empty(sec_name)) {
+			err = -EINVAL;
+			goto out;
+		}
+		prog = bpf_object__find_program_by_title(obj, sec_name);
+		if (!prog) {
+			pr_warning("failed to find program '%s' for CO-RE offset relocation\n",
+				   sec_name);
+			err = -EINVAL;
+			goto out;
+		}
+
+		pr_debug("prog '%s': performing %d CO-RE offset relocs\n",
+			 sec_name, sec->num_info);
+
+		for_each_btf_ext_rec(seg, sec, i, rec) {
+			err = bpf_core_reloc_offset(prog, rec, i, obj->btf,
+						    targ_btf, cand_cache);
+			if (err) {
+				pr_warning("prog '%s': relo #%d: failed to relocate: %d\n",
+					   sec_name, i, err);
+				goto out;
+			}
+		}
+	}
+
+out:
+	btf__free(targ_btf);
+	if (!IS_ERR_OR_NULL(cand_cache)) {
+		hashmap__for_each_entry(cand_cache, entry, i) {
+			bpf_core_free_cands(entry->value);
+		}
+		hashmap__free(cand_cache);
+	}
+	return err;
+}
+
+static int
+bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)
+{
+	int err = 0;
+
+	if (obj->btf_ext->offset_reloc_info.len)
+		err = bpf_core_reloc_offsets(obj, targ_btf_path);
+
+	return err;
+}
+
 static int
 bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj,
 			struct reloc_desc *relo)
@@ -2396,14 +3243,21 @@  bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj)
 	return 0;
 }
 
-
 static int
-bpf_object__relocate(struct bpf_object *obj)
+bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path)
 {
 	struct bpf_program *prog;
 	size_t i;
 	int err;
 
+	if (obj->btf_ext) {
+		err = bpf_object__relocate_core(obj, targ_btf_path);
+		if (err) {
+			pr_warning("failed to perform CO-RE relocations: %d\n",
+				   err);
+			return err;
+		}
+	}
 	for (i = 0; i < obj->nr_programs; i++) {
 		prog = &obj->programs[i];
 
@@ -2804,7 +3658,7 @@  int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 	obj->loaded = true;
 
 	CHECK_ERR(bpf_object__create_maps(obj), err, out);
-	CHECK_ERR(bpf_object__relocate(obj), err, out);
+	CHECK_ERR(bpf_object__relocate(obj, attr->target_btf_path), err, out);
 	CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out);
 
 	return 0;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 5cbf459ece0b..6004bfe1ebf0 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -92,6 +92,7 @@  LIBBPF_API void bpf_object__close(struct bpf_object *object);
 struct bpf_object_load_attr {
 	struct bpf_object *obj;
 	int log_level;
+	const char *target_btf_path;
 };
 
 /* Load/unload object into/from kernel */
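
Finally, a hedged usage sketch of the new target_btf_path knob (object
and BTF paths are hypothetical); passing NULL falls back to the kernel
BTF auto-probing shown earlier:

#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object_load_attr attr = {};
	struct bpf_object *obj;
	int err;

	obj = bpf_object__open("./prog.bpf.o");
	if (libbpf_get_error(obj))
		return 1;

	attr.obj = obj;
	attr.target_btf_path = "/path/to/vmlinux"; /* or NULL to auto-probe */
	err = bpf_object__load_xattr(&attr);

	bpf_object__close(obj);
	return err ? 1 : 0;
}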