diff mbox series

sparc64: Expose mdesc to sysfs

Message ID 1506676498-213746-1-git-send-email-eric.saint.etienne@oracle.com
State Changes Requested
Delegated to: David Miller
Headers show
Series sparc64: Expose mdesc to sysfs | expand

Commit Message

Eric Saint Etienne Sept. 29, 2017, 9:14 a.m. UTC
To get a snapshot as complete as possible of what the system is composed of,
this commit adds the ability to browse machine description from sysfs.

This commit eases porting of user-space utilities that exist on other architectures.

Whenever the HV changes machine description content, sysfs will be re-populated.
This is done almost atomically by swapping the old mdesc sysfs folder with the
new one. While the tree under the new folder is being created, reading from
/sys/firmware/mdesc will return EAGAIN. That way userspace programs will not see
stale content in /sys/firmware/mdesc

Signed-off-by: Eric Saint Etienne <eric.saint.etienne@oracle.com>
Reviewed-by: David Aldridge <david.j.aldridge@oracle.com>
---
 arch/sparc/Kconfig              |    8 +
 arch/sparc/kernel/Makefile      |    3 +
 arch/sparc/kernel/mdesc.c       |    8 +
 arch/sparc/kernel/mdesc_sysfs.c |  831 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 850 insertions(+), 0 deletions(-)
 create mode 100644 arch/sparc/kernel/mdesc_sysfs.c

Comments

Eric Saint Etienne Dec. 1, 2017, 5:15 p.m. UTC | #1
Hi Dave,

Do you have any concern with this patch?

Thanks.

Eric

On 29/09/17 10:14, Eric Saint-Etienne wrote:
> To get a snapshot as complete as possible of what the system is composed of,
> this commit adds the ability to browse machine description from sysfs.
>
> This commit eases porting of user-space utilities that exist on other architectures.
>
> Whenever the HV changes machine description content, sysfs will be re-populated.
> This is done almost atomically by swapping the old mdesc sysfs folder with the
> new one. While the tree under the new folder is being created, reading from
> /sys/firmware/mdesc will return EAGAIN. That way userspace programs will not see
> stale content in /sys/firmware/mdesc
>
> Signed-off-by: Eric Saint Etienne <eric.saint.etienne@oracle.com>
> Reviewed-by: David Aldridge <david.j.aldridge@oracle.com>
> ---
>   arch/sparc/Kconfig              |    8 +
>   arch/sparc/kernel/Makefile      |    3 +
>   arch/sparc/kernel/mdesc.c       |    8 +
>   arch/sparc/kernel/mdesc_sysfs.c |  831 +++++++++++++++++++++++++++++++++++++++
>   4 files changed, 850 insertions(+), 0 deletions(-)
>   create mode 100644 arch/sparc/kernel/mdesc_sysfs.c
>
> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
> index a4a6261..4e2b640 100644
> --- a/arch/sparc/Kconfig
> +++ b/arch/sparc/Kconfig
> @@ -470,6 +470,14 @@ config UBOOT_ENTRY_ADDR
>   endmenu
>   endif
>   
> +config MDESC_SYSFS
> +	bool "Maps machine description to sysfs"
> +	depends on SPARC64
> +	default y
> +	---help---
> +	  If you say Y here, the directed acyclic graph described in the machine
> +	  description will be expanded and made available in /sys/firmware/mdesc
> +
>   endmenu
>   
>   menu "Bus options (PCI etc.)"
> diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
> index aac6098..328eb9c 100644
> --- a/arch/sparc/kernel/Makefile
> +++ b/arch/sparc/kernel/Makefile
> @@ -118,3 +118,6 @@ obj-$(CONFIG_SPARC64)	+= $(pc--y)
>   
>   obj-$(CONFIG_UPROBES)	+= uprobes.o
>   obj-$(CONFIG_SPARC64)	+= jump_label.o
> +
> +obj-$(CONFIG_MDESC_SYSFS) += mdesc_sysfs.o
> +
> diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
> index fa466ce..75db846 100644
> --- a/arch/sparc/kernel/mdesc.c
> +++ b/arch/sparc/kernel/mdesc.c
> @@ -87,6 +87,10 @@ struct md_node_ops {
>   	mdesc_node_match_f	node_match;
>   };
>   
> +#ifdef MDESC_SYSFS
> +int mdesc_sysfs_populate(void);
> +#endif
> +
>   static int get_vdev_port_node_info(struct mdesc_handle *md, u64 node,
>   				   union md_node_info *node_info);
>   static void rel_vdev_port_node_info(union md_node_info *node_info);
> @@ -523,6 +527,10 @@ void mdesc_update(void)
>   	cur_mdesc = hp;
>   	spin_unlock_irqrestore(&mdesc_lock, flags);
>   
> +#ifdef MDESC_SYSFS
> +	mdesc_sysfs_populate();
> +#endif
> +
>   	mdesc_notify_clients(orig_hp, hp);
>   
>   	spin_lock_irqsave(&mdesc_lock, flags);
> diff --git a/arch/sparc/kernel/mdesc_sysfs.c b/arch/sparc/kernel/mdesc_sysfs.c
> new file mode 100644
> index 0000000..f2e048c
> --- /dev/null
> +++ b/arch/sparc/kernel/mdesc_sysfs.c
> @@ -0,0 +1,831 @@
> +/*
> + * mdesc_sysfs.c - Maps machine description DAG to sysfs
> + *
> + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved.
> + * Author: Eric Saint-Etienne
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; see the file COPYING.  If not, write to
> + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
> + * USA.
> + *
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <asm/mdesc.h>
> +
> +#define MDESC_SYSFS_TOO_LARGE "Error: Not enough space to output the data, retrieve it from /dev/mdesc instead."
> +
> +/*
> + * Machine Description is described in slides 37-39 here:
> + * http://www.oracle.com/technetwork/systems/opensparc/2008-oct-opensparc-slide-cast-09-mw-1539018.html
> + *
> + * It is composed of three main sections described by the following mdesc header.
> + * Since these blocks are contiguous, describing their size is sufficient to
> + * precisely locate them within the mdesc blob.
> + *
> + * - The NODE block contains the nodes of the directed acyclic graph (DAG)
> + * - The NAME block contains names of properties referred to by the DAG nodes
> + * - The DATA block contains values for the properties: strings and binary data
> + *
> + * A public specification of mdesc is available online at:
> + * https://kenai.com/downloads/hypervisor/hypervisor-api-3.0draft7.pdf
> + */
> +struct mdesc_hdr {
> +	u32	version; /* Transport version */
> +	u32	node_sz; /* node block size */
> +	u32	name_sz; /* name block size */
> +	u32	data_sz; /* data block size */
> +} __aligned(16)__;
> +
> +struct mdesc_elem {
> +	u8	tag;
> +#define MD_LIST_END	0x00
> +#define MD_NODE		0x4e
> +#define MD_NODE_END	0x45
> +#define MD_NOOP		0x20
> +#define MD_PROP_ARC	0x61
> +#define MD_PROP_VAL	0x76
> +#define MD_PROP_STR	0x73
> +#define MD_PROP_DATA	0x64
> +	u8	name_len;
> +	u16	resv;
> +	u32	name_offset;
> +	union {
> +		struct {
> +			u32	data_len;
> +			u32	data_offset;
> +		} data;
> +		u64	val;
> +	} d;
> +};
> +
> +struct mdesc_mem_ops {
> +	struct mdesc_handle *(*alloc)(unsigned int mdesc_size);
> +	void (*free)(struct mdesc_handle *handle);
> +};
> +
> +struct mdesc_handle {
> +	struct list_head     list;
> +	struct mdesc_mem_ops *mops;
> +	void                 *self_base;
> +	atomic_t             refcnt;
> +	unsigned int         handle_size;
> +	struct mdesc_hdr     mdesc;
> +};
> +
> +/*
> + * The following 3 functions return a pointer of the right type to the
> + * 3 main mdesc blocks
> + */
> +
> +static inline struct mdesc_elem *node_block(struct mdesc_hdr *mdesc)
> +{
> +	return (struct mdesc_elem *) (mdesc + 1);
> +}
> +
> +static inline void *name_block(struct mdesc_hdr *mdesc)
> +{
> +	return ((void *) node_block(mdesc)) + mdesc->node_sz;
> +}
> +
> +static inline void *data_block(struct mdesc_hdr *mdesc)
> +{
> +	return ((void *) name_block(mdesc)) + mdesc->name_sz;
> +}
> +
> +/*
> + * Dynamically allocates the right amount of characters to format according
> + * to the printf-like format specified in "fmt".
> + * "*buf" is allocated and filled.
> + *
> + * Return 0 upon success
> + */
> +static int alloc_sprintf(char **buf, const char *fmt, ...)
> +{
> +	va_list args;
> +	size_t len, actual;
> +
> +	va_start(args, fmt);
> +	len = vsnprintf(NULL, 0, fmt, args);
> +	va_end(args);
> +
> +	*buf = kmalloc(len+1, GFP_KERNEL);
> +	if (!*buf)
> +		return -ENOMEM;
> +
> +	va_start(args, fmt);
> +	actual = vscnprintf(*buf, len + 1, fmt, args);
> +	va_end(args);
> +
> +	if (actual != len) {
> +		kfree(*buf);
> +		*buf = NULL;
> +		return -ENOMEM;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Flags used with the following formatting functions that modify the way
> + * node names are formatted
> + */
> +#define FORMAT_APPEND    0x01
> +#define FORMAT_USE_INDEX 0x02
> +
> +/*
> + * Take an arc node in "ep" and format it using the "id" field so as to avoid
> + * collisions when adding to sysfs.
> + * Ex: multiple "cpu" nodes in the same folder will be named: "0", "1"...
> + *
> + * When FORMAT_APPEND is used, the names will become: "cpu0", "cpu1"...
> + * The formatted string is found in *formatted, to be freed by the caller
> + *
> + * Return 0 upon success
> + */
> +static int format_using_id(struct mdesc_handle *hp,
> +			   struct mdesc_elem *ep,
> +			   char const *name,
> +			   char **formatted,
> +			   unsigned int flags,
> +			   unsigned int index,
> +			   unsigned int total)
> +{
> +	const u64 *prop_id = mdesc_get_property(hp, ep->d.val, "id", NULL);
> +
> +	if (!prop_id) {
> +		pr_err("mdesc %s doesn't have a property \"id\"\n", name);
> +		return -ENOENT;
> +	}
> +
> +	if (flags & FORMAT_APPEND)
> +		return alloc_sprintf(formatted, "%s%lld", name, *prop_id);
> +
> +	return alloc_sprintf(formatted, "%lld", *prop_id);
> +}
> +
> +/*
> + * Take an arc node in "ep" and format it using the "name" field so as to avoid
> + * collisions when adding to sysfs.
> + * Ex: multiple "virtual-device" nodes in the same folder will be named:
> + * "disk", "network"...
> + *
> + * When FORMAT_APPEND is used: "virtual-device_disk", "virtual-device_network"...
> + *
> + * Because the "name" field is not guaranteed to be unique, one can use the
> + * flag FORMAT_USE_INDEX: "virtual-device0_disk", "virtual-device1_network"...
> + * In this case "index" is used.
> + *
> + * The formatted string is found in *formatted, to be freed by the caller
> + *
> + * Return 0 upon success
> + */
> +static int format_using_name(struct mdesc_handle *hp,
> +			     struct mdesc_elem *ep,
> +			     const char *name,
> +			     char **formatted,
> +			     unsigned int flags,
> +			     unsigned int index,
> +			     unsigned int total)
> +{
> +	const char *prop_name = mdesc_get_property(hp, ep->d.val, "name", NULL);
> +
> +	if (!prop_name) {
> +		pr_err("mdesc %s doesn't have a property \"name\"\n", name);
> +		return -ENOENT;
> +	}
> +
> +	if (flags & FORMAT_APPEND) {
> +		if (flags & FORMAT_USE_INDEX)
> +			return alloc_sprintf(formatted, "%s%d_%s",
> +					     name, index, prop_name);
> +		else
> +			return alloc_sprintf(formatted, "%s_%s",
> +					     name, prop_name);
> +	}
> +
> +	if (flags & FORMAT_USE_INDEX)
> +		return alloc_sprintf(formatted, "%s%d", prop_name, index);
> +	else
> +		return alloc_sprintf(formatted, "%s", prop_name);
> +}
> +
> +/*
> + * Default formatting used in format_name() for arcs nodes.
> + * Called when we find multiple arcs with the same name for which no
> + * formatting is specified in format_nodes array.
> + * This default formatting guarantees uniqueness because it is using the node
> + * index in the mdesc nodes array
> + */
> +static char *format_name_fallback(struct mdesc_handle *hp,
> +				  struct mdesc_elem *ep,
> +				  const char *name,
> +				  unsigned int    index,
> +				  unsigned int    total)
> +{
> +	char *formatted = NULL;
> +	int err;
> +
> +	err = alloc_sprintf(&formatted, "%s_0x%x", name, ep->d.val);
> +	return err ? NULL : formatted;
> +}
> +
> +struct format_node {
> +	char	*name;
> +	int	(*format)(struct mdesc_handle*, struct mdesc_elem*, const char*,
> +			char**, unsigned int, unsigned int, unsigned int);
> +	unsigned int flags;
> +};
> +
> +struct format_node format_nodes[] = {
> +#define FORMAT_APPEND_USE_INDEX (FORMAT_APPEND | FORMAT_USE_INDEX)
> +	{ "virtual-device",       format_using_name, FORMAT_APPEND_USE_INDEX },
> +	{ "domain-services-port", format_using_id,   FORMAT_APPEND },
> +	{ "virtual-device-port",  format_using_id,   FORMAT_APPEND },
> +	{ "channel-endpoint",     format_using_id,   FORMAT_APPEND },
> +	{ "cpu",                  format_using_id,   FORMAT_APPEND },
> +	{ NULL, NULL, 0 }
> +};
> +
> +/*
> + * Format the name of an arc node according to the table "format_nodes".
> + * If no entry is found for the arc node in "ep", it falls back to using
> + * format_name_fallback()
> + * Returns the formatted string, to be freed by the caller
> + */
> +static char *format_name(struct mdesc_handle *hp,
> +			 struct mdesc_elem *ep,
> +			 const char *name,
> +			 int    index,
> +			 int    total)
> +{
> +	struct format_node *fn;
> +	char   *formatted = NULL;
> +	bool   found = false;
> +	int    err = 0;
> +
> +	for (fn = format_nodes; fn->name; fn++) {
> +		if (strcmp(fn->name, name))
> +			continue;
> +		err = fn->format(hp, ep, name, &formatted,
> +				 fn->flags, index, total);
> +		found = true;
> +		break;
> +	}
> +
> +	if (!found && (total != 1)) {
> +		/*
> +		 * there's no handler for this node and we've got more
> +		 * than one so we format it by appending the index
> +		 * (by order of appearance)
> +		 */
> +		err = alloc_sprintf(&formatted, "%s%lld", name, index);
> +	}
> +
> +	if (err)
> +		return format_name_fallback(hp, ep, name, index, total);
> +
> +	return formatted;
> +}
> +
> +/*
> + * Takes an arc node in "ep" and creates a folder in sysfs with the name
> + * provided by format_name(), which uses the format_nodes array as input.
> + * "total" is the number of arc nodes with the same type (ex: cpu)
> + * "index" is the occurrence number of the arc node (ex: cpu number 32)
> + * Fills *kobj with the kobject
> + * Return 0 upon success
> + */
> +static int create_sysfs_folder(struct mdesc_handle *hp,
> +			       struct mdesc_elem *ep,
> +			       const char *name,
> +			       struct kobject **kobj,
> +			       struct kobject *parent_kobj,
> +			       unsigned int index,
> +			       unsigned int total)
> +{
> +	char *formatted;
> +	int  err = 0;
> +
> +	formatted = format_name(hp, ep, name, index, total);
> +	if (!formatted)
> +		formatted = (char *) name;
> +
> +	*kobj = kobject_create_and_add(formatted, parent_kobj);
> +
> +	err = (*kobj == NULL) ? -ENOMEM : 0;
> +	if (err)
> +		pr_err("mdesc_sysfs: unable to create sysfs entry for \"%s\"\n",
> +		       name);
> +
> +	if (formatted != name)
> +		kfree(formatted);
> +
> +	return err;
> +}
> +
> +/*
> + * Similar to create_sysfs_folder() except that because the node in "ep"
> + * already exists elsewhere in sysfs, a link is created instead.
> + * Return 0 upon success
> + */
> +static int create_sysfs_link(struct mdesc_handle *hp,
> +			     struct mdesc_elem *ep,
> +			     const char *name,
> +			     struct kobject *target_kobj,
> +			     struct kobject *parent_kobj,
> +			     unsigned int index,
> +			     unsigned int total)
> +{
> +	char *formatted;
> +	int  err = 0;
> +
> +	formatted = format_name(hp, ep, name, index, total);
> +	if (!formatted)
> +		formatted = (char *) name;
> +
> +	err = sysfs_create_link(parent_kobj, target_kobj, formatted);
> +	if (err)
> +		pr_err("mdesc_sysfs: unable to create sysfs link for \"%s\"\n",
> +		       name);
> +
> +	if (formatted != name)
> +		kfree(formatted);
> +
> +	return err;
> +}
> +
> +struct mdesc_arcs {
> +	struct list_head list;
> +	const char *name;
> +	unsigned int count;
> +};
> +
> +/*
> + * Take a node in "ep" and build a linked list of mdesc_arcs elements.
> + * Each element contains a name and how many occurrences exist in "ep".
> + * Return 0 upon success
> + */
> +static int sysfs_populate_build_arcs_list(struct mdesc_handle *hp,
> +					  struct mdesc_elem *ep,
> +					  struct mdesc_arcs *arcs_list)
> +{
> +	struct mdesc_elem *base = node_block(&hp->mdesc);
> +	const char *names = name_block(&hp->mdesc);
> +
> +	for (ep++; ep->tag != MD_NODE_END; ep++) {
> +		const char *arc_name;
> +		struct mdesc_arcs *p;
> +
> +		/* We only care about the forward arcs of the DAG */
> +		if (ep->tag != MD_PROP_ARC ||
> +				strcmp(names + ep->name_offset, "fwd"))
> +			continue;
> +
> +		arc_name = names + (base + ep->d.val)->name_offset;
> +		list_for_each_entry(p, &arcs_list->list, list) {
> +			if (!strcmp(p->name, arc_name))
> +				break;
> +		}
> +		/* Have we found arc_name in the list? */
> +		if (p == arcs_list) {
> +			/* no, we add it to the list */
> +			p = kzalloc(sizeof(struct mdesc_arcs), GFP_KERNEL);
> +			if (p == NULL) {
> +				struct list_head *pos, *q;
> +
> +				/* An error occurred: free the whole list */
> +				list_for_each_safe(pos, q, &arcs_list->list) {
> +					p = list_entry(pos, struct mdesc_arcs,
> +							list);
> +					list_del(pos);
> +					kfree(p);
> +				}
> +				return -ENOMEM;
> +			}
> +			p->name = arc_name;
> +			/* list_add_tail() keeps mdesc's order */
> +			list_add_tail(&p->list, &arcs_list->list);
> +		}
> +		/* increment the occurrence for this arc */
> +		p->count++;
> +	}
> +	return 0;
> +}
> +
> +struct sysfs_property {
> +	struct kobj_attribute kattr;
> +	char *value;
> +	unsigned int length;
> +};
> +
> +static bool updating;
> +
> +static ssize_t mdesc_sysfs_show(struct kobject *object,
> +				struct kobj_attribute *attr,
> +				char *buf)
> +{
> +	struct sysfs_property *prop;
> +
> +	if (updating)
> +		return -EAGAIN; /* This version is stale */
> +
> +	prop = container_of(attr, struct sysfs_property, kattr);
> +	if (prop->length <= PAGE_SIZE) {
> +		memcpy(buf, prop->value, prop->length);
> +		return prop->length;
> +	}
> +	return sprintf(buf, "%s\n", MDESC_SYSFS_TOO_LARGE);
> +}
> +
> +/*
> + * Append a property to the list that will be later
> + * passed to sysfs_create_files()
> + * "attributes" is a pointer to an array of "struct attribute*"
> + */
> +static int add_property(struct sysfs_property *prop,
> +			struct attribute ***attributes,
> +			int *length,
> +			int *size)
> +{
> +	if (*size <= (*length + 1)) {
> +		struct attribute **p;
> +
> +		*size += 16; /* pre-allocation to relieve the allocator */
> +		p = krealloc(*attributes, *size * sizeof(struct attribute *),
> +				GFP_KERNEL);
> +		if (!p)
> +			return -ENOMEM;
> +
> +		*attributes = p;
> +	}
> +	(*attributes)[(*length)++] = &prop->kattr.attr;
> +	/* the array must be NULL terminated */
> +	(*attributes)[*length] = NULL;
> +	return 0;
> +}
> +
> +/*
> + * Given an arc node in "ep", loop through its entries and create
> + * a list of attributes suitable for sysfs_create_files().
> + * The entries should be unique within "ep" but there are versions
> + * where entries are duplicated, so we only add them once to the list
> + */
> +static int mdesc_sysfs_add_propeties(struct mdesc_handle *hp,
> +				     struct mdesc_elem *ep,
> +				     struct kobject *parent_kobj)
> +{
> +	const char *names = name_block(&hp->mdesc);
> +	void *data = data_block(&hp->mdesc);
> +	struct attribute **sysfs_attributes = NULL;
> +	int sysfs_attributes_length = 0;
> +	int sysfs_attributes_size = 0;
> +	int err;
> +
> +	for (ep++; ep->tag != MD_NODE_END; ep++) {
> +		struct sysfs_property *prop;
> +		char *s = NULL;
> +		size_t len = 0;
> +		bool valid = false;
> +
> +		switch (ep->tag) {
> +		case MD_PROP_VAL:
> +			err = alloc_sprintf(&s, "%lld\n", ep->d.val);
> +			if (err)
> +				return -ENOMEM;
> +			len = strlen(s);
> +			valid = true;
> +			break;
> +		case MD_PROP_STR:
> +			s = kstrndup(data + ep->d.data.data_offset,
> +				     ep->d.data.data_len, GFP_KERNEL);
> +			s = kmalloc(ep->d.data.data_len+1, GFP_KERNEL);
> +			if (s == NULL)
> +				return -ENOMEM;
> +			memcpy(s, data + ep->d.data.data_offset,
> +			       ep->d.data.data_len-1);
> +			s[ep->d.data.data_len-1] = '\n';
> +			s[ep->d.data.data_len] = '\0';
> +			len = ep->d.data.data_len;
> +			valid = true;
> +			break;
> +		case MD_PROP_DATA:
> +			s = kmalloc(ep->d.data.data_len, GFP_KERNEL);
> +			if (s == NULL)
> +				return -ENOMEM;
> +			memcpy(s, data + ep->d.data.data_offset,
> +					ep->d.data.data_len);
> +			len = ep->d.data.data_len;
> +			valid = true;
> +			break;
> +		default:
> +			break;
> +		}
> +
> +		prop = kmalloc(sizeof(*prop), GFP_KERNEL);
> +		if (!prop) {
> +			kfree(s);
> +			kfree(sysfs_attributes);
> +			return -ENOMEM;
> +		}
> +
> +		if (valid) {
> +			const char *c_name = names + ep->name_offset;
> +			char *name;
> +			int i;
> +			bool found = false;
> +
> +			/*
> +			 * Check if a property with the same name
> +			 * has already been added
> +			 */
> +			for (i = 0; i < sysfs_attributes_length; i++) {
> +				struct sysfs_property *prop;
> +
> +				prop = container_of(sysfs_attributes[i],
> +						    struct sysfs_property,
> +						    kattr.attr);
> +				if (!strcmp(prop->kattr.attr.name, c_name)) {
> +					found = true;
> +					break;
> +				}
> +			}
> +			if (found) {
> +				pr_info("mdesc_sysfs: duplicate property found\n");
> +				continue;
> +			}
> +
> +			name = kstrdup(c_name, GFP_KERNEL);
> +			if (!name) {
> +				kfree(s);
> +				kfree(sysfs_attributes);
> +				return -ENOMEM;
> +			}
> +
> +			/* Set the name */
> +			prop->kattr.attr.name = name;
> +			prop->kattr.attr.mode = S_IRUSR;
> +			prop->kattr.show = mdesc_sysfs_show;
> +			prop->kattr.store = NULL;
> +			prop->value = s;
> +			prop->length = len;
> +
> +			/* Add to the (automatically growing) array */
> +			add_property(prop, &sysfs_attributes,
> +				     &sysfs_attributes_length,
> +				     &sysfs_attributes_size);
> +		}
> +	}
> +
> +	if (sysfs_attributes) {
> +		int err;
> +		const struct attribute **pattr = (const struct attribute **)
> +						 sysfs_attributes;
> +
> +		err = sysfs_create_files(parent_kobj, pattr);
> +		if (err)
> +			pr_err("mdesc_sysfs: unable to create sysfs properties\n");
> +
> +		/*
> +		 * Free the array of attributes, but not the individual elements
> +		 * because each element points to a "struct sysfs_property" that
> +		 * mdesc_sysfs_show() uses whenever users read from sysfs
> +		 */
> +		kfree(sysfs_attributes);
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Function called on the root that recursively adds nodes to sysfs.
> + * It first looks at the current node in "ep", gathering statistics
> + * on every forward arc it points to,
> + * then for every arc type it creates a sysfs folder or link.
> + * Finally it adds the properties (non-arcs) to sysfs.
> + * Return 0 upon success
> + */
> +static int sysfs_populate(struct mdesc_handle *hp,
> +			  struct mdesc_elem *ep,
> +			  struct kobject *parent_kobj,
> +			  struct kobject **all_nodes)
> +{
> +	struct mdesc_elem *base = node_block(&hp->mdesc);
> +	const char *names = name_block(&hp->mdesc);
> +	struct mdesc_arcs arcs_list, *p;
> +	struct list_head *pos, *q;
> +	int err = 0;
> +
> +	/*
> +	 * Check that it's indeed an arc node
> +	 * and lose the 1st entry as it's not useful hereafter
> +	 */
> +	if (ep++->tag != MD_NODE) {
> +		pr_err("mdesc_sysfs: Not a valid arc node\n");
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * In a high level language we'd create a dictionary
> +	 * for example: {"cpu": 64, "memory": 1, ...}
> +	 * In the kernel we use a linked list of structs.
> +	 * We end up with: ("cpu", 64) --> ("memory", 1) -->...
> +	 */
> +	INIT_LIST_HEAD(&arcs_list.list);
> +	err = sysfs_populate_build_arcs_list(hp, ep, &arcs_list);
> +	if (err)
> +		return err;
> +
> +	/*
> +	 * For each arc type, we browse the current arc "ep" and
> +	 * add to sysfs. That allows us to keep an index for these
> +	 * arcs that don't contain an "id" or "name" field
> +	 */
> +	list_for_each_entry(p, &arcs_list.list, list) {
> +		struct mdesc_elem *elem;
> +		unsigned int index = 0;
> +
> +		for (elem = ep; elem->tag != MD_NODE_END; elem++) {
> +			struct kobject *kobj;
> +			const char *name;
> +
> +			/* Need to be a forward arc... */
> +			if (elem->tag != MD_PROP_ARC ||
> +			    strcmp(names + elem->name_offset, "fwd"))
> +				continue;
> +
> +			/*
> +			 * ...whose name matches the element of the list
> +			 * that we're currently considering
> +			 */
> +			name = names + (base + elem->d.val)->name_offset;
> +			if (strcmp(p->name, name))
> +				continue;
> +
> +			/* Have we already added this arc to sysfs? */
> +			if (all_nodes[elem->d.val]) {
> +				/* Yes, we link to it */
> +				err = create_sysfs_link(hp, elem, name,
> +					all_nodes[elem->d.val], parent_kobj,
> +					index, p->count);
> +			} else {
> +				/* No, we create a folder for it */
> +				err = create_sysfs_folder(hp, elem, name,
> +					&kobj, parent_kobj, index, p->count);
> +				if (err)
> +					return -ENOMEM;
> +
> +				all_nodes[elem->d.val] = kobj;
> +
> +				/* And we add to sysfs whatever it contains */
> +				sysfs_populate(hp, base + elem->d.val,
> +					       kobj, all_nodes);
> +			}
> +
> +			/*
> +			 * If it's the last entry of its kind, there's
> +			 * no point in looping again
> +			 */
> +			if (++index == p->count)
> +				break;
> +		}
> +	}
> +
> +	/* Free the linked list */
> +	list_for_each_safe(pos, q, &arcs_list.list) {
> +		p = list_entry(pos, struct mdesc_arcs, list);
> +		list_del(pos);
> +		kfree(p);
> +	}
> +
> +	/* Add the properties (non-arc) to sysfs (string, int, binary data) */
> +	return mdesc_sysfs_add_propeties(hp, ep, parent_kobj);
> +}
> +
> +static struct kobject *mdesc_kobj;
> +
> +int mdesc_sysfs_populate(void)
> +{
> +	struct mdesc_handle *hp;
> +	unsigned int num_node;
> +	struct kobject **all_nodes, *old_mdesc_kobj;
> +	unsigned int major, minor;
> +	mode_t mode;
> +	int retval = 0;
> +
> +	hp = mdesc_grab();
> +	if (!hp) {
> +		pr_err("mdesc_sysfs: hv mdesc not found\n");
> +		return -ENODEV;
> +	}
> +
> +	major = hp->mdesc.version >> 16;
> +	minor = hp->mdesc.version & 0xffff;
> +
> +	if (major != 1) {
> +		pr_err("mdesc_sysfs: unsupported hv mdesc version: %d.%d\n",
> +		       major, minor);
> +		mdesc_release(hp);
> +		return -EINVAL;
> +	}
> +
> +	pr_info("mdesc_sysfs: machine desc version %d.%d\n", major, minor);
> +
> +	num_node = hp->mdesc.node_sz / sizeof(struct mdesc_hdr);
> +	all_nodes = kcalloc(num_node, sizeof(struct kobject *), GFP_KERNEL);
> +	if (!all_nodes) {
> +		mdesc_release(hp);
> +		return -ENOMEM;
> +	}
> +
> +	updating = true;
> +
> +	old_mdesc_kobj = mdesc_kobj;
> +	mdesc_kobj = kobject_create_and_add(".mdesc.tmp", firmware_kobj);
> +	if (!mdesc_kobj) {
> +		pr_err("mdesc_sysfs: unable to create sysfs entry\n");
> +		kfree(all_nodes);
> +		mdesc_release(hp);
> +		return -ENOMEM;
> +	}
> +	mode = mdesc_kobj->sd->mode;
> +	/* Remove access for everyone since we're populating
> +	 * the tree. Regardless, root always has access! */
> +	mdesc_kobj->sd->mode &= ~0777;
> +
> +	retval = sysfs_populate(hp, node_block(&hp->mdesc),
> +				mdesc_kobj, all_nodes);
> +
> +	mdesc_release(hp);
> +	kfree(all_nodes);
> +
> +	/* Rename existing mdesc to avoid conflicts */
> +	if (retval)
> +		return retval;
> +
> +	if (old_mdesc_kobj) {
> +		retval = kobject_rename(old_mdesc_kobj, ".mdesc.old");
> +		if (retval) {
> +			pr_err("mdesc_sysfs: unable to rename /sys/firmware/mdesc\n");
> +			kobject_del(mdesc_kobj);
> +			kobject_put(mdesc_kobj);
> +			updating = false;
> +			return retval;
> +		}
> +	}
> +
> +	/* Restore original mode and set name */
> +	retval = kobject_rename(mdesc_kobj, "mdesc");
> +	if (!retval) {
> +		mdesc_kobj->sd->mode = mode;
> +		if (old_mdesc_kobj) {
> +			kobject_del(old_mdesc_kobj);
> +			kobject_put(old_mdesc_kobj);
> +		}
> +	} else {
> +		int dummy;
> +
> +		/* Attempt to restore the old entry (if any) */
> +		if (old_mdesc_kobj)
> +			dummy = kobject_rename(old_mdesc_kobj, "mdesc");
> +		kobject_del(mdesc_kobj);
> +		kobject_put(mdesc_kobj);
> +	}
> +
> +	updating = false;
> +
> +	return retval;
> +}
> +EXPORT_SYMBOL(mdesc_sysfs_populate);
> +
> +MODULE_DESCRIPTION("Maps machine description graph to sysfs in /sys/firmware/mdesc");
> +MODULE_AUTHOR("Oracle");
> +MODULE_LICENSE("GPL");
> +
> +static int __init mdesc_sysfs_init(void)
> +{
> +	return mdesc_sysfs_populate();
> +}
> +
> +static void __exit mdesc_sysfs_cleanup(void)
> +{
> +	if (mdesc_kobj) {
> +		kobject_del(mdesc_kobj);
> +		kobject_put(mdesc_kobj);
> +	}
> +}
> +
> +module_init(mdesc_sysfs_init);
> +module_exit(mdesc_sysfs_cleanup);

--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Dec. 1, 2017, 7:46 p.m. UTC | #2
From: Eric Saint Etienne <eric.saint.etienne@oracle.com>
Date: Fri, 1 Dec 2017 17:15:13 +0000

> Do you have any concern with this patch?

I'm just wondering if the blacklist of sensitive nodes is complete and
also futureproof.

That's my first concern.

Also the hypervisor specification URL is completely non-functional,
the kenai.com stuff went down years ago.

It's pretty ridiculous for me to include code written to a
specification nobody can access and check against the implementation.
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Saint Etienne Dec. 1, 2017, 8:37 p.m. UTC | #3
Hi Dave,

Can you please provide me with an example of an actual mdesc entry that 
needs sanitization? I believe you are thinking of passwords and crypto 
keys but I couldn't find any such entry on any machine I have access to.

It makes sense for this mdesc sysfs module to use the same level of 
sanitization that has been used for so many years for /dev/mdesc without 
any problem.

Regarding the machine description format specification url, what 
specification document did you use or share with the community when you 
initially wrote and upstreamed mdesc.c? Certainly the same document 
could be used for this module as the format hasn't changed since then.

Thank you.

Kind Regards,

-eric


On 01/12/17 19:46, David Miller wrote:
> From: Eric Saint Etienne <eric.saint.etienne@oracle.com>
> Date: Fri, 1 Dec 2017 17:15:13 +0000
>
>> Do you have any concern with this patch?
> I'm just wondering if the blacklist of sensitive nodes is complete and
> also futureproof.
>
> That's my first concern.
>
> Also the hypervisor specification URL is completely non-functional,
> the kenai.com stuff went down years ago.
>
> It's pretty ridiculous for me to include code written to a
> specification nobody can access and check against the implementation.

--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Dec. 1, 2017, 9:28 p.m. UTC | #4
From: Eric Saint Etienne <eric.saint.etienne@oracle.com>
Date: Fri, 1 Dec 2017 20:37:15 +0000

> Regarding the machine description format specification url, what
> specification document did you use or share with the community when
> you initially wrote and upstreamed mdesc.c? Certainly the same
> document could be used for this module as the format hasn't changed
> since then.

kenai.com was up and running at the time I was doing this code.

I also had documents under NDA (which allowed me to publish the work I
did upstream) at the time as well.
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Saint Etienne Dec. 7, 2017, 4:40 p.m. UTC | #5
> Can you please provide me with an example of an actual mdesc
 > entry that needs sanitization? I believe you are thinking of
 > passwords and crypto keys but I couldn't find any such entry
 > on any machine I have access to.

Dave, I still have to read from you on this.

--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Dec. 7, 2017, 6:05 p.m. UTC | #6
From: Eric Saint Etienne <eric.saint.etienne@oracle.com>
Date: Thu, 7 Dec 2017 16:40:14 +0000

>> Can you please provide me with an example of an actual mdesc
>> entry that needs sanitization? I believe you are thinking of
>> passwords and crypto keys but I couldn't find any such entry
>> on any machine I have access to.
> 
> Dave, I still have to read from you on this.

I don't know, but based upon private communication we received from
Greg Onufer some might exist.

Please do a detailed audit of the mdesc properties that might contain
passwords or other sensitive issues, and please provide the results
of your audit on the list here.

Thank you.
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexandre Chartre Dec. 8, 2017, 8:17 a.m. UTC | #7
On 12/07/2017 07:05 PM, David Miller wrote:
> From: Eric Saint Etienne <eric.saint.etienne@oracle.com>
> Date: Thu, 7 Dec 2017 16:40:14 +0000
>
>>> Can you please provide me with an example of an actual mdesc
>>> entry that needs sanitization? I believe you are thinking of
>>> passwords and crypto keys but I couldn't find any such entry
>>> on any machine I have access to.
>>
>> Dave, I still have to read from you on this.
>
> I don't know, but based upon private communication we received from
> Greg Onufer some might exist.
>
> Please do a detailed audit of the mdesc properties that might contain
> passwords or other sensitive issues, and please provide the results
> of your audit on the list here.
>

FYI, on Solaris, when dumping /dev/mdesc, Solaris filters out the "security-password"
property from the "variables" node, and the "keystore" node.

- "security-password" holds the OBP password if it is set
- "keystore" holds security keys.

alex.
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Saint Etienne Dec. 14, 2017, 11:01 a.m. UTC | #8
>>> Can you please provide me with an example of an actual mdesc
>>> entry that needs sanitization? I believe you are thinking of
>>> passwords and crypto keys but I couldn't find any such entry
>>> on any machine I have access to.
>> Dave, I still have to read from you on this.
> I don't know, but based upon private communication we received from
> Greg Onufer some might exist.
>
> Please do a detailed audit of the mdesc properties that might contain
> passwords or other sensitive issues, and please provide the results
> of your audit on the list here.
Alexandre Chartre did part of that audit for us: he pointed out 2 
sensitive mdesc keys that contain passwords and cryptographic keys.

I can't spend much more time on this patch. It's already out there in 
the mailing list archive for whoever wants to use it.

That said this sanitization task should be on top of your sparc todo 
list IMHO because the existing /dev/mdesc driver doesn't filter anything 
as of today, so it leaks critical/sensitive data to the OS.

All the best with fixing /dev/mdesc.

-eric
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Dec. 14, 2017, 1:44 p.m. UTC | #9
From: Eric Saint Etienne <eric.saint.etienne@oracle.com>
Date: Thu, 14 Dec 2017 11:01:58 +0000

> That said this sanitization task should be on top of your sparc todo
> list IMHO because the existing /dev/mdesc driver doesn't filter
> anything as of today, so it leaks critical/sensitive data to the OS.

It's root only.
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Saint Etienne Dec. 14, 2017, 2:59 p.m. UTC | #10
As I stated early in the email thread, the mdesc sysfs module has the same
permissions as the /dev/mdesc chardev. So if you're happy with /dev/mdesc
being root-only, then you should also be happy with mdesc sysfs exposing the
same data with the same permissions, and I don't understand the fuss around
the audit.

Please elaborate as I don't understand what's happening here. Thank you.

Eric Saint-Etienne - Principal Linux Kernel Engineer

On 14/12/17 13:44, David Miller wrote:
> From: Eric Saint Etienne <eric.saint.etienne@oracle.com>
> Date: Thu, 14 Dec 2017 11:01:58 +0000
>
>> That said this sanitization task should be on top of your sparc todo
>> list IMHO because the existing /dev/mdesc driver doesn't filter
>> anything as of today, so it leaks critical/sensitive data to the OS.
> It's root only.

--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Dec. 14, 2017, 4:08 p.m. UTC | #11
From: Eric Saint Etienne <eric.saint.etienne@oracle.com>
Date: Thu, 14 Dec 2017 14:59:55 +0000

> Please elaborate as I don't understand what's happening here. Thank
> you.

Well, re-reviewing your patch I do have technical feedback.

You're copying several datastructures and code from the existing MDESC
code.

Please share as much code as possible with arch/sparc/kernel/mdesc.c

Thank you.
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox series

Patch

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index a4a6261..4e2b640 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -470,6 +470,14 @@  config UBOOT_ENTRY_ADDR
 endmenu
 endif
 
+config MDESC_SYSFS
+	bool "Maps machine description to sysfs"
+	depends on SPARC64
+	default y
+	---help---
+	  If you say Y here, the directed acyclic graph described in the machine
+	  description will be expanded and made available in /sys/firmware/mdesc
+
 endmenu
 
 menu "Bus options (PCI etc.)"
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index aac6098..328eb9c 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -118,3 +118,6 @@  obj-$(CONFIG_SPARC64)	+= $(pc--y)
 
 obj-$(CONFIG_UPROBES)	+= uprobes.o
 obj-$(CONFIG_SPARC64)	+= jump_label.o
+
+obj-$(CONFIG_MDESC_SYSFS) += mdesc_sysfs.o
+
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index fa466ce..75db846 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -87,6 +87,10 @@  struct md_node_ops {
 	mdesc_node_match_f	node_match;
 };
 
+#ifdef CONFIG_MDESC_SYSFS	/* Kconfig symbols reach C code with a CONFIG_ prefix */
+int mdesc_sysfs_populate(void);	/* implemented in mdesc_sysfs.c */
+#endif
+
 static int get_vdev_port_node_info(struct mdesc_handle *md, u64 node,
 				   union md_node_info *node_info);
 static void rel_vdev_port_node_info(union md_node_info *node_info);
@@ -523,6 +527,10 @@  void mdesc_update(void)
 	cur_mdesc = hp;
 	spin_unlock_irqrestore(&mdesc_lock, flags);
 
+#ifdef CONFIG_MDESC_SYSFS	/* rebuild /sys/firmware/mdesc from the new description */
+	mdesc_sysfs_populate();
+#endif
+
 	mdesc_notify_clients(orig_hp, hp);
 
 	spin_lock_irqsave(&mdesc_lock, flags);
diff --git a/arch/sparc/kernel/mdesc_sysfs.c b/arch/sparc/kernel/mdesc_sysfs.c
new file mode 100644
index 0000000..f2e048c
--- /dev/null
+++ b/arch/sparc/kernel/mdesc_sysfs.c
@@ -0,0 +1,831 @@ 
+/*
+ * mdesc_sysfs.c - Maps machine description DAG to sysfs
+ *
+ * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved.
+ * Author: Eric Saint-Etienne
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/mdesc.h>
+
+#define MDESC_SYSFS_TOO_LARGE "Error: Not enough space to output the data, retrieve it from /dev/mdesc instead."
+
+/*
+ * Machine Description is described in slides 37-39 here:
+ * http://www.oracle.com/technetwork/systems/opensparc/2008-oct-opensparc-slide-cast-09-mw-1539018.html
+ *
+ * It is composed of three main sections described by the following mdesc
+ * header. Since these blocks are contiguous, describing their size is
+ * sufficient to precisely locate them within the mdesc blob.
+ *
+ * - The NODE block contains the nodes of the directed acyclic graph (DAG)
+ * - The NAME block contains names of properties referred to by the DAG nodes
+ * - The DATA block contains values for the properties: strings and binary data
+ *
+ * A public specification of mdesc was published at:
+ * https://kenai.com/downloads/hypervisor/hypervisor-api-3.0draft7.pdf
+ * NOTE(review): kenai.com is reported to be offline; point this at a
+ * reachable copy of the specification.
+ */
+struct mdesc_hdr {
+	u32	version; /* Transport version */
+	u32	node_sz; /* node block size */
+	u32	name_sz; /* name block size */
+	u32	data_sz; /* data block size */
+} __aligned(16); /* no trailing identifier: it would declare a variable */
+
+/*
+ * One 16-byte element of the NODE block. The "tag" selects how the union
+ * "d" is interpreted by the code in this file:
+ * - MD_PROP_VAL:  d.val is the integer value of the property
+ * - MD_PROP_ARC:  d.val is used as an index into the NODE block
+ * - MD_PROP_STR / MD_PROP_DATA: d.data locates the value in the DATA block
+ */
+struct mdesc_elem {
+	u8	tag;
+/* Tag values; the non-zero ones are the ASCII codes 'N', 'E', ' ', 'a', 'v', 's', 'd' */
+#define MD_LIST_END	0x00
+#define MD_NODE		0x4e
+#define MD_NODE_END	0x45
+#define MD_NOOP		0x20
+#define MD_PROP_ARC	0x61
+#define MD_PROP_VAL	0x76
+#define MD_PROP_STR	0x73
+#define MD_PROP_DATA	0x64
+	u8	name_len;
+	u16	resv;
+	u32	name_offset; /* offset of the element's name in the NAME block */
+	union {
+		struct {
+			u32	data_len;    /* length of the value, in bytes */
+			u32	data_offset; /* offset of the value in the DATA block */
+		} data;
+		u64	val;
+	} d;
+};
+
+/*
+ * Allocation callbacks attached to a struct mdesc_handle.
+ * Not called from this file; declared here only so that struct mdesc_handle
+ * can be laid out.
+ * NOTE(review): this looks like a copy of a definition private to mdesc.c --
+ * confirm both stay in sync, or share a single definition.
+ */
+struct mdesc_mem_ops {
+	struct mdesc_handle *(*alloc)(unsigned int mdesc_size);
+	void (*free)(struct mdesc_handle *handle);
+};
+
+/*
+ * Handle returned by mdesc_grab(). This file only reads the "mdesc" member,
+ * i.e. the header that is immediately followed by the NODE, NAME and DATA
+ * blocks.
+ * NOTE(review): the layout presumably has to match the definition used by
+ * mdesc.c exactly -- confirm, or share a single definition.
+ */
+struct mdesc_handle {
+	struct list_head     list;
+	struct mdesc_mem_ops *mops;
+	void                 *self_base;
+	atomic_t             refcnt;
+	unsigned int         handle_size;
+	struct mdesc_hdr     mdesc; /* start of the machine description blob */
+};
+
+/*
+ * node_block(), name_block() and data_block() each return a suitably typed
+ * pointer to one of the 3 main mdesc blocks.
+ */
+
+/* The NODE block starts right after the header */
+static inline struct mdesc_elem *node_block(struct mdesc_hdr *mdesc)
+{
+	return (struct mdesc_elem *) &mdesc[1];
+}
+
+/* The NAME block starts node_sz bytes after the start of the NODE block */
+static inline void *name_block(struct mdesc_hdr *mdesc)
+{
+	void *nodes = node_block(mdesc);
+
+	return nodes + mdesc->node_sz;
+}
+
+/* The DATA block starts name_sz bytes after the start of the NAME block */
+static inline void *data_block(struct mdesc_hdr *mdesc)
+{
+	void *names = name_block(mdesc);
+
+	return names + mdesc->name_sz;
+}
+
+/*
+ * Format a printf-like "fmt" into a freshly allocated, NUL-terminated buffer.
+ *
+ * On success "*buf" points to the formatted string, which the caller must
+ * kfree(). On failure "*buf" is set to NULL.
+ *
+ * The __printf() annotation lets the compiler check the arguments against
+ * the format string at every call site.
+ *
+ * Return 0 upon success, -ENOMEM if the buffer cannot be allocated
+ */
+static __printf(2, 3) int alloc_sprintf(char **buf, const char *fmt, ...)
+{
+	va_list args;
+
+	/* kvasprintf() sizes, allocates and fills the buffer in one go */
+	va_start(args, fmt);
+	*buf = kvasprintf(GFP_KERNEL, fmt, args);
+	va_end(args);
+
+	return *buf ? 0 : -ENOMEM;
+}
+
+/*
+ * Flags used with the following formatting functions that modify the way
+ * node names are formatted
+ */
+#define FORMAT_APPEND    0x01
+#define FORMAT_USE_INDEX 0x02
+
+/*
+ * Take an arc element in "ep" and build a name from the "id" property of the
+ * node it points to, so as to avoid collisions when adding to sysfs.
+ * Ex: multiple "cpu" nodes in the same folder will be named: "0", "1"...
+ *
+ * When FORMAT_APPEND is used, the names will become: "cpu0", "cpu1"...
+ * The formatted string is found in *formatted, to be freed by the caller
+ *
+ * "index" and "total" are unused; they are part of the common signature of
+ * the formatters referenced by struct format_node.
+ *
+ * Return 0 upon success, -ENOENT if the node has no "id" property, or the
+ * error returned by alloc_sprintf()
+ */
+static int format_using_id(struct mdesc_handle *hp,
+			   struct mdesc_elem *ep,
+			   char const *name,
+			   char **formatted,
+			   unsigned int flags,
+			   unsigned int index,
+			   unsigned int total)
+{
+	const u64 *prop_id = mdesc_get_property(hp, ep->d.val, "id", NULL);
+
+	if (!prop_id) {
+		pr_err("mdesc %s doesn't have a property \"id\"\n", name);
+		return -ENOENT;
+	}
+
+	/* the id is a u64: print it as unsigned */
+	if (flags & FORMAT_APPEND)
+		return alloc_sprintf(formatted, "%s%llu", name, *prop_id);
+
+	return alloc_sprintf(formatted, "%llu", *prop_id);
+}
+
+/*
+ * Take an arc element in "ep" and build a name from the "name" property of
+ * the node it points to, so as to avoid collisions when adding to sysfs.
+ * Ex: multiple "virtual-device" nodes in the same folder will be named:
+ * "disk", "network"...
+ *
+ * When FORMAT_APPEND is used: "virtual-device_disk",
+ * "virtual-device_network"...
+ *
+ * Because the "name" property is not guaranteed to be unique, one can use
+ * the flag FORMAT_USE_INDEX, in which case "index" is inserted:
+ * - with FORMAT_APPEND: "virtual-device0_disk", "virtual-device1_network"...
+ * - without FORMAT_APPEND: "disk0", "network1"...
+ *
+ * "total" is unused; it is part of the common signature of the formatters
+ * referenced by struct format_node.
+ *
+ * The formatted string is found in *formatted, to be freed by the caller
+ *
+ * Return 0 upon success, -ENOENT if the node has no "name" property, or the
+ * error returned by alloc_sprintf()
+ */
+static int format_using_name(struct mdesc_handle *hp,
+			     struct mdesc_elem *ep,
+			     const char *name,
+			     char **formatted,
+			     unsigned int flags,
+			     unsigned int index,
+			     unsigned int total)
+{
+	const char *prop_name = mdesc_get_property(hp, ep->d.val, "name", NULL);
+
+	if (!prop_name) {
+		pr_err("mdesc %s doesn't have a property \"name\"\n", name);
+		return -ENOENT;
+	}
+
+	/* "index" is unsigned, hence %u */
+	if (flags & FORMAT_APPEND) {
+		if (flags & FORMAT_USE_INDEX)
+			return alloc_sprintf(formatted, "%s%u_%s",
+					     name, index, prop_name);
+
+		return alloc_sprintf(formatted, "%s_%s", name, prop_name);
+	}
+
+	if (flags & FORMAT_USE_INDEX)
+		return alloc_sprintf(formatted, "%s%u", prop_name, index);
+
+	return alloc_sprintf(formatted, "%s", prop_name);
+}
+
+/*
+ * Fallback formatting used by format_name() when the regular formatting of
+ * an arc failed.
+ * The name is suffixed with the value carried by the arc, i.e. the index of
+ * the target node in the mdesc nodes array, which makes it unique.
+ *
+ * "index" and "total" are unused.
+ *
+ * Returns the formatted string, to be freed by the caller, or NULL if it
+ * could not be allocated
+ */
+static char *format_name_fallback(struct mdesc_handle *hp,
+				  struct mdesc_elem *ep,
+				  const char *name,
+				  unsigned int    index,
+				  unsigned int    total)
+{
+	char *formatted = NULL;
+	int err;
+
+	/* d.val is a u64: it must be consumed with a 64-bit conversion */
+	err = alloc_sprintf(&formatted, "%s_0x%llx", name, ep->d.val);
+	return err ? NULL : formatted;
+}
+
+/*
+ * Describes how the arcs pointing to nodes called "name" are named in sysfs:
+ * "format" builds the name and "flags" (FORMAT_*) is handed over to it.
+ */
+struct format_node {
+	char	*name;
+	int	(*format)(struct mdesc_handle*, struct mdesc_elem*, const char*,
+			char**, unsigned int, unsigned int, unsigned int);
+	unsigned int flags;
+};
+
+/*
+ * Table searched by format_name(), terminated by an entry whose name is NULL.
+ * Only used within this file, hence static.
+ */
+static struct format_node format_nodes[] = {
+#define FORMAT_APPEND_USE_INDEX (FORMAT_APPEND | FORMAT_USE_INDEX)
+	{ "virtual-device",       format_using_name, FORMAT_APPEND_USE_INDEX },
+	{ "domain-services-port", format_using_id,   FORMAT_APPEND },
+	{ "virtual-device-port",  format_using_id,   FORMAT_APPEND },
+	{ "channel-endpoint",     format_using_id,   FORMAT_APPEND },
+	{ "cpu",                  format_using_id,   FORMAT_APPEND },
+	{ NULL, NULL, 0 }
+};
+
+/*
+ * Format the name of an arc according to the table "format_nodes".
+ *
+ * - If "name" has an entry in the table, its formatter is used.
+ * - Otherwise, if there is more than one arc of that name ("total" != 1),
+ *   "index" (order of appearance) is appended to the name.
+ * - If either of the above fails, format_name_fallback() is used.
+ *
+ * Returns the formatted string, to be freed by the caller.
+ * Returns NULL when no formatting is needed (no table entry and a single
+ * arc of that name) or when the fallback could not allocate memory; callers
+ * then use "name" as is.
+ */
+static char *format_name(struct mdesc_handle *hp,
+			 struct mdesc_elem *ep,
+			 const char *name,
+			 int    index,
+			 int    total)
+{
+	struct format_node *fn;
+	char   *formatted = NULL;
+	bool   found = false;
+	int    err = 0;
+
+	for (fn = format_nodes; fn->name; fn++) {
+		if (strcmp(fn->name, name))
+			continue;
+		err = fn->format(hp, ep, name, &formatted,
+				 fn->flags, index, total);
+		found = true;
+		break;
+	}
+
+	if (!found && (total != 1)) {
+		/*
+		 * there's no handler for this node and we've got more
+		 * than one so we format it by appending the index
+		 * (by order of appearance).
+		 * "index" is an int, so it must be printed with %d.
+		 */
+		err = alloc_sprintf(&formatted, "%s%d", name, index);
+	}
+
+	if (err)
+		return format_name_fallback(hp, ep, name, index, total);
+
+	return formatted;
+}
+
+/*
+ * Create a sysfs folder under "parent_kobj" for the arc element "ep".
+ * The folder is named after format_name(), or after "name" itself when
+ * format_name() returns nothing.
+ * "total" is the number of arcs with the same name (ex: cpu)
+ * "index" is the occurrence number of this arc (ex: cpu number 32)
+ * *kobj receives the new kobject (NULL on failure)
+ * Return 0 upon success, -ENOMEM otherwise
+ */
+static int create_sysfs_folder(struct mdesc_handle *hp,
+			       struct mdesc_elem *ep,
+			       const char *name,
+			       struct kobject **kobj,
+			       struct kobject *parent_kobj,
+			       unsigned int index,
+			       unsigned int total)
+{
+	char *allocated = format_name(hp, ep, name, index, total);
+	const char *entry = allocated ? allocated : name;
+
+	*kobj = kobject_create_and_add(entry, parent_kobj);
+
+	/* only what format_name() allocated is ours to free */
+	kfree(allocated);
+
+	if (!*kobj) {
+		pr_err("mdesc_sysfs: unable to create sysfs entry for \"%s\"\n",
+		       name);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Counterpart of create_sysfs_folder() for a node that already has a folder
+ * elsewhere in sysfs ("target_kobj"): a link to it is created under
+ * "parent_kobj" instead of a new folder.
+ * Return 0 upon success, or the error from sysfs_create_link()
+ */
+static int create_sysfs_link(struct mdesc_handle *hp,
+			     struct mdesc_elem *ep,
+			     const char *name,
+			     struct kobject *target_kobj,
+			     struct kobject *parent_kobj,
+			     unsigned int index,
+			     unsigned int total)
+{
+	char *allocated = format_name(hp, ep, name, index, total);
+	const char *entry = allocated ? allocated : name;
+	int ret;
+
+	ret = sysfs_create_link(parent_kobj, target_kobj, entry);
+	if (ret)
+		pr_err("mdesc_sysfs: unable to create sysfs link for \"%s\"\n",
+		       name);
+
+	/* only what format_name() allocated is ours to free */
+	kfree(allocated);
+
+	return ret;
+}
+
+/*
+ * Element of the list built by sysfs_populate_build_arcs_list(): one entry
+ * per distinct name of node targeted by the forward arcs of a node.
+ */
+struct mdesc_arcs {
+	struct list_head list;
+	const char *name;   /* points into the mdesc NAME block, not owned */
+	unsigned int count; /* number of forward arcs targeting that name */
+};
+
+/*
+ * Walk the elements following "ep" up to the next MD_NODE_END and record,
+ * in the list headed by "arcs_list", the name of every node targeted by a
+ * forward arc together with the number of arcs targeting that name.
+ * Upon failure the list is emptied and its elements are freed.
+ * Return 0 upon success, -ENOMEM otherwise
+ */
+static int sysfs_populate_build_arcs_list(struct mdesc_handle *hp,
+					  struct mdesc_elem *ep,
+					  struct mdesc_arcs *arcs_list)
+{
+	struct mdesc_elem *base = node_block(&hp->mdesc);
+	const char *names = name_block(&hp->mdesc);
+
+	for (ep++; ep->tag != MD_NODE_END; ep++) {
+		struct mdesc_arcs *entry, *match = NULL;
+		const char *target;
+
+		/* We only care of the forward arcs of the DAG */
+		if (ep->tag != MD_PROP_ARC)
+			continue;
+		if (strcmp(names + ep->name_offset, "fwd"))
+			continue;
+
+		/* Look the name of the targeted node up in the list */
+		target = names + (base + ep->d.val)->name_offset;
+		list_for_each_entry(entry, &arcs_list->list, list) {
+			if (!strcmp(entry->name, target)) {
+				match = entry;
+				break;
+			}
+		}
+
+		if (!match) {
+			/* first arc of this kind: give it its own entry */
+			match = kzalloc(sizeof(struct mdesc_arcs), GFP_KERNEL);
+			if (!match) {
+				struct mdesc_arcs *next;
+
+				/* An error occurred: free the whole list */
+				list_for_each_entry_safe(entry, next,
+							 &arcs_list->list,
+							 list) {
+					list_del(&entry->list);
+					kfree(entry);
+				}
+				return -ENOMEM;
+			}
+			match->name = target;
+			/* adding at the tail keeps mdesc's order */
+			list_add_tail(&match->list, &arcs_list->list);
+		}
+
+		/* one more occurrence of this kind of arc */
+		match->count++;
+	}
+	return 0;
+}
+
+/*
+ * A machine description property exposed as a sysfs file.
+ * mdesc_sysfs_show() gets back to it from the embedded "kattr".
+ */
+struct sysfs_property {
+	struct kobj_attribute kattr;
+	char *value;         /* content returned when the file is read */
+	unsigned int length; /* number of bytes of "value" to return */
+};
+
+/*
+ * Set while mdesc_sysfs_populate() rebuilds the tree; mdesc_sysfs_show()
+ * then returns -EAGAIN.
+ * NOTE(review): read and written without any locking or barrier -- confirm
+ * this is acceptable.
+ */
+static bool updating;
+
+/*
+ * sysfs "show" callback shared by all the properties.
+ * Copies the value of the property into "buf" and returns its length.
+ * Returns -EAGAIN while the tree is being rebuilt. A value that does not
+ * fit in a page is replaced by an explanatory message.
+ */
+static ssize_t mdesc_sysfs_show(struct kobject *object,
+				struct kobj_attribute *attr,
+				char *buf)
+{
+	struct sysfs_property *entry =
+		container_of(attr, struct sysfs_property, kattr);
+
+	/* This version is stale */
+	if (updating)
+		return -EAGAIN;
+
+	if (entry->length > PAGE_SIZE)
+		return sprintf(buf, "%s\n", MDESC_SYSFS_TOO_LARGE);
+
+	memcpy(buf, entry->value, entry->length);
+	return entry->length;
+}
+
+/*
+ * Append a property to the NULL-terminated array that will later be passed
+ * to sysfs_create_files().
+ * "attributes" is a pointer to an array of "struct attribute*"
+ * "*length" is the number of entries in use (excluding the terminator)
+ * "*size" is the number of entries allocated
+ *
+ * The array grows by chunks of 16 entries. When growing fails, "*attributes",
+ * "*length" and "*size" are left untouched so the array remains consistent.
+ *
+ * Return 0 upon success, -ENOMEM otherwise
+ */
+static int add_property(struct sysfs_property *prop,
+			struct attribute ***attributes,
+			int *length,
+			int *size)
+{
+	/* room is needed for the new entry and for the terminator */
+	if (*size <= (*length + 1)) {
+		int new_size = *size + 16; /* relieves the allocator */
+		struct attribute **p;
+
+		p = krealloc(*attributes, new_size * sizeof(*p), GFP_KERNEL);
+		if (!p)
+			return -ENOMEM;
+
+		/* only record the new capacity once it really exists */
+		*attributes = p;
+		*size = new_size;
+	}
+	(*attributes)[(*length)++] = &prop->kattr.attr;
+	/* the array must be NULL terminated */
+	(*attributes)[*length] = NULL;
+	return 0;
+}
+
+/*
+ * Free "count" properties referenced by "attrs" (name, value and the
+ * property itself), then the array. Must only be used on properties that
+ * are not registered in sysfs.
+ */
+static void mdesc_sysfs_free_properties(struct attribute **attrs, int count)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		struct sysfs_property *prop;
+
+		prop = container_of(attrs[i], struct sysfs_property,
+				    kattr.attr);
+		kfree(prop->kattr.attr.name);
+		kfree(prop->value);
+		kfree(prop);
+	}
+	kfree(attrs);
+}
+
+/*
+ * Walk the elements following "ep" up to the next MD_NODE_END and expose
+ * every value, string and data property as a read-only file under
+ * "parent_kobj":
+ * - MD_PROP_VAL:  the value in decimal followed by a newline
+ * - MD_PROP_STR:  the string with its last byte (presumably the NUL
+ *                 terminator) replaced by a newline
+ * - MD_PROP_DATA: the raw bytes
+ * The entries should be unique within a node but there are versions
+ * where entries are duplicated, so we only add them once.
+ *
+ * Return 0 upon success or when the files could not be created in sysfs
+ * (best effort, the failure is logged), -ENOMEM if an allocation failed.
+ *
+ * NOTE(review): the properties registered here are not freed when the
+ * folder is later removed -- confirm how they are meant to be reclaimed.
+ */
+static int mdesc_sysfs_add_propeties(struct mdesc_handle *hp,
+				     struct mdesc_elem *ep,
+				     struct kobject *parent_kobj)
+{
+	const char *names = name_block(&hp->mdesc);
+	void *data = data_block(&hp->mdesc);
+	struct attribute **sysfs_attributes = NULL;
+	int sysfs_attributes_length = 0;
+	int sysfs_attributes_size = 0;
+	int err;
+
+	for (ep++; ep->tag != MD_NODE_END; ep++) {
+		const char *c_name = names + ep->name_offset;
+		struct sysfs_property *prop;
+		bool found = false;
+		char *name;
+		char *s = NULL;
+		size_t len = 0;
+		int i;
+
+		/* arcs and padding are not properties */
+		if (ep->tag != MD_PROP_VAL && ep->tag != MD_PROP_STR &&
+		    ep->tag != MD_PROP_DATA)
+			continue;
+
+		/*
+		 * Check if a property with the same name has already been
+		 * added; done before allocating anything for this one.
+		 */
+		for (i = 0; i < sysfs_attributes_length; i++) {
+			if (!strcmp(sysfs_attributes[i]->name, c_name)) {
+				found = true;
+				break;
+			}
+		}
+		if (found) {
+			pr_info("mdesc_sysfs: duplicate property found\n");
+			continue;
+		}
+
+		switch (ep->tag) {
+		case MD_PROP_VAL:
+			/* d.val is a u64: print it as unsigned */
+			if (alloc_sprintf(&s, "%llu\n", ep->d.val))
+				goto out_nomem;
+			len = strlen(s);
+			break;
+		case MD_PROP_STR:
+			/*
+			 * The last byte of the string becomes a newline.
+			 * An empty string (data_len == 0) is exposed as a
+			 * lone newline rather than underflowing the length.
+			 */
+			len = ep->d.data.data_len ? ep->d.data.data_len : 1;
+			s = kmalloc(len + 1, GFP_KERNEL);
+			if (!s)
+				goto out_nomem;
+			memcpy(s, data + ep->d.data.data_offset, len - 1);
+			s[len - 1] = '\n';
+			s[len] = '\0';
+			break;
+		default: /* MD_PROP_DATA */
+			len = ep->d.data.data_len;
+			s = kmemdup(data + ep->d.data.data_offset, len,
+				    GFP_KERNEL);
+			if (!s)
+				goto out_nomem;
+			break;
+		}
+
+		name = kstrdup(c_name, GFP_KERNEL);
+		prop = kzalloc(sizeof(*prop), GFP_KERNEL);
+		if (!name || !prop)
+			goto out_free_current;
+
+		/* dynamically allocated attributes need this for lockdep */
+		sysfs_attr_init(&prop->kattr.attr);
+		prop->kattr.attr.name = name;
+		prop->kattr.attr.mode = S_IRUSR;
+		prop->kattr.show = mdesc_sysfs_show;
+		prop->kattr.store = NULL;
+		prop->value = s;
+		prop->length = len;
+
+		/* Add to the (automatically growing) array */
+		if (add_property(prop, &sysfs_attributes,
+				 &sysfs_attributes_length,
+				 &sysfs_attributes_size))
+			goto out_free_current;
+
+		continue;
+
+out_free_current:
+		/* the current property is not in the array yet */
+		kfree(prop);
+		kfree(name);
+		kfree(s);
+		goto out_nomem;
+	}
+
+	if (!sysfs_attributes)
+		return 0;
+
+	err = sysfs_create_files(parent_kobj,
+				 (const struct attribute **) sysfs_attributes);
+	if (err) {
+		pr_err("mdesc_sysfs: unable to create sysfs properties\n");
+		/*
+		 * sysfs_create_files() removes the files it managed to
+		 * create before failing, so nothing refers to the
+		 * properties any more.
+		 */
+		mdesc_sysfs_free_properties(sysfs_attributes,
+					    sysfs_attributes_length);
+		return 0;
+	}
+
+	/*
+	 * Free the array of attributes, but not the individual elements
+	 * because each element points to a "struct sysfs_property" that
+	 * mdesc_sysfs_show() uses whenever users read from sysfs
+	 */
+	kfree(sysfs_attributes);
+	return 0;
+
+out_nomem:
+	/* nothing has been registered in sysfs yet: release everything */
+	mdesc_sysfs_free_properties(sysfs_attributes,
+				    sysfs_attributes_length);
+	return -ENOMEM;
+}
+
+/*
+ * Recursively add the node whose MD_NODE element is "ep" to sysfs, under
+ * "parent_kobj".
+ * It first looks at the node, gathering statistics on every forward arc
+ * it contains, then for every arc it creates a sysfs folder (and recurses
+ * into the targeted node) or, when the targeted node already has a folder,
+ * a link to it. Finally it adds the properties (non arcs) to sysfs.
+ *
+ * "all_nodes" is indexed by node index and holds the kobject of every node
+ * that already has a folder.
+ *
+ * "ep" is handed to the helpers still pointing at the MD_NODE element: each
+ * of them steps over that element itself, so stepping over it here as well
+ * would drop the first element of the node and run past the end of a node
+ * that has no element at all.
+ *
+ * Failures to create a link or to populate a child are not fatal: the rest
+ * of the tree is still populated.
+ *
+ * NOTE(review): the recursion depth follows the depth of the graph, and
+ * the arc targets are not checked against the size of "all_nodes" --
+ * confirm both are bounded for the descriptions handed over by the
+ * hypervisor.
+ *
+ * Return 0 upon success
+ */
+static int sysfs_populate(struct mdesc_handle *hp,
+			  struct mdesc_elem *ep,
+			  struct kobject *parent_kobj,
+			  struct kobject **all_nodes)
+{
+	struct mdesc_elem *base = node_block(&hp->mdesc);
+	const char *names = name_block(&hp->mdesc);
+	struct mdesc_arcs arcs_list, *p;
+	struct list_head *pos, *q;
+	int err;
+
+	/* Check that it's indeed the start of a node */
+	if (ep->tag != MD_NODE) {
+		pr_err("mdesc_sysfs: Not a valid arc node\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * In a high level language we'd create a dictionary
+	 * for example: {"cpu": 64", "memory": 1, ...}
+	 * In the kernel we use a linked list of structs.
+	 * We end up with: ("cpu", 64) --> ("memory", 1) -->...
+	 */
+	INIT_LIST_HEAD(&arcs_list.list);
+	err = sysfs_populate_build_arcs_list(hp, ep, &arcs_list);
+	if (err)
+		return err;
+
+	/*
+	 * For each arc type, we browse the elements of the node and
+	 * add to sysfs. That allows us to keep an index for these
+	 * arcs that don't contain an "id" or "name" field
+	 */
+	list_for_each_entry(p, &arcs_list.list, list) {
+		struct mdesc_elem *elem;
+		unsigned int index = 0;
+
+		/* ep + 1 is the first element after the MD_NODE one */
+		for (elem = ep + 1; elem->tag != MD_NODE_END; elem++) {
+			struct kobject *kobj;
+			const char *name;
+
+			/* Need to be a forward arc... */
+			if (elem->tag != MD_PROP_ARC ||
+			    strcmp(names + elem->name_offset, "fwd"))
+				continue;
+
+			/*
+			 * ...whose name matches the element of the list
+			 * that we're currently considering
+			 */
+			name = names + (base + elem->d.val)->name_offset;
+			if (strcmp(p->name, name))
+				continue;
+
+			/* Have we already added this node to sysfs? */
+			if (all_nodes[elem->d.val]) {
+				/* Yes, we link to it (failure is logged) */
+				create_sysfs_link(hp, elem, name,
+					all_nodes[elem->d.val], parent_kobj,
+					index, p->count);
+			} else {
+				/* No, we create a folder for it */
+				err = create_sysfs_folder(hp, elem, name,
+					&kobj, parent_kobj, index, p->count);
+				if (err)
+					goto out_free_arcs;
+
+				all_nodes[elem->d.val] = kobj;
+
+				/* And we add to sysfs whatever it contains */
+				sysfs_populate(hp, base + elem->d.val,
+					       kobj, all_nodes);
+			}
+
+			/*
+			 * If it's the last entry of its kind, there's
+			 * no point in looping again
+			 */
+			if (++index == p->count)
+				break;
+		}
+	}
+
+out_free_arcs:
+	/* Free the linked list, on success and on failure alike */
+	list_for_each_safe(pos, q, &arcs_list.list) {
+		p = list_entry(pos, struct mdesc_arcs, list);
+		list_del(pos);
+		kfree(p);
+	}
+
+	if (err)
+		return err;
+
+	/* Add the properties (non-arc) to sysfs (string, int, binary data) */
+	return mdesc_sysfs_add_propeties(hp, ep, parent_kobj);
+}
+
+/* The kobject currently published as /sys/firmware/mdesc, if any */
+static struct kobject *mdesc_kobj;
+
+/*
+ * (Re)build /sys/firmware/mdesc from the current machine description.
+ *
+ * The new tree is built under a temporary name while "updating" makes every
+ * read of a property return -EAGAIN; it then takes the place of the old
+ * tree, which is removed. Whenever a step fails, the temporary tree is
+ * removed, the old tree (if any) remains the published one and "updating"
+ * is cleared again.
+ *
+ * NOTE(review): only the top-level kobject of a tree is released here; the
+ * kobjects created for the nodes underneath are not -- confirm how they are
+ * meant to be reclaimed.
+ *
+ * Return 0 upon success
+ */
+int mdesc_sysfs_populate(void)
+{
+	struct kobject **all_nodes, *old_mdesc_kobj, *new_mdesc_kobj;
+	struct mdesc_handle *hp;
+	unsigned int num_node;
+	unsigned int major, minor;
+	mode_t mode;
+	int retval;
+
+	hp = mdesc_grab();
+	if (!hp) {
+		pr_err("mdesc_sysfs: hv mdesc not found\n");
+		return -ENODEV;
+	}
+
+	major = hp->mdesc.version >> 16;
+	minor = hp->mdesc.version & 0xffff;
+
+	if (major != 1) {
+		pr_err("mdesc_sysfs: unsupported hv mdesc version: %u.%u\n",
+		       major, minor);
+		mdesc_release(hp);
+		return -EINVAL;
+	}
+
+	pr_info("mdesc_sysfs: machine desc version %u.%u\n", major, minor);
+
+	/* one slot per element of the node block */
+	num_node = hp->mdesc.node_sz / sizeof(struct mdesc_elem);
+	all_nodes = kcalloc(num_node, sizeof(struct kobject *), GFP_KERNEL);
+	if (!all_nodes) {
+		mdesc_release(hp);
+		return -ENOMEM;
+	}
+
+	updating = true;
+	old_mdesc_kobj = mdesc_kobj;
+
+	new_mdesc_kobj = kobject_create_and_add(".mdesc.tmp", firmware_kobj);
+	if (!new_mdesc_kobj) {
+		pr_err("mdesc_sysfs: unable to create sysfs entry\n");
+		retval = -ENOMEM;
+		goto out_release;
+	}
+	mode = new_mdesc_kobj->sd->mode;
+	/*
+	 * Remove access to everyone since we're populating
+	 * the tree. Regardless, root has always access!
+	 */
+	new_mdesc_kobj->sd->mode &= ~0777;
+
+	retval = sysfs_populate(hp, node_block(&hp->mdesc),
+				new_mdesc_kobj, all_nodes);
+	if (retval)
+		goto out_del_new;
+
+	/* Rename existing mdesc to avoid conflicts */
+	if (old_mdesc_kobj) {
+		retval = kobject_rename(old_mdesc_kobj, ".mdesc.old");
+		if (retval) {
+			pr_err("mdesc_sysfs: unable to rename /sys/firmware/mdesc\n");
+			goto out_del_new;
+		}
+	}
+
+	/* Restore original mode and set name */
+	retval = kobject_rename(new_mdesc_kobj, "mdesc");
+	if (retval) {
+		/* Attempt to restore the old entry (if any) */
+		if (old_mdesc_kobj && kobject_rename(old_mdesc_kobj, "mdesc"))
+			pr_err("mdesc_sysfs: unable to restore /sys/firmware/mdesc\n");
+		goto out_del_new;
+	}
+
+	/* The new tree is in place: publish it and drop the old one */
+	new_mdesc_kobj->sd->mode = mode;
+	mdesc_kobj = new_mdesc_kobj;
+	if (old_mdesc_kobj) {
+		kobject_del(old_mdesc_kobj);
+		kobject_put(old_mdesc_kobj);
+	}
+	goto out_release;
+
+out_del_new:
+	/* mdesc_kobj still designates the old tree */
+	kobject_del(new_mdesc_kobj);
+	kobject_put(new_mdesc_kobj);
+out_release:
+	mdesc_release(hp);
+	kfree(all_nodes);
+	updating = false;
+
+	return retval;
+}
+EXPORT_SYMBOL(mdesc_sysfs_populate);
+
+MODULE_DESCRIPTION("Maps machine description graph to sysfs in /sys/firmware/mdesc");
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+/* Initcall: build /sys/firmware/mdesc for the first time */
+static int __init mdesc_sysfs_init(void)
+{
+	int ret = mdesc_sysfs_populate();
+
+	return ret;
+}
+
+/* Remove /sys/firmware/mdesc, if it has been created */
+static void __exit mdesc_sysfs_cleanup(void)
+{
+	if (!mdesc_kobj)
+		return;
+
+	kobject_del(mdesc_kobj);
+	kobject_put(mdesc_kobj);
+}
+
+module_init(mdesc_sysfs_init);
+module_exit(mdesc_sysfs_cleanup);