
[RFC,v2,01/12] Add sys_hotplug.h for system device hotplug framework

Message ID 1357861230-29549-2-git-send-email-toshi.kani@hp.com (mailing list archive)
State Not Applicable

Commit Message

Toshi Kani Jan. 10, 2013, 11:40 p.m. UTC
Added include/linux/sys_hotplug.h, which defines the system device
hotplug framework interfaces used by the framework itself and its
handlers.

The order values define the calling sequence of handlers.  For the
add-execute phase, the ordering is ACPI->MEM->CPU.  Memory is onlined
before CPUs so that threads on new CPUs can start using their local
memory.  The delete-execute ordering is symmetric to the add-execute
ordering.

struct shp_request holds the information for a hot-plug request.  The
device resource information is managed as a list so that a single
request may target multiple devices.

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
---
 include/linux/sys_hotplug.h |  181 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 include/linux/sys_hotplug.h
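
As a rough sketch of how the interfaces declared in this header fit
together (illustrative only, not code from the series; error unwinding
is omitted), a caller could build and submit a request along these lines:

#include <linux/slab.h>
#include <linux/sys_hotplug.h>	/* added by this patch */

static int example_memory_add(int node, u64 start, u64 length)
{
	struct shp_request *req;
	struct shp_device *shp_dev;

	/* describe the requested operation */
	req = shp_alloc_request(SHP_HOTPLUG_ADD);
	if (!req)
		return -ENOMEM;

	/* attach one device resource entry to the request */
	shp_dev = kzalloc(sizeof(*shp_dev), GFP_KERNEL);
	if (!shp_dev)
		return -ENOMEM;
	shp_dev->class = SHP_CLS_MEMORY;
	shp_dev->info.mem.node = node;
	shp_dev->info.mem.start_addr = start;
	shp_dev->info.mem.length = length;
	shp_add_dev_info(req, shp_dev);

	/* handlers then run per phase, in ascending order values */
	return shp_submit_req(req);
}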

Comments

Rafael J. Wysocki Jan. 11, 2013, 9:23 p.m. UTC | #1
On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote:
> Added include/linux/sys_hotplug.h, which defines the system device
> hotplug framework interfaces used by the framework itself and
> handlers.
> 
> The order values define the calling sequence of handlers.  For add
> execute, the ordering is ACPI->MEM->CPU.  Memory is onlined before
> CPU so that threads on new CPUs can start using their local memory.
> The ordering of the delete execute is symmetric to the add execute.
> 
> struct shp_request defines a hot-plug request information.  The
> device resource information is managed with a list so that a single
> request may target to multiple devices.
> 
> Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> ---
>  include/linux/sys_hotplug.h |  181 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 181 insertions(+)
>  create mode 100644 include/linux/sys_hotplug.h
> 
> diff --git a/include/linux/sys_hotplug.h b/include/linux/sys_hotplug.h
> new file mode 100644
> index 0000000..86674dd
> --- /dev/null
> +++ b/include/linux/sys_hotplug.h
> @@ -0,0 +1,181 @@
> +/*
> + * sys_hotplug.h - System device hot-plug framework
> + *
> + * Copyright (C) 2012 Hewlett-Packard Development Company, L.P.
> + *	Toshi Kani <toshi.kani@hp.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef _LINUX_SYS_HOTPLUG_H
> +#define _LINUX_SYS_HOTPLUG_H
> +
> +#include <linux/list.h>
> +#include <linux/device.h>
> +
> +/*
> + * System device hot-plug operation proceeds in the following order.
> + *   Validate phase -> Execute phase -> Commit phase
> + *
> + * The order values below define the calling sequence of platform
> + * neutral handlers for each phase in ascending order.  The order
> + * values of firmware-specific handlers are defined in sys_hotplug.h
> + * under firmware specific directories.
> + */
> +
> +/* All order values must be smaller than this value */
> +#define SHP_ORDER_MAX				0xffffff
> +
> +/* Add Validate order values */
> +
> +/* Add Execute order values */
> +#define SHP_MEM_ADD_EXECUTE_ORDER		100
> +#define SHP_CPU_ADD_EXECUTE_ORDER		110
> +
> +/* Add Commit order values */
> +
> +/* Delete Validate order values */
> +#define SHP_CPU_DEL_VALIDATE_ORDER		100
> +#define SHP_MEM_DEL_VALIDATE_ORDER		110
> +
> +/* Delete Execute order values */
> +#define SHP_CPU_DEL_EXECUTE_ORDER		10
> +#define SHP_MEM_DEL_EXECUTE_ORDER		20
> +
> +/* Delete Commit order values */
> +
> +/*
> + * Hot-plug request types
> + */
> +#define SHP_REQ_ADD		0x000000
> +#define SHP_REQ_DELETE		0x000001
> +#define SHP_REQ_MASK		0x0000ff
> +
> +/*
> + * Hot-plug phase types
> + */
> +#define SHP_PH_VALIDATE		0x000000
> +#define SHP_PH_EXECUTE		0x000100
> +#define SHP_PH_COMMIT		0x000200
> +#define SHP_PH_MASK		0x00ff00
> +
> +/*
> + * Hot-plug operation types
> + */
> +#define SHP_OP_HOTPLUG		0x000000
> +#define SHP_OP_ONLINE		0x010000
> +#define SHP_OP_MASK		0xff0000
> +
> +/*
> + * Hot-plug phases
> + */
> +enum shp_phase {
> +	SHP_ADD_VALIDATE	= (SHP_REQ_ADD|SHP_PH_VALIDATE),
> +	SHP_ADD_EXECUTE		= (SHP_REQ_ADD|SHP_PH_EXECUTE),
> +	SHP_ADD_COMMIT		= (SHP_REQ_ADD|SHP_PH_COMMIT),
> +	SHP_DEL_VALIDATE	= (SHP_REQ_DELETE|SHP_PH_VALIDATE),
> +	SHP_DEL_EXECUTE		= (SHP_REQ_DELETE|SHP_PH_EXECUTE),
> +	SHP_DEL_COMMIT		= (SHP_REQ_DELETE|SHP_PH_COMMIT)
> +};
> +
> +/*
> + * Hot-plug operations
> + */
> +enum shp_operation {
> +	SHP_HOTPLUG_ADD		= (SHP_OP_HOTPLUG|SHP_REQ_ADD),
> +	SHP_HOTPLUG_DEL		= (SHP_OP_HOTPLUG|SHP_REQ_DELETE),
> +	SHP_ONLINE_ADD		= (SHP_OP_ONLINE|SHP_REQ_ADD),
> +	SHP_ONLINE_DEL		= (SHP_OP_ONLINE|SHP_REQ_DELETE)
> +};
> +
> +/*
> + * Hot-plug device classes
> + */
> +enum shp_class {
> +	SHP_CLS_INVALID		= 0,
> +	SHP_CLS_CPU		= 1,
> +	SHP_CLS_MEMORY		= 2,
> +	SHP_CLS_HOSTBRIDGE	= 3,
> +	SHP_CLS_CONTAINER	= 4,
> +};
> +
> +/*
> + * Hot-plug device information
> + */
> +union shp_dev_info {
> +	struct shp_cpu {
> +		u32		cpu_id;
> +	} cpu;
> +
> +	struct shp_memory {
> +		int		node;
> +		u64		start_addr;
> +		u64		length;
> +	} mem;
> +
> +	struct shp_hostbridge {
> +	} hb;
> +
> +	struct shp_node {
> +	} node;
> +};
> +
> +struct shp_device {
> +	struct list_head	list;
> +	struct device		*device;
> +	enum shp_class		class;
> +	union shp_dev_info	info;
> +};
> +
> +/*
> + * Hot-plug request
> + */
> +struct shp_request {
> +	/* common info */
> +	enum shp_operation	operation;	/* operation */
> +
> +	/* hot-plug event info: only valid for hot-plug operations */
> +	void			*handle;	/* FW handle */

What's the role of handle here?


> +	u32			event;		/* FW event */
> +
> +	/* device resource info */
> +	struct list_head	dev_list;	/* shp_device list */
> +};
> +
> +/*
> + * Inline Utility Functions
> + */
> +static inline bool shp_is_hotplug_op(enum shp_operation operation)
> +{
> +	return (operation & SHP_OP_MASK) == SHP_OP_HOTPLUG;
> +}
> +
> +static inline bool shp_is_online_op(enum shp_operation operation)
> +{
> +	return (operation & SHP_OP_MASK) == SHP_OP_ONLINE;
> +}
> +
> +static inline bool shp_is_add_op(enum shp_operation operation)
> +{
> +	return (operation & SHP_REQ_MASK) == SHP_REQ_ADD;
> +}
> +
> +static inline bool shp_is_add_phase(enum shp_phase phase)
> +{
> +	return (phase & SHP_REQ_MASK) == SHP_REQ_ADD;
> +}
> +
> +/*
> + * Externs
> + */
> +typedef int (*shp_func)(struct shp_request *req, int rollback);
> +extern int shp_register_handler(enum shp_phase phase, shp_func func, u32 order);
> +extern int shp_unregister_handler(enum shp_phase phase, shp_func func);
> +extern int shp_submit_req(struct shp_request *req);
> +extern struct shp_request *shp_alloc_request(enum shp_operation operation);
> +extern void shp_add_dev_info(struct shp_request *shp_req,
> +		struct shp_device *shp_dev);
> +
> +#endif	/* _LINUX_SYS_HOTPLUG_H */
Toshi Kani Jan. 14, 2013, 3:33 p.m. UTC | #2
On Fri, 2013-01-11 at 22:23 +0100, Rafael J. Wysocki wrote:
> On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote:
> > Added include/linux/sys_hotplug.h, which defines the system device
> > hotplug framework interfaces used by the framework itself and
> > handlers.
> > 
> > The order values define the calling sequence of handlers.  For add
> > execute, the ordering is ACPI->MEM->CPU.  Memory is onlined before
> > CPU so that threads on new CPUs can start using their local memory.
> > The ordering of the delete execute is symmetric to the add execute.
> > 
> > struct shp_request defines a hot-plug request information.  The
> > device resource information is managed with a list so that a single
> > request may target to multiple devices.
> > 
 :
> > +
> > +struct shp_device {
> > +	struct list_head	list;
> > +	struct device		*device;
> > +	enum shp_class		class;
> > +	union shp_dev_info	info;
> > +};
> > +
> > +/*
> > + * Hot-plug request
> > + */
> > +struct shp_request {
> > +	/* common info */
> > +	enum shp_operation	operation;	/* operation */
> > +
> > +	/* hot-plug event info: only valid for hot-plug operations */
> > +	void			*handle;	/* FW handle */
> 
> What's the role of handle here?

On ACPI-based platforms, the handle keeps the notified ACPI handle from
when a hot-plug request is made.  The ACPI bus handlers, acpi_add_execute() /
acpi_del_execute(), then scan / trim ACPI devices starting from that handle.
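
For illustration, the flow is roughly the following (a sketch only, not
the actual notify code from the series; the function name is made up):

#include <linux/acpi.h>
#include <linux/sys_hotplug.h>

static void example_acpi_hotplug_notify(acpi_handle handle, u32 event,
					void *data)
{
	struct shp_request *req;

	req = shp_alloc_request(SHP_HOTPLUG_ADD);
	if (!req)
		return;

	req->handle = handle;	/* notified ACPI handle */
	req->event = event;	/* e.g. ACPI_NOTIFY_BUS_CHECK */

	/* acpi_add_execute() later scans ACPI devices from req->handle */
	shp_submit_req(req);
}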

Thanks,
-Toshi
Rafael J. Wysocki Jan. 14, 2013, 6:48 p.m. UTC | #3
On Monday, January 14, 2013 08:33:48 AM Toshi Kani wrote:
> On Fri, 2013-01-11 at 22:23 +0100, Rafael J. Wysocki wrote:
> > On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote:
> > > Added include/linux/sys_hotplug.h, which defines the system device
> > > hotplug framework interfaces used by the framework itself and
> > > handlers.
> > > 
> > > The order values define the calling sequence of handlers.  For add
> > > execute, the ordering is ACPI->MEM->CPU.  Memory is onlined before
> > > CPU so that threads on new CPUs can start using their local memory.
> > > The ordering of the delete execute is symmetric to the add execute.
> > > 
> > > struct shp_request defines a hot-plug request information.  The
> > > device resource information is managed with a list so that a single
> > > request may target to multiple devices.
> > > 
>  :
> > > +
> > > +struct shp_device {
> > > +	struct list_head	list;
> > > +	struct device		*device;
> > > +	enum shp_class		class;
> > > +	union shp_dev_info	info;
> > > +};
> > > +
> > > +/*
> > > + * Hot-plug request
> > > + */
> > > +struct shp_request {
> > > +	/* common info */
> > > +	enum shp_operation	operation;	/* operation */
> > > +
> > > +	/* hot-plug event info: only valid for hot-plug operations */
> > > +	void			*handle;	/* FW handle */
> > 
> > What's the role of handle here?
> 
> On ACPI-based platforms, the handle keeps a notified ACPI handle when a
> hot-plug request is made.  ACPI bus handlers, acpi_add_execute() /
> acpi_del_execute(), then scans / trims ACPI devices from the handle.

OK, so this is ACPI-specific and should be described as such.

Thanks,
Rafael
Toshi Kani Jan. 14, 2013, 7:02 p.m. UTC | #4
On Mon, 2013-01-14 at 19:48 +0100, Rafael J. Wysocki wrote:
> On Monday, January 14, 2013 08:33:48 AM Toshi Kani wrote:
> > On Fri, 2013-01-11 at 22:23 +0100, Rafael J. Wysocki wrote:
> > > On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote:
> > > > Added include/linux/sys_hotplug.h, which defines the system device
> > > > hotplug framework interfaces used by the framework itself and
> > > > handlers.
> > > > 
> > > > The order values define the calling sequence of handlers.  For add
> > > > execute, the ordering is ACPI->MEM->CPU.  Memory is onlined before
> > > > CPU so that threads on new CPUs can start using their local memory.
> > > > The ordering of the delete execute is symmetric to the add execute.
> > > > 
> > > > struct shp_request defines a hot-plug request information.  The
> > > > device resource information is managed with a list so that a single
> > > > request may target to multiple devices.
> > > > 
> >  :
> > > > +
> > > > +struct shp_device {
> > > > +	struct list_head	list;
> > > > +	struct device		*device;
> > > > +	enum shp_class		class;
> > > > +	union shp_dev_info	info;
> > > > +};
> > > > +
> > > > +/*
> > > > + * Hot-plug request
> > > > + */
> > > > +struct shp_request {
> > > > +	/* common info */
> > > > +	enum shp_operation	operation;	/* operation */
> > > > +
> > > > +	/* hot-plug event info: only valid for hot-plug operations */
> > > > +	void			*handle;	/* FW handle */
> > > 
> > > What's the role of handle here?
> > 
> > On ACPI-based platforms, the handle keeps a notified ACPI handle when a
> > hot-plug request is made.  ACPI bus handlers, acpi_add_execute() /
> > acpi_del_execute(), then scans / trims ACPI devices from the handle.
> 
> OK, so this is ACPI-specific and should be described as such.

The other FW interface I know of is parisc, which has mod_index (module
index) to identify a unique object, just as an ACPI handle does.  The
handle can hold the mod_index as an opaque value as well.  But as you
said, I do not know whether the handle works for all other FWs.  So, I
will add a description noting that the hot-plug event info is modeled
after ACPI and may need to be revisited when supporting other FW.

Thanks,
-Toshi
Greg Kroah-Hartman Jan. 30, 2013, 4:48 a.m. UTC | #5
On Mon, Jan 14, 2013 at 12:02:04PM -0700, Toshi Kani wrote:
> On Mon, 2013-01-14 at 19:48 +0100, Rafael J. Wysocki wrote:
> > On Monday, January 14, 2013 08:33:48 AM Toshi Kani wrote:
> > > On Fri, 2013-01-11 at 22:23 +0100, Rafael J. Wysocki wrote:
> > > > On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote:
> > > > > Added include/linux/sys_hotplug.h, which defines the system device
> > > > > hotplug framework interfaces used by the framework itself and
> > > > > handlers.
> > > > > 
> > > > > The order values define the calling sequence of handlers.  For add
> > > > > execute, the ordering is ACPI->MEM->CPU.  Memory is onlined before
> > > > > CPU so that threads on new CPUs can start using their local memory.
> > > > > The ordering of the delete execute is symmetric to the add execute.
> > > > > 
> > > > > struct shp_request defines a hot-plug request information.  The
> > > > > device resource information is managed with a list so that a single
> > > > > request may target to multiple devices.
> > > > > 
> > >  :
> > > > > +
> > > > > +struct shp_device {
> > > > > +	struct list_head	list;
> > > > > +	struct device		*device;
> > > > > +	enum shp_class		class;
> > > > > +	union shp_dev_info	info;
> > > > > +};
> > > > > +
> > > > > +/*
> > > > > + * Hot-plug request
> > > > > + */
> > > > > +struct shp_request {
> > > > > +	/* common info */
> > > > > +	enum shp_operation	operation;	/* operation */
> > > > > +
> > > > > +	/* hot-plug event info: only valid for hot-plug operations */
> > > > > +	void			*handle;	/* FW handle */
> > > > 
> > > > What's the role of handle here?
> > > 
> > > On ACPI-based platforms, the handle keeps a notified ACPI handle when a
> > > hot-plug request is made.  ACPI bus handlers, acpi_add_execute() /
> > > acpi_del_execute(), then scans / trims ACPI devices from the handle.
> > 
> > OK, so this is ACPI-specific and should be described as such.
> 
> Other FW interface I know is parisc, which has mod_index (module index)
> to identify a unique object, just like what ACPI handle does.  The
> handle can keep the mod_index as an opaque value as well.  But as you
> said, I do not know if the handle works for all other FWs.  So, I will
> add descriptions, such that the hot-plug event info is modeled after
> ACPI and may need to be revisited when supporting other FW.

Please make it a "real" pointer, not a void *; those shouldn't be
used at all if possible.

thanks,

greg k-h
Greg Kroah-Hartman Jan. 30, 2013, 4:53 a.m. UTC | #6
On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote:
> Added include/linux/sys_hotplug.h, which defines the system device
> hotplug framework interfaces used by the framework itself and
> handlers.
> 
> The order values define the calling sequence of handlers.  For add
> execute, the ordering is ACPI->MEM->CPU.  Memory is onlined before
> CPU so that threads on new CPUs can start using their local memory.
> The ordering of the delete execute is symmetric to the add execute.
> 
> struct shp_request defines a hot-plug request information.  The
> device resource information is managed with a list so that a single
> request may target to multiple devices.
> 
> Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> ---
>  include/linux/sys_hotplug.h |  181 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 181 insertions(+)
>  create mode 100644 include/linux/sys_hotplug.h
> 
> diff --git a/include/linux/sys_hotplug.h b/include/linux/sys_hotplug.h
> new file mode 100644
> index 0000000..86674dd
> --- /dev/null
> +++ b/include/linux/sys_hotplug.h
> @@ -0,0 +1,181 @@
> +/*
> + * sys_hotplug.h - System device hot-plug framework
> + *
> + * Copyright (C) 2012 Hewlett-Packard Development Company, L.P.
> + *	Toshi Kani <toshi.kani@hp.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef _LINUX_SYS_HOTPLUG_H
> +#define _LINUX_SYS_HOTPLUG_H
> +
> +#include <linux/list.h>
> +#include <linux/device.h>
> +
> +/*
> + * System device hot-plug operation proceeds in the following order.
> + *   Validate phase -> Execute phase -> Commit phase
> + *
> + * The order values below define the calling sequence of platform
> + * neutral handlers for each phase in ascending order.  The order
> + * values of firmware-specific handlers are defined in sys_hotplug.h
> + * under firmware specific directories.
> + */
> +
> +/* All order values must be smaller than this value */
> +#define SHP_ORDER_MAX				0xffffff
> +
> +/* Add Validate order values */
> +
> +/* Add Execute order values */
> +#define SHP_MEM_ADD_EXECUTE_ORDER		100
> +#define SHP_CPU_ADD_EXECUTE_ORDER		110
> +
> +/* Add Commit order values */
> +
> +/* Delete Validate order values */
> +#define SHP_CPU_DEL_VALIDATE_ORDER		100
> +#define SHP_MEM_DEL_VALIDATE_ORDER		110
> +
> +/* Delete Execute order values */
> +#define SHP_CPU_DEL_EXECUTE_ORDER		10
> +#define SHP_MEM_DEL_EXECUTE_ORDER		20
> +
> +/* Delete Commit order values */
> +

Empty value?

Anyway, as I said before, don't use "values"; just call things directly
in the order you need to.

This isn't like other operating systems; we don't need to be so
"flexible", since we can modify the core code as much as we want and
need to if future things come along :)

thanks,

greg k-h
Greg Kroah-Hartman Jan. 30, 2013, 4:58 a.m. UTC | #7
On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote:
> +/*
> + * Hot-plug device information
> + */

Again, stop it with the "generic" hotplug term here, and everywhere
else.  You are doing a very _specific_ type of hotplug device, so spell
it out.  We've worked hard to hotplug _everything_ in Linux; you are
going to confuse a lot of people with this kind of term.

> +union shp_dev_info {
> +	struct shp_cpu {
> +		u32		cpu_id;
> +	} cpu;

What is this?  Why not point to the system device for the cpu?

> +	struct shp_memory {
> +		int		node;
> +		u64		start_addr;
> +		u64		length;
> +	} mem;

Same here, why not point to the system device?

> +	struct shp_hostbridge {
> +	} hb;
> +
> +	struct shp_node {
> +	} node;

What happened here with these?  Empty structures?  Huh?

> +};
> +
> +struct shp_device {
> +	struct list_head	list;
> +	struct device		*device;

No, make it a "real" device, embed the device into it.

But, again, I'm going to ask why you aren't using the existing cpu /
memory / bridge / node devices that we have in the kernel.  Please use
them, or give me a _really_ good reason why they will not work.

> +	enum shp_class		class;
> +	union shp_dev_info	info;
> +};
> +
> +/*
> + * Hot-plug request
> + */
> +struct shp_request {
> +	/* common info */
> +	enum shp_operation	operation;	/* operation */
> +
> +	/* hot-plug event info: only valid for hot-plug operations */
> +	void			*handle;	/* FW handle */
> +	u32			event;		/* FW event */

What is this?

greg k-h
Toshi Kani Jan. 31, 2013, 1:15 a.m. UTC | #8
On Tue, 2013-01-29 at 23:48 -0500, Greg KH wrote:
> On Mon, Jan 14, 2013 at 12:02:04PM -0700, Toshi Kani wrote:
> > On Mon, 2013-01-14 at 19:48 +0100, Rafael J. Wysocki wrote:
> > > On Monday, January 14, 2013 08:33:48 AM Toshi Kani wrote:
> > > > On Fri, 2013-01-11 at 22:23 +0100, Rafael J. Wysocki wrote:
> > > > > On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote:
> > > > > > Added include/linux/sys_hotplug.h, which defines the system device
> > > > > > hotplug framework interfaces used by the framework itself and
> > > > > > handlers.
> > > > > > 
> > > > > > The order values define the calling sequence of handlers.  For add
> > > > > > execute, the ordering is ACPI->MEM->CPU.  Memory is onlined before
> > > > > > CPU so that threads on new CPUs can start using their local memory.
> > > > > > The ordering of the delete execute is symmetric to the add execute.
> > > > > > 
> > > > > > struct shp_request defines a hot-plug request information.  The
> > > > > > device resource information is managed with a list so that a single
> > > > > > request may target to multiple devices.
> > > > > > 
> > > >  :
> > > > > > +
> > > > > > +struct shp_device {
> > > > > > +	struct list_head	list;
> > > > > > +	struct device		*device;
> > > > > > +	enum shp_class		class;
> > > > > > +	union shp_dev_info	info;
> > > > > > +};
> > > > > > +
> > > > > > +/*
> > > > > > + * Hot-plug request
> > > > > > + */
> > > > > > +struct shp_request {
> > > > > > +	/* common info */
> > > > > > +	enum shp_operation	operation;	/* operation */
> > > > > > +
> > > > > > +	/* hot-plug event info: only valid for hot-plug operations */
> > > > > > +	void			*handle;	/* FW handle */
> > > > > 
> > > > > What's the role of handle here?
> > > > 
> > > > On ACPI-based platforms, the handle keeps a notified ACPI handle when a
> > > > hot-plug request is made.  ACPI bus handlers, acpi_add_execute() /
> > > > acpi_del_execute(), then scans / trims ACPI devices from the handle.
> > > 
> > > OK, so this is ACPI-specific and should be described as such.
> > 
> > Other FW interface I know is parisc, which has mod_index (module index)
> > to identify a unique object, just like what ACPI handle does.  The
> > handle can keep the mod_index as an opaque value as well.  But as you
> > said, I do not know if the handle works for all other FWs.  So, I will
> > add descriptions, such that the hot-plug event info is modeled after
> > ACPI and may need to be revisited when supporting other FW.
> 
> Please make it a "real" pointer, and not a void *, those shouldn't be
> used at all if possible.

How about changing the "void *handle" to acpi_dev_node below?

   struct acpi_dev_node    acpi_node;

Basically, it has the same challenge as struct device, which uses
acpi_dev_node as well.  We can add other FW nodes when needed (just as
struct device also has *of_node).
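
With that change, the struct would look something like this (sketch only):

struct shp_request {
	/* common info */
	enum shp_operation	operation;	/* operation */

	/* hot-plug event info: only valid for hot-plug operations */
	struct acpi_dev_node	acpi_node;	/* was: void *handle */
	u32			event;		/* FW event */

	/* device resource info */
	struct list_head	dev_list;	/* shp_device list */
};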

Thanks,
-Toshi
Toshi Kani Jan. 31, 2013, 1:46 a.m. UTC | #9
On Tue, 2013-01-29 at 23:53 -0500, Greg KH wrote:
> On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote:
> > Added include/linux/sys_hotplug.h, which defines the system device
> > hotplug framework interfaces used by the framework itself and
> > handlers.
> > 
> > The order values define the calling sequence of handlers.  For add
> > execute, the ordering is ACPI->MEM->CPU.  Memory is onlined before
> > CPU so that threads on new CPUs can start using their local memory.
> > The ordering of the delete execute is symmetric to the add execute.
> > 
> > struct shp_request defines a hot-plug request information.  The
> > device resource information is managed with a list so that a single
> > request may target to multiple devices.
> > 
> > Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> > ---
> >  include/linux/sys_hotplug.h |  181 +++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 181 insertions(+)
> >  create mode 100644 include/linux/sys_hotplug.h
> > 
> > diff --git a/include/linux/sys_hotplug.h b/include/linux/sys_hotplug.h
> > new file mode 100644
> > index 0000000..86674dd
> > --- /dev/null
> > +++ b/include/linux/sys_hotplug.h
> > @@ -0,0 +1,181 @@
> > +/*
> > + * sys_hotplug.h - System device hot-plug framework
> > + *
> > + * Copyright (C) 2012 Hewlett-Packard Development Company, L.P.
> > + *	Toshi Kani <toshi.kani@hp.com>
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + */
> > +
> > +#ifndef _LINUX_SYS_HOTPLUG_H
> > +#define _LINUX_SYS_HOTPLUG_H
> > +
> > +#include <linux/list.h>
> > +#include <linux/device.h>
> > +
> > +/*
> > + * System device hot-plug operation proceeds in the following order.
> > + *   Validate phase -> Execute phase -> Commit phase
> > + *
> > + * The order values below define the calling sequence of platform
> > + * neutral handlers for each phase in ascending order.  The order
> > + * values of firmware-specific handlers are defined in sys_hotplug.h
> > + * under firmware specific directories.
> > + */
> > +
> > +/* All order values must be smaller than this value */
> > +#define SHP_ORDER_MAX				0xffffff
> > +
> > +/* Add Validate order values */
> > +
> > +/* Add Execute order values */
> > +#define SHP_MEM_ADD_EXECUTE_ORDER		100
> > +#define SHP_CPU_ADD_EXECUTE_ORDER		110
> > +
> > +/* Add Commit order values */
> > +
> > +/* Delete Validate order values */
> > +#define SHP_CPU_DEL_VALIDATE_ORDER		100
> > +#define SHP_MEM_DEL_VALIDATE_ORDER		110
> > +
> > +/* Delete Execute order values */
> > +#define SHP_CPU_DEL_EXECUTE_ORDER		10
> > +#define SHP_MEM_DEL_EXECUTE_ORDER		20
> > +
> > +/* Delete Commit order values */
> > +
> 
> Empty value?

Yes, in this version, all the delete commit order values are defined in
<acpi/sys_hotplug.h>.

> Anyway, as I said before, don't use "values", just call things directly
> in the order you need to.
> 
> This isn't like other operating systems, we don't need to be so
> "flexible", we can modify the core code as much as we want and need to
> if future things come along :)

Understood.  As described in the previous email, I will define them with
an enum and avoid using values.

Thanks,
-Toshi
Toshi Kani Jan. 31, 2013, 2:57 a.m. UTC | #10
On Tue, 2013-01-29 at 23:58 -0500, Greg KH wrote:
> On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote:
> > +/*
> > + * Hot-plug device information
> > + */
> 
> Again, stop it with the "generic" hotplug term here, and everywhere
> else.  You are doing a very _specific_ type of hotplug devices, so spell
> it out.  We've worked hard to hotplug _everything_ in Linux, you are
> going to confuse a lot of people with this type of terms.

Agreed.  I will clarify in all places.

> > +union shp_dev_info {
> > +	struct shp_cpu {
> > +		u32		cpu_id;
> > +	} cpu;
> 
> What is this?  Why not point to the system device for the cpu?

This info is used to on-line a new CPU and create its system/cpu device.
In other words, a system/cpu device is created as a result of CPU
hotplug.

> > +	struct shp_memory {
> > +		int		node;
> > +		u64		start_addr;
> > +		u64		length;
> > +	} mem;
> 
> Same here, why not point to the system device?

Same as above.

> > +	struct shp_hostbridge {
> > +	} hb;
> > +
> > +	struct shp_node {
> > +	} node;
> 
> What happened here with these?  Empty structures?  Huh?

They are placeholders for now.  PCI bridge hot-plug and node hot-plug
are still very much work in progress, so I have not integrated them into
this framework yet.

> > +};
> > +
> > +struct shp_device {
> > +	struct list_head	list;
> > +	struct device		*device;
> 
> No, make it a "real" device, embed the device into it.

This device pointer is used to send KOBJ_ONLINE/OFFLINE events during a
CPU online/offline operation, in order to maintain the current behavior.
A CPU online/offline operation only changes the state of the CPU, so its
system/cpu device continues to be present before and after the operation.
(A CPU hot-add/delete operation, by contrast, creates or removes a
system/cpu device.)  So, this "*device" needs to be a pointer that
references an existing device that is to be on-lined/off-lined.
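
For reference, the notification then amounts to something like this
(illustrative helper, not code from the series):

static void example_notify_online(struct shp_device *shp_dev)
{
	/* shp_dev->device points at the existing system/cpu device */
	kobject_uevent(&shp_dev->device->kobj, KOBJ_ONLINE);
}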

> But, again, I'm going to ask why you aren't using the existing cpu /
> memory / bridge / node devices that we have in the kernel.  Please use
> them, or give me a _really_ good reason why they will not work.

We cannot use the existing system devices or ACPI devices here.  During
hot-plug, the ACPI handler sets this shp_device info so that the cpu and
memory handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their
target device information in a platform-neutral way.  During hot-add, we
first create an ACPI device node (i.e. a device under
/sys/bus/acpi/devices), but platform-neutral modules cannot use it, as it
is ACPI-specific.  Also, its system device (i.e. the device under
/sys/devices/system) is not created until the hot-add operation completes.
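
For example, a platform-neutral memory handler could consume that info
roughly as follows (a sketch under the above assumptions; the function
name is hypothetical, and add_memory() is the existing mm interface):

#include <linux/memory_hotplug.h>
#include <linux/sys_hotplug.h>

/* registered with shp_register_handler(SHP_ADD_EXECUTE,
 * example_memory_add_execute, SHP_MEM_ADD_EXECUTE_ORDER) */
static int example_memory_add_execute(struct shp_request *req, int rollback)
{
	struct shp_device *shp_dev;
	int ret;

	list_for_each_entry(shp_dev, &req->dev_list, list) {
		if (shp_dev->class != SHP_CLS_MEMORY)
			continue;
		/* platform-neutral info filled in by the ACPI handler */
		ret = add_memory(shp_dev->info.mem.node,
				 shp_dev->info.mem.start_addr,
				 shp_dev->info.mem.length);
		if (ret)
			return ret;
	}
	return 0;
}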

> > +	enum shp_class		class;
> > +	union shp_dev_info	info;
> > +};
> > +
> > +/*
> > + * Hot-plug request
> > + */
> > +struct shp_request {
> > +	/* common info */
> > +	enum shp_operation	operation;	/* operation */
> > +
> > +	/* hot-plug event info: only valid for hot-plug operations */
> > +	void			*handle;	/* FW handle */
> > +	u32			event;		/* FW event */
> 
> What is this?

The shp_request describes a requested hotplug or online/offline
operation.  In the case of a hot-plug request, "*handle" identifies the
target device (an ACPI device object) and "event" describes the type of
request, such as hot-add or hot-delete.

Thanks,
-Toshi
Greg Kroah-Hartman Jan. 31, 2013, 5:24 a.m. UTC | #11
On Wed, Jan 30, 2013 at 06:15:12PM -0700, Toshi Kani wrote:
> > Please make it a "real" pointer, and not a void *, those shouldn't be
> > used at all if possible.
> 
> How about changing the "void *handle" to acpi_dev_node below?   
> 
>    struct acpi_dev_node    acpi_node;
> 
> Basically, it has the same challenge as struct device, which uses
> acpi_dev_node as well.  We can add other FW node when needed (just like
> device also has *of_node).

That sounds good to me.
Toshi Kani Jan. 31, 2013, 2:42 p.m. UTC | #12
On Thu, 2013-01-31 at 05:24 +0000, Greg KH wrote:
> On Wed, Jan 30, 2013 at 06:15:12PM -0700, Toshi Kani wrote:
> > > Please make it a "real" pointer, and not a void *, those shouldn't be
> > > used at all if possible.
> > 
> > How about changing the "void *handle" to acpi_dev_node below?   
> > 
> >    struct acpi_dev_node    acpi_node;
> > 
> > Basically, it has the same challenge as struct device, which uses
> > acpi_dev_node as well.  We can add other FW node when needed (just like
> > device also has *of_node).
> 
> That sounds good to me.

Great!  Thanks Greg,
-Toshi
Rafael J. Wysocki Jan. 31, 2013, 8:54 p.m. UTC | #13
On Wednesday, January 30, 2013 07:57:45 PM Toshi Kani wrote:
> On Tue, 2013-01-29 at 23:58 -0500, Greg KH wrote:
> > On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote:
> > > +/*
> > > + * Hot-plug device information
> > > + */
> > 
> > Again, stop it with the "generic" hotplug term here, and everywhere
> > else.  You are doing a very _specific_ type of hotplug devices, so spell
> > it out.  We've worked hard to hotplug _everything_ in Linux, you are
> > going to confuse a lot of people with this type of terms.
> 
> Agreed.  I will clarify in all places.
> 
> > > +union shp_dev_info {
> > > +	struct shp_cpu {
> > > +		u32		cpu_id;
> > > +	} cpu;
> > 
> > What is this?  Why not point to the system device for the cpu?
> 
> This info is used to on-line a new CPU and create its system/cpu device.
> In other word, a system/cpu device is created as a result of CPU
> hotplug.
> 
> > > +	struct shp_memory {
> > > +		int		node;
> > > +		u64		start_addr;
> > > +		u64		length;
> > > +	} mem;
> > 
> > Same here, why not point to the system device?
> 
> Same as above.
> 
> > > +	struct shp_hostbridge {
> > > +	} hb;
> > > +
> > > +	struct shp_node {
> > > +	} node;
> > 
> > What happened here with these?  Empty structures?  Huh?
> 
> They are place holders for now.  PCI bridge hot-plug and node hot-plug
> are still very much work in progress, so I have not integrated them into
> this framework yet.
> 
> > > +};
> > > +
> > > +struct shp_device {
> > > +	struct list_head	list;
> > > +	struct device		*device;
> > 
> > No, make it a "real" device, embed the device into it.
> 
> This device pointer is used to send KOBJ_ONLINE/OFFLINE event during CPU
> online/offline operation in order to maintain the current behavior.  CPU
> online/offline operation only changes the state of CPU, so its
> system/cpu device continues to be present before and after an operation.
> (Whereas, CPU hot-add/delete operation creates or removes a system/cpu
> device.)  So, this "*device" needs to be a pointer to reference an
> existing device that is to be on-lined/off-lined.
> 
> > But, again, I'm going to ask why you aren't using the existing cpu /
> > memory / bridge / node devices that we have in the kernel.  Please use
> > them, or give me a _really_ good reason why they will not work.
> 
> We cannot use the existing system devices or ACPI devices here.  During
> hot-plug, ACPI handler sets this shp_device info, so that cpu and memory
> handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target
> device information in a platform-neutral way.  During hot-add, we first
> creates an ACPI device node (i.e. device under /sys/bus/acpi/devices),
> but platform-neutral modules cannot use them as they are ACPI-specific.

But suppose we're smart and have ACPI scan handlers that will create
"physical" device nodes for those devices during the ACPI namespace scan.
Then, the platform-neutral nodes will be able to bind to those "physical"
nodes.  Moreover, it should be possible to get a hierarchy of device objects
this way that will reflect all of the dependencies we need to take into
account during hot-add and hot-remove operations.  That may not be what we
have today, but I don't see any *fundamental* obstacles preventing us from
using this approach.

This is already done for PCI host bridges and platform devices and I don't
see why we can't do that for the other types of devices too.

The only missing piece I see is a way to handle the "eject" problem, i.e.
when we try to eject a device at the top of a subtree and need to tear down
the entire subtree below it, but if that's going to lead to a system crash,
for example, we want to cancel the eject.  It seems to me that we'll need some
help from the driver core here.

Thanks,
Rafael
Toshi Kani Feb. 1, 2013, 1:32 a.m. UTC | #14
On Thu, 2013-01-31 at 21:54 +0100, Rafael J. Wysocki wrote:
> On Wednesday, January 30, 2013 07:57:45 PM Toshi Kani wrote:
> > On Tue, 2013-01-29 at 23:58 -0500, Greg KH wrote:
> > > On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote:
 :
> > > > +};
> > > > +
> > > > +struct shp_device {
> > > > +	struct list_head	list;
> > > > +	struct device		*device;
> > > 
> > > No, make it a "real" device, embed the device into it.
> > 
> > This device pointer is used to send KOBJ_ONLINE/OFFLINE event during CPU
> > online/offline operation in order to maintain the current behavior.  CPU
> > online/offline operation only changes the state of CPU, so its
> > system/cpu device continues to be present before and after an operation.
> > (Whereas, CPU hot-add/delete operation creates or removes a system/cpu
> > device.)  So, this "*device" needs to be a pointer to reference an
> > existing device that is to be on-lined/off-lined.
> > 
> > > But, again, I'm going to ask why you aren't using the existing cpu /
> > > memory / bridge / node devices that we have in the kernel.  Please use
> > > them, or give me a _really_ good reason why they will not work.
> > 
> > We cannot use the existing system devices or ACPI devices here.  During
> > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory
> > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target
> > device information in a platform-neutral way.  During hot-add, we first
> > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices),
> > but platform-neutral modules cannot use them as they are ACPI-specific.
> 
> But suppose we're smart and have ACPI scan handlers that will create
> "physical" device nodes for those devices during the ACPI namespace scan.
> Then, the platform-neutral nodes will be able to bind to those "physical"
> nodes.  Moreover, it should be possible to get a hierarchy of device objects
> this way that will reflect all of the dependencies we need to take into
> account during hot-add and hot-remove operations.  That may not be what we
> have today, but I don't see any *fundamental* obstacles preventing us from
> using this approach.

I misstated in my previous email.  The system/cpu device is actually
created by the ACPI driver during the ACPI scan in the hot-add case.  This
is done by acpi_processor_hotadd_init(), which I consider a hack, but it
can be done.  The system/memory device is created in add_memory() by the
mm module.

> This is already done for PCI host bridges and platform devices and I don't
> see why we can't do that for the other types of devices too.
> 
> The only missing piece I see is a way to handle the "eject" problem, i.e.
> when we try do eject a device at the top of a subtree and need to tear down
> the entire subtree below it, but if that's going to lead to a system crash,
> for example, we want to cancel the eject.  It seems to me that we'll need some
> help from the driver core here.

There are three different approaches suggested for system device
hot-plug:
 A. Proceed within system device bus scan.
 B. Proceed within ACPI bus scan.
 C. Proceed with a sequence (as a mini-boot).

Option A uses system devices as tokens, option B uses acpi devices as
tokens, and option C uses resource tables as tokens, for their handlers.

Here is a summary of the key questions & answers so far.  I hope this
clarifies why I am suggesting option C.

1. What are the system devices?
System devices provide system-wide core computing resources, which are
essential to compose a computer system.  System devices are not
connected to any particular standard buses.

2. Why are the system devices special?
The system devices are initialized during early boot-time, by multiple
subsystems, from the boot-up sequence, in pre-defined order.  They
provide low-level services to enable other subsystems to come up.

3. Why can't the system devices be initialized from the driver structure
at boot?
The driver structure is initialized at the end of the boot sequence and
requires the low-level services from the system devices initialized
beforehand.

4. Why do we need a new common framework?
Sysfs CPU and memory on-lining/off-lining are performed within the CPU
and memory modules.  They are common code and do not depend on ACPI.
Therefore, a new common framework is necessary to integrate both the
on-lining/off-lining operations and the hot-plugging operations of system
devices into a single framework.

5. Why can't everything be done with an ACPI bus scan?
Software dependency among system devices may not be dictated by the ACPI
hierarchy.  For instance, memory should be initialized before CPUs (i.e.
a new cpu may need its local memory), but such ordering cannot be
guaranteed by the ACPI hierarchy.  Also, as described in 4,
online/offline operations are independent of ACPI.

Thanks,
-Toshi
Greg Kroah-Hartman Feb. 1, 2013, 7:23 a.m. UTC | #15
On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote:
> > > But, again, I'm going to ask why you aren't using the existing cpu /
> > > memory / bridge / node devices that we have in the kernel.  Please use
> > > them, or give me a _really_ good reason why they will not work.
> > 
> > We cannot use the existing system devices or ACPI devices here.  During
> > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory
> > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target
> > device information in a platform-neutral way.  During hot-add, we first
> > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices),
> > but platform-neutral modules cannot use them as they are ACPI-specific.
> 
> But suppose we're smart and have ACPI scan handlers that will create
> "physical" device nodes for those devices during the ACPI namespace scan.
> Then, the platform-neutral nodes will be able to bind to those "physical"
> nodes.  Moreover, it should be possible to get a hierarchy of device objects
> this way that will reflect all of the dependencies we need to take into
> account during hot-add and hot-remove operations.  That may not be what we
> have today, but I don't see any *fundamental* obstacles preventing us from
> using this approach.

I would _much_ rather see that be the solution here as I think it is the
proper one.

> This is already done for PCI host bridges and platform devices and I don't
> see why we can't do that for the other types of devices too.

I agree.

> The only missing piece I see is a way to handle the "eject" problem, i.e.
> when we try do eject a device at the top of a subtree and need to tear down
> the entire subtree below it, but if that's going to lead to a system crash,
> for example, we want to cancel the eject.  It seems to me that we'll need some
> help from the driver core here.

I say do what we always have done here, if the user asked us to tear
something down, let it happen as they are the ones that know best :)

Seriously, I guess this gets back to the "fail disconnect" idea that the
ACPI developers keep harping on.  I thought we already resolved this
properly by having them implement it in their bus code; no reason the
same thing couldn't happen here, right?  I don't think the core needs to
do anything special, but if so, I'll be glad to review it.

thanks,

greg k-h
Greg Kroah-Hartman Feb. 1, 2013, 7:30 a.m. UTC | #16
On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
 > This is already done for PCI host bridges and platform devices and I don't
> > see why we can't do that for the other types of devices too.
> > 
> > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > when we try do eject a device at the top of a subtree and need to tear down
> > the entire subtree below it, but if that's going to lead to a system crash,
> > for example, we want to cancel the eject.  It seems to me that we'll need some
> > help from the driver core here.
> 
> There are three different approaches suggested for system device
> hot-plug:
>  A. Proceed within system device bus scan.
>  B. Proceed within ACPI bus scan.
>  C. Proceed with a sequence (as a mini-boot).
> 
> Option A uses system devices as tokens, option B uses acpi devices as
> tokens, and option C uses resource tables as tokens, for their handlers.
> 
> Here is summary of key questions & answers so far.  I hope this
> clarifies why I am suggesting option 3.
> 
> 1. What are the system devices?
> System devices provide system-wide core computing resources, which are
> essential to compose a computer system.  System devices are not
> connected to any particular standard buses.

Not a problem, lots of devices are not connected to any "particular
standard busses".  All this means is that system devices are connected
to the "system" bus, nothing more.

> 2. Why are the system devices special?
> The system devices are initialized during early boot-time, by multiple
> subsystems, from the boot-up sequence, in pre-defined order.  They
> provide low-level services to enable other subsystems to come up.

Sorry, no, that doesn't mean they are special; nothing here is unique,
from the point of view of the driver model, compared with any other
device or bus.

> 3. Why can't initialize the system devices from the driver structure at
> boot?
> The driver structure is initialized at the end of the boot sequence and
> requires the low-level services from the system devices initialized
> beforehand.

Wait, what "driver structure"?  If you need to initialize the driver
core earlier, then do so.  Or, even better, just wait until enough of
the system has come up and then go initialize all of the devices you
have found so far as part of your boot process.

None of the above things you have stated seem to have anything to do
with your proposed patch, so I don't understand why you have mentioned
them...

> 4. Why do we need a new common framework?
> Sysfs CPU and memory on-lining/off-lining are performed within the CPU
> and memory modules.  They are common code and do not depend on ACPI.
> Therefore, a new common framework is necessary to integrate both
> on-lining/off-lining operation and hot-plugging operation of system
> devices into a single framework.

{sigh}

Removing and adding devices and handling hotplug operations is what the
driver core was written for, almost 10 years ago.  To somehow think that
your devices are "special" just because they don't use ACPI is odd,
because the driver core itself has nothing to do with ACPI.  Don't get
the current mix of x86 system code tied into ACPI confused with driver
core issues here, please.

> 5. Why can't do everything with ACPI bus scan?
> Software dependency among system devices may not be dictated by the ACPI
> hierarchy.  For instance, memory should be initialized before CPUs (i.e.
> a new cpu may need its local memory), but such ordering cannot be
> guaranteed by the ACPI hierarchy.  Also, as described in 4,
> online/offline operations are independent from ACPI.  

That's fine, the driver core is independent of ACPI.  I don't care how
you do the scanning of your devices, but I do care about you creating new
driver core pieces that duplicate the existing functionality of what we
have today.

In short, I like Rafael's proposal better, and I fail to see how
anything you have stated here would matter in how this is implemented. :)

thanks,

greg k-h
Toshi Kani Feb. 1, 2013, 8:40 p.m. UTC | #17
On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
>  > This is already done for PCI host bridges and platform devices and I don't
> > > see why we can't do that for the other types of devices too.
> > > 
> > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > when we try do eject a device at the top of a subtree and need to tear down
> > > the entire subtree below it, but if that's going to lead to a system crash,
> > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > help from the driver core here.
> > 
> > There are three different approaches suggested for system device
> > hot-plug:
> >  A. Proceed within system device bus scan.
> >  B. Proceed within ACPI bus scan.
> >  C. Proceed with a sequence (as a mini-boot).
> > 
> > Option A uses system devices as tokens, option B uses acpi devices as
> > tokens, and option C uses resource tables as tokens, for their handlers.
> > 
> > Here is summary of key questions & answers so far.  I hope this
> > clarifies why I am suggesting option 3.
> > 
> > 1. What are the system devices?
> > System devices provide system-wide core computing resources, which are
> > essential to compose a computer system.  System devices are not
> > connected to any particular standard buses.
> 
> Not a problem, lots of devices are not connected to any "particular
> standard busses".  All this means is that system devices are connected
> to the "system" bus, nothing more.

Can you give me a few examples of other devices that support hotplug and
are not connected to any particular buses?  I will investigate them to
see how they are managed to support hotplug.

> > 2. Why are the system devices special?
> > The system devices are initialized during early boot-time, by multiple
> > subsystems, from the boot-up sequence, in pre-defined order.  They
> > provide low-level services to enable other subsystems to come up.
> 
> Sorry, no, that doesn't mean they are special, nothing here is unique
> for the point of view of the driver model from any other device or bus.

I think system devices are unique in the sense that they are initialized
before drivers run.

> > 3. Why can't initialize the system devices from the driver structure at
> > boot?
> > The driver structure is initialized at the end of the boot sequence and
> > requires the low-level services from the system devices initialized
> > beforehand.
> 
> Wait, what "driver structure"?  

Sorry, it was not clear.  cpu_dev_init() and memory_dev_init() are called
from driver_init() at the end of the boot sequence, and they initialize
the system/cpu and system/memory devices.  I assume they are the system
bus you are referring to with option A.

> If you need to initialize the driver
> core earlier, then do so.  Or, even better, just wait until enough of
> the system has come up and then go initialize all of the devices you
> have found so far as part of your boot process.

They are pseudo drivers that provide sysfs entry points of cpu and
memory.  They do not actually initialize cpu and memory.  I do not think
initializing cpu and memory fits into the driver model either, since
drivers should run after cpu and memory are initialized.

> None of the above things you have stated seem to have anything to do
> with your proposed patch, so I don't understand why you have mentioned
> them...

You suggested option A before, which uses a system bus scan to initialize
all system devices at boot time as well as at hot-plug time.  I tried to
say that this option would not be doable.

> > 4. Why do we need a new common framework?
> > Sysfs CPU and memory on-lining/off-lining are performed within the CPU
> > and memory modules.  They are common code and do not depend on ACPI.
> > Therefore, a new common framework is necessary to integrate both
> > on-lining/off-lining operation and hot-plugging operation of system
> > devices into a single framework.
> 
> {sigh}
> 
> Removing and adding devices and handling hotplug operations is what the
> driver core was written for, almost 10 years ago.  To somehow think that
> your devices are "special" just because they don't use ACPI is odd,
> because the driver core itself has nothing to do with ACPI.  Don't get
> the current mix of x86 system code tied into ACPI confused with an
> driver core issues here please.

CPU online/offline operation is performed within the CPU module.  Memory
online/offline operation is performed within the memory module.  CPU and
memory hotplug operations are performed within ACPI.  While they deal
with the same set of devices, they operate independently and are not
managed under a same framework.

I agree with you that not using ACPI is perfectly fine.  My point is
that ACPI framework won't be able to manage operations that do not use
ACPI.

> > 5. Why can't do everything with ACPI bus scan?
> > Software dependency among system devices may not be dictated by the ACPI
> > hierarchy.  For instance, memory should be initialized before CPUs (i.e.
> > a new cpu may need its local memory), but such ordering cannot be
> > guaranteed by the ACPI hierarchy.  Also, as described in 4,
> > online/offline operations are independent from ACPI.  
> 
> That's fine, the driver core is independant from ACPI.  I don't care how
> you do the scaning of your devices, but I do care about you creating new
> driver core pieces that duplicate the existing functionality of what we
> have today.
>
> In short, I like Rafael's proposal better, and I fail to see how
> anything you have stated here would matter in how this is implemented. :)

Doing everything within ACPI means we can only manage ACPI hotplug
operations, not online/offline operations.  But I understand your concern
about adding a new framework with option C.  It is good to know that you
are fine with option B. :)  So, I will step back and think about what we
can do within ACPI.

Thanks,
-Toshi
Rafael J. Wysocki Feb. 1, 2013, 10:12 p.m. UTC | #18
On Friday, February 01, 2013 08:23:12 AM Greg KH wrote:
> On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote:
> > > > But, again, I'm going to ask why you aren't using the existing cpu /
> > > > memory / bridge / node devices that we have in the kernel.  Please use
> > > > them, or give me a _really_ good reason why they will not work.
> > > 
> > > We cannot use the existing system devices or ACPI devices here.  During
> > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory
> > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target
> > > device information in a platform-neutral way.  During hot-add, we first
> > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices),
> > > but platform-neutral modules cannot use them as they are ACPI-specific.
> > 
> > But suppose we're smart and have ACPI scan handlers that will create
> > "physical" device nodes for those devices during the ACPI namespace scan.
> > Then, the platform-neutral nodes will be able to bind to those "physical"
> > nodes.  Moreover, it should be possible to get a hierarchy of device objects
> > this way that will reflect all of the dependencies we need to take into
> > account during hot-add and hot-remove operations.  That may not be what we
> > have today, but I don't see any *fundamental* obstacles preventing us from
> > using this approach.
> 
> I would _much_ rather see that be the solution here as I think it is the
> proper one.
> 
> > This is already done for PCI host bridges and platform devices and I don't
> > see why we can't do that for the other types of devices too.
> 
> I agree.
> 
> > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > when we try do eject a device at the top of a subtree and need to tear down
> > the entire subtree below it, but if that's going to lead to a system crash,
> > for example, we want to cancel the eject.  It seems to me that we'll need some
> > help from the driver core here.
> 
> I say do what we always have done here, if the user asked us to tear
> something down, let it happen as they are the ones that know best :)
> 
> Seriously, I guess this gets back to the "fail disconnect" idea that the
> ACPI developers keep harping on.  I thought we already resolved this
> properly by having them implement it in their bus code, no reason the
> same thing couldn't happen here, right?

Not really. :-)  We haven't ever resolved that particular issue I'm afraid.

> I don't think the core needs to do anything special, but if so, I'll be glad
> to review it.

OK, so this is the use case.  We have "eject" defined for something like
a container with a number of CPU cores, PCI host bridge, and a memory
controller under it.  And a few pretty much arbitrary I/O devices as a bonus.

Now, there's a button on the system case labeled as "Eject" and if that button
is pressed, we're supposed to _try_ to eject all of those things at once.  We
are allowed to fail that request, though, if that's problematic for some
reason, but we're supposed to let the BIOS know about that.

Do you seriously think that if that button is pressed, we should just proceed
with removing all that stuff no matter what?  That'd be kind of like Russian
roulette for whoever pressed that button, because s/he could only press it and
wait for the system to either crash or not.  Or maybe to crash a bit later
because of some delayed stuff that would hit one of those devices that had just
gone.  Surely not a situation any admin of a high-availability system would
like to be in. :-)

Quite frankly, I have no idea how that can be addressed in a single bus type,
let alone ACPI (which is not even a proper bus type, just something pretending
to be one).

Thanks,
Rafael
Rafael J. Wysocki Feb. 1, 2013, 10:21 p.m. UTC | #19
On Friday, February 01, 2013 01:40:10 PM Toshi Kani wrote:
> On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
> >  > This is already done for PCI host bridges and platform devices and I don't
> > > > see why we can't do that for the other types of devices too.
> > > > 
> > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > help from the driver core here.
> > > 
> > > There are three different approaches suggested for system device
> > > hot-plug:
> > >  A. Proceed within system device bus scan.
> > >  B. Proceed within ACPI bus scan.
> > >  C. Proceed with a sequence (as a mini-boot).
> > > 
> > > Option A uses system devices as tokens, option B uses acpi devices as
> > > tokens, and option C uses resource tables as tokens, for their handlers.
> > > 
> > > Here is summary of key questions & answers so far.  I hope this
> > > clarifies why I am suggesting option 3.
> > > 
> > > 1. What are the system devices?
> > > System devices provide system-wide core computing resources, which are
> > > essential to compose a computer system.  System devices are not
> > > connected to any particular standard buses.
> > 
> > Not a problem, lots of devices are not connected to any "particular
> > standard busses".  All this means is that system devices are connected
> > to the "system" bus, nothing more.
> 
> Can you give me a few examples of other devices that support hotplug and
> are not connected to any particular buses?  I will investigate them to
> see how they are managed to support hotplug.
> 
> > > 2. Why are the system devices special?
> > > The system devices are initialized during early boot-time, by multiple
> > > subsystems, from the boot-up sequence, in pre-defined order.  They
> > > provide low-level services to enable other subsystems to come up.
> > 
> > Sorry, no, that doesn't mean they are special, nothing here is unique
> > for the point of view of the driver model from any other device or bus.
> 
> I think system devices are unique in a sense that they are initialized
> before drivers run.
> 
> > > 3. Why can't initialize the system devices from the driver structure at
> > > boot?
> > > The driver structure is initialized at the end of the boot sequence and
> > > requires the low-level services from the system devices initialized
> > > beforehand.
> > 
> > Wait, what "driver structure"?  
> 
> Sorry it was not clear.  cpu_dev_init() and memory_dev_init() are called
> from driver_init() at the end of the boot sequence, and initialize
> system/cpu and system/memory devices.  I assume they are the system bus
> you are referring with option A.
> 
> > If you need to initialize the driver
> > core earlier, then do so.  Or, even better, just wait until enough of
> > the system has come up and then go initialize all of the devices you
> > have found so far as part of your boot process.
> 
> They are pseudo drivers that provide sysfs entry points of cpu and
> memory.  They do not actually initialize cpu and memory.  I do not think
> initializing cpu and memory fits into the driver model either, since
> drivers should run after cpu and memory are initialized.
> 
> > None of the above things you have stated seem to have anything to do
> > with your proposed patch, so I don't understand why you have mentioned
> > them...
> 
> You suggested option A before, which uses system bus scan to initialize
> all system devices at boot time as well as hot-plug.  I tried to say
> that this option would not be doable.
> 
> > > 4. Why do we need a new common framework?
> > > Sysfs CPU and memory on-lining/off-lining are performed within the CPU
> > > and memory modules.  They are common code and do not depend on ACPI.
> > > Therefore, a new common framework is necessary to integrate both
> > > on-lining/off-lining operation and hot-plugging operation of system
> > > devices into a single framework.
> > 
> > {sigh}
> > 
> > Removing and adding devices and handling hotplug operations is what the
> > driver core was written for, almost 10 years ago.  To somehow think that
> > your devices are "special" just because they don't use ACPI is odd,
> > because the driver core itself has nothing to do with ACPI.  Don't get
> > the current mix of x86 system code tied into ACPI confused with an
> > driver core issues here please.
> 
> CPU online/offline operation is performed within the CPU module.  Memory
> online/offline operation is performed within the memory module.  CPU and
> memory hotplug operations are performed within ACPI.  While they deal
> with the same set of devices, they operate independently and are not
> managed under a same framework.
> 
> I agree with you that not using ACPI is perfectly fine.  My point is
> that ACPI framework won't be able to manage operations that do not use
> ACPI.
> 
> > > 5. Why can't do everything with ACPI bus scan?
> > > Software dependency among system devices may not be dictated by the ACPI
> > > hierarchy.  For instance, memory should be initialized before CPUs (i.e.
> > > a new cpu may need its local memory), but such ordering cannot be
> > > guaranteed by the ACPI hierarchy.  Also, as described in 4,
> > > online/offline operations are independent from ACPI.  
> > 
> > That's fine, the driver core is independant from ACPI.  I don't care how
> > you do the scaning of your devices, but I do care about you creating new
> > driver core pieces that duplicate the existing functionality of what we
> > have today.
> >
> > In short, I like Rafael's proposal better, and I fail to see how
> > anything you have stated here would matter in how this is implemented. :)
> 
> Doing everything within ACPI means we can only manage ACPI hotplug
> operations, not online/offline operations.  But I understand that you
> concern about adding a new framework with option C.  It is good to know
> that you are fine with option B. :)  So, I will step back, and think
> about what we can do within ACPI.

Not much, because ACPI only knows about a subset of devices that may be
involved in that, and a limited one for that matter.  For one example,
anything connected through PCI and not having a corresponding ACPI object (i.e.
pretty much every add-in card in existence) will be unknown to ACPI.  And
say one of these things is a SATA controller with a number of disks under it
and so on.  ACPI won't even know that it exists.  Moreover, PCI won't know
that those disks exist.  Etc.

Thanks,
Rafael
Toshi Kani Feb. 1, 2013, 11:12 p.m. UTC | #20
On Fri, 2013-02-01 at 23:21 +0100, Rafael J. Wysocki wrote:
> On Friday, February 01, 2013 01:40:10 PM Toshi Kani wrote:
> > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
> > >  > This is already done for PCI host bridges and platform devices and I don't
> > > > > see why we can't do that for the other types of devices too.
> > > > > 
> > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > help from the driver core here.
> > > > 
> > > > There are three different approaches suggested for system device
> > > > hot-plug:
> > > >  A. Proceed within system device bus scan.
> > > >  B. Proceed within ACPI bus scan.
> > > >  C. Proceed with a sequence (as a mini-boot).
> > > > 
> > > > Option A uses system devices as tokens, option B uses acpi devices as
> > > > tokens, and option C uses resource tables as tokens, for their handlers.
> > > > 
> > > > Here is summary of key questions & answers so far.  I hope this
> > > > clarifies why I am suggesting option 3.
> > > > 
> > > > 1. What are the system devices?
> > > > System devices provide system-wide core computing resources, which are
> > > > essential to compose a computer system.  System devices are not
> > > > connected to any particular standard buses.
> > > 
> > > Not a problem, lots of devices are not connected to any "particular
> > > standard busses".  All this means is that system devices are connected
> > > to the "system" bus, nothing more.
> > 
> > Can you give me a few examples of other devices that support hotplug and
> > are not connected to any particular buses?  I will investigate them to
> > see how they are managed to support hotplug.
> > 
> > > > 2. Why are the system devices special?
> > > > The system devices are initialized during early boot-time, by multiple
> > > > subsystems, from the boot-up sequence, in pre-defined order.  They
> > > > provide low-level services to enable other subsystems to come up.
> > > 
> > > Sorry, no, that doesn't mean they are special, nothing here is unique
> > > for the point of view of the driver model from any other device or bus.
> > 
> > I think system devices are unique in a sense that they are initialized
> > before drivers run.
> > 
> > > > 3. Why can't initialize the system devices from the driver structure at
> > > > boot?
> > > > The driver structure is initialized at the end of the boot sequence and
> > > > requires the low-level services from the system devices initialized
> > > > beforehand.
> > > 
> > > Wait, what "driver structure"?  
> > 
> > Sorry it was not clear.  cpu_dev_init() and memory_dev_init() are called
> > from driver_init() at the end of the boot sequence, and initialize
> > system/cpu and system/memory devices.  I assume they are the system bus
> > you are referring with option A.
> > 
> > > If you need to initialize the driver
> > > core earlier, then do so.  Or, even better, just wait until enough of
> > > the system has come up and then go initialize all of the devices you
> > > have found so far as part of your boot process.
> > 
> > They are pseudo drivers that provide sysfs entry points of cpu and
> > memory.  They do not actually initialize cpu and memory.  I do not think
> > initializing cpu and memory fits into the driver model either, since
> > drivers should run after cpu and memory are initialized.
> > 
> > > None of the above things you have stated seem to have anything to do
> > > with your proposed patch, so I don't understand why you have mentioned
> > > them...
> > 
> > You suggested option A before, which uses system bus scan to initialize
> > all system devices at boot time as well as hot-plug.  I tried to say
> > that this option would not be doable.
> > 
> > > > 4. Why do we need a new common framework?
> > > > Sysfs CPU and memory on-lining/off-lining are performed within the CPU
> > > > and memory modules.  They are common code and do not depend on ACPI.
> > > > Therefore, a new common framework is necessary to integrate both
> > > > on-lining/off-lining operation and hot-plugging operation of system
> > > > devices into a single framework.
> > > 
> > > {sigh}
> > > 
> > > Removing and adding devices and handling hotplug operations is what the
> > > driver core was written for, almost 10 years ago.  To somehow think that
> > > your devices are "special" just because they don't use ACPI is odd,
> > > because the driver core itself has nothing to do with ACPI.  Don't get
> > > the current mix of x86 system code tied into ACPI confused with an
> > > driver core issues here please.
> > 
> > CPU online/offline operation is performed within the CPU module.  Memory
> > online/offline operation is performed within the memory module.  CPU and
> > memory hotplug operations are performed within ACPI.  While they deal
> > with the same set of devices, they operate independently and are not
> > managed under a same framework.
> > 
> > I agree with you that not using ACPI is perfectly fine.  My point is
> > that ACPI framework won't be able to manage operations that do not use
> > ACPI.
> > 
> > > > 5. Why can't do everything with ACPI bus scan?
> > > > Software dependency among system devices may not be dictated by the ACPI
> > > > hierarchy.  For instance, memory should be initialized before CPUs (i.e.
> > > > a new cpu may need its local memory), but such ordering cannot be
> > > > guaranteed by the ACPI hierarchy.  Also, as described in 4,
> > > > online/offline operations are independent from ACPI.  
> > > 
> > > That's fine, the driver core is independant from ACPI.  I don't care how
> > > you do the scaning of your devices, but I do care about you creating new
> > > driver core pieces that duplicate the existing functionality of what we
> > > have today.
> > >
> > > In short, I like Rafael's proposal better, and I fail to see how
> > > anything you have stated here would matter in how this is implemented. :)
> > 
> > Doing everything within ACPI means we can only manage ACPI hotplug
> > operations, not online/offline operations.  But I understand that you
> > concern about adding a new framework with option C.  It is good to know
> > that you are fine with option B. :)  So, I will step back, and think
> > about what we can do within ACPI.
> 
> Not much, because ACPI only knows about a subset of devices that may be
> involved in that, and a limited one for that matter.  For one example,
> anything connected through PCI and not having a corresponding ACPI object (i.e.
> pretty much every add-in card in existence) will be unknown to ACPI.  And
> say one of these things is a SATA controller with a number of disks under it
> and so on.  ACPI won't even know that it exists.  Moreover, PCI won't know
> that those disks exist.  Etc.

Agreed.  Thanks for bringing I/Os into the picture.  I did not mention
them since they are not supported in this patchset, but we certainly
need to take them into account in the design.

Thanks,
-Toshi
Greg Kroah-Hartman Feb. 2, 2013, 2:58 p.m. UTC | #21
On Fri, Feb 01, 2013 at 11:12:59PM +0100, Rafael J. Wysocki wrote:
> On Friday, February 01, 2013 08:23:12 AM Greg KH wrote:
> > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote:
> > > > > But, again, I'm going to ask why you aren't using the existing cpu /
> > > > > memory / bridge / node devices that we have in the kernel.  Please use
> > > > > them, or give me a _really_ good reason why they will not work.
> > > > 
> > > > We cannot use the existing system devices or ACPI devices here.  During
> > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory
> > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target
> > > > device information in a platform-neutral way.  During hot-add, we first
> > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices),
> > > > but platform-neutral modules cannot use them as they are ACPI-specific.
> > > 
> > > But suppose we're smart and have ACPI scan handlers that will create
> > > "physical" device nodes for those devices during the ACPI namespace scan.
> > > Then, the platform-neutral nodes will be able to bind to those "physical"
> > > nodes.  Moreover, it should be possible to get a hierarchy of device objects
> > > this way that will reflect all of the dependencies we need to take into
> > > account during hot-add and hot-remove operations.  That may not be what we
> > > have today, but I don't see any *fundamental* obstacles preventing us from
> > > using this approach.
> > 
> > I would _much_ rather see that be the solution here as I think it is the
> > proper one.
> > 
> > > This is already done for PCI host bridges and platform devices and I don't
> > > see why we can't do that for the other types of devices too.
> > 
> > I agree.
> > 
> > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > when we try do eject a device at the top of a subtree and need to tear down
> > > the entire subtree below it, but if that's going to lead to a system crash,
> > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > help from the driver core here.
> > 
> > I say do what we always have done here, if the user asked us to tear
> > something down, let it happen as they are the ones that know best :)
> > 
> > Seriously, I guess this gets back to the "fail disconnect" idea that the
> > ACPI developers keep harping on.  I thought we already resolved this
> > properly by having them implement it in their bus code, no reason the
> > same thing couldn't happen here, right?
> 
> Not really. :-)  We haven't ever resolved that particular issue I'm afraid.

Ah, I didn't realize that.

> > I don't think the core needs to do anything special, but if so, I'll be glad
> > to review it.
> 
> OK, so this is the use case.  We have "eject" defined for something like
> a container with a number of CPU cores, PCI host bridge, and a memory
> controller under it.  And a few pretty much arbitrary I/O devices as a bonus.
> 
> Now, there's a button on the system case labeled as "Eject" and if that button
> is pressed, we're supposed to _try_ to eject all of those things at once.  We
> are allowed to fail that request, though, if that's problematic for some
> reason, but we're supposed to let the BIOS know about that.
> 
> Do you seriously think that if that button is pressed, we should just proceed
> with removing all that stuff no matter what?  That'd be kind of like Russian
> roulette for whoever pressed that button, because s/he could only press it and
> wait for the system to either crash or not.  Or maybe to crash a bit later
> because of some delayed stuff that would hit one of those devices that had just
> gone.  Surely not a situation any admin of a high-availability system would
> like to be in. :-)
> 
> Quite frankly, I have no idea how that can be addressed in a single bus type,
> let alone ACPI (which is not even a proper bus type, just something pretending
> to be one).

You don't have it as a single bus type; you have a controller somewhere,
off of the bus being destroyed, that handles sending remove events to
the device and tearing everything down.  PCI has done this from the
very beginning.

I know it's more complicated with these types of devices, and I think we
are getting closer to the correct solution, I just don't want to ever
see duplicate devices in the driver model for the same physical device.

thanks,

greg k-h
Greg Kroah-Hartman Feb. 2, 2013, 3:01 p.m. UTC | #22
On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote:
> On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
> >  > This is already done for PCI host bridges and platform devices and I don't
> > > > see why we can't do that for the other types of devices too.
> > > > 
> > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > help from the driver core here.
> > > 
> > > There are three different approaches suggested for system device
> > > hot-plug:
> > >  A. Proceed within system device bus scan.
> > >  B. Proceed within ACPI bus scan.
> > >  C. Proceed with a sequence (as a mini-boot).
> > > 
> > > Option A uses system devices as tokens, option B uses acpi devices as
> > > tokens, and option C uses resource tables as tokens, for their handlers.
> > > 
> > > Here is summary of key questions & answers so far.  I hope this
> > > clarifies why I am suggesting option 3.
> > > 
> > > 1. What are the system devices?
> > > System devices provide system-wide core computing resources, which are
> > > essential to compose a computer system.  System devices are not
> > > connected to any particular standard buses.
> > 
> > Not a problem, lots of devices are not connected to any "particular
> > standard busses".  All this means is that system devices are connected
> > to the "system" bus, nothing more.
> 
> Can you give me a few examples of other devices that support hotplug and
> are not connected to any particular buses?  I will investigate them to
> see how they are managed to support hotplug.

Any device that is attached to any bus in the driver model can be
hot-unplugged from userspace by telling it to be "unbound" from the
driver controlling it.  Try it for any platform device in your system
to see how it happens.

> > > 2. Why are the system devices special?
> > > The system devices are initialized during early boot-time, by multiple
> > > subsystems, from the boot-up sequence, in pre-defined order.  They
> > > provide low-level services to enable other subsystems to come up.
> > 
> > Sorry, no, that doesn't mean they are special, nothing here is unique
> > for the point of view of the driver model from any other device or bus.
> 
> I think system devices are unique in a sense that they are initialized
> before drivers run.

No, almost all devices are "initialized" before a driver runs on them;
USB is one such example, PCI another, and I'm pretty sure that there
are others.

> > If you need to initialize the driver
> > core earlier, then do so.  Or, even better, just wait until enough of
> > the system has come up and then go initialize all of the devices you
> > have found so far as part of your boot process.
> 
> They are pseudo drivers that provide sysfs entry points of cpu and
> memory.  They do not actually initialize cpu and memory.  I do not think
> initializing cpu and memory fits into the driver model either, since
> drivers should run after cpu and memory are initialized.

We already represent CPUs in the sysfs tree, don't represent them in two
different places with two different structures.  Use the existing ones
please.

> > None of the above things you have stated seem to have anything to do
> > with your proposed patch, so I don't understand why you have mentioned
> > them...
> 
> You suggested option A before, which uses system bus scan to initialize
> all system devices at boot time as well as hot-plug.  I tried to say
> that this option would not be doable.

I haven't yet been convinced otherwise, sorry.  Please prove me wrong :)

thanks,

greg k-h
Rafael J. Wysocki Feb. 2, 2013, 8:15 p.m. UTC | #23
On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote:
> On Fri, Feb 01, 2013 at 11:12:59PM +0100, Rafael J. Wysocki wrote:
> > On Friday, February 01, 2013 08:23:12 AM Greg KH wrote:
> > > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote:
> > > > > > But, again, I'm going to ask why you aren't using the existing cpu /
> > > > > > memory / bridge / node devices that we have in the kernel.  Please use
> > > > > > them, or give me a _really_ good reason why they will not work.
> > > > > 
> > > > > We cannot use the existing system devices or ACPI devices here.  During
> > > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory
> > > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target
> > > > > device information in a platform-neutral way.  During hot-add, we first
> > > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices),
> > > > > but platform-neutral modules cannot use them as they are ACPI-specific.
> > > > 
> > > > But suppose we're smart and have ACPI scan handlers that will create
> > > > "physical" device nodes for those devices during the ACPI namespace scan.
> > > > Then, the platform-neutral nodes will be able to bind to those "physical"
> > > > nodes.  Moreover, it should be possible to get a hierarchy of device objects
> > > > this way that will reflect all of the dependencies we need to take into
> > > > account during hot-add and hot-remove operations.  That may not be what we
> > > > have today, but I don't see any *fundamental* obstacles preventing us from
> > > > using this approach.
> > > 
> > > I would _much_ rather see that be the solution here as I think it is the
> > > proper one.
> > > 
> > > > This is already done for PCI host bridges and platform devices and I don't
> > > > see why we can't do that for the other types of devices too.
> > > 
> > > I agree.
> > > 
> > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > help from the driver core here.
> > > 
> > > I say do what we always have done here, if the user asked us to tear
> > > something down, let it happen as they are the ones that know best :)
> > > 
> > > Seriously, I guess this gets back to the "fail disconnect" idea that the
> > > ACPI developers keep harping on.  I thought we already resolved this
> > > properly by having them implement it in their bus code, no reason the
> > > same thing couldn't happen here, right?
> > 
> > Not really. :-)  We haven't ever resolved that particular issue I'm afraid.
> 
> Ah, I didn't realize that.
> 
> > > I don't think the core needs to do anything special, but if so, I'll be glad
> > > to review it.
> > 
> > OK, so this is the use case.  We have "eject" defined for something like
> > a container with a number of CPU cores, PCI host bridge, and a memory
> > controller under it.  And a few pretty much arbitrary I/O devices as a bonus.
> > 
> > Now, there's a button on the system case labeled as "Eject" and if that button
> > is pressed, we're supposed to _try_ to eject all of those things at once.  We
> > are allowed to fail that request, though, if that's problematic for some
> > reason, but we're supposed to let the BIOS know about that.
> > 
> > Do you seriously think that if that button is pressed, we should just proceed
> > with removing all that stuff no matter what?  That'd be kind of like Russian
> > roulette for whoever pressed that button, because s/he could only press it and
> > wait for the system to either crash or not.  Or maybe to crash a bit later
> > because of some delayed stuff that would hit one of those devices that had just
> > gone.  Surely not a situation any admin of a high-availability system would
> > like to be in. :-)
> > 
> > Quite frankly, I have no idea how that can be addressed in a single bus type,
> > let alone ACPI (which is not even a proper bus type, just something pretending
> > to be one).
> 
> You don't have it as a single bus type, you have a controller somewhere,
> off of the bus being destroyed, that handles sending remove events to
> the device and tearing everything down.  PCI does this from the very
> beginning.

Yes, but those are just remove events and we can only see how destructive they
were after the removal.  The point is to be able to figure out whether or not
we *want* to do the removal in the first place.

Say you have a computing node which signals a hardware problem in a processor
package (the container with CPU cores, memory, PCI host bridge etc.).  You
may want to eject that package, but you don't want to kill the system this
way.  So if the eject is doable, it is very much desirable to do it, but if it
is not doable, you'd rather shut the box down and do the replacement afterward.
That may be costly, however (maybe weeks of computations), so it should be
avoided if possible, but not at the expense of crashing the box if the eject
doesn't work out.

> I know it's more complicated with these types of devices, and I think we
> are getting closer to the correct solution, I just don't want to ever
> see duplicate devices in the driver model for the same physical device.

Do you mean two things based on struct device for the same hardware component?
That's been happening already pretty much forever for every PCI device known
to the ACPI layer, for PNP and many others.  However, those ACPI things are (or
rather should be, but we're going to clean that up) only for convenience (to be
able to see the namespace structure and related things in sysfs).  So the stuff
under /sys/devices/LNXSYSTM\:00/ is not "real".  In my view it shouldn't even
be under /sys/devices/ (/sys/firmware/acpi/ seems to be a better place for it),
but that may be difficult to change without breaking user space (maybe we can
just symlink it from /sys/devices/ or something).  And the ACPI bus type
shouldn't even exist in my opinion.

There's much confusion in there and much work to clean that up, I agree, but
that's kind of separate from the hotplug thing.

Thanks,
Rafael
Rafael J. Wysocki Feb. 3, 2013, 8:44 p.m. UTC | #24
On Saturday, February 02, 2013 09:15:37 PM Rafael J. Wysocki wrote:
> On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote:
> > On Fri, Feb 01, 2013 at 11:12:59PM +0100, Rafael J. Wysocki wrote:
> > > On Friday, February 01, 2013 08:23:12 AM Greg KH wrote:
> > > > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote:
> > > > > > > But, again, I'm going to ask why you aren't using the existing cpu /
> > > > > > > memory / bridge / node devices that we have in the kernel.  Please use
> > > > > > > them, or give me a _really_ good reason why they will not work.
> > > > > > 
> > > > > > We cannot use the existing system devices or ACPI devices here.  During
> > > > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory
> > > > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target
> > > > > > device information in a platform-neutral way.  During hot-add, we first
> > > > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices),
> > > > > > but platform-neutral modules cannot use them as they are ACPI-specific.
> > > > > 
> > > > > But suppose we're smart and have ACPI scan handlers that will create
> > > > > "physical" device nodes for those devices during the ACPI namespace scan.
> > > > > Then, the platform-neutral nodes will be able to bind to those "physical"
> > > > > nodes.  Moreover, it should be possible to get a hierarchy of device objects
> > > > > this way that will reflect all of the dependencies we need to take into
> > > > > account during hot-add and hot-remove operations.  That may not be what we
> > > > > have today, but I don't see any *fundamental* obstacles preventing us from
> > > > > using this approach.
> > > > 
> > > > I would _much_ rather see that be the solution here as I think it is the
> > > > proper one.
> > > > 
> > > > > This is already done for PCI host bridges and platform devices and I don't
> > > > > see why we can't do that for the other types of devices too.
> > > > 
> > > > I agree.
> > > > 
> > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > help from the driver core here.
> > > > 
> > > > I say do what we always have done here, if the user asked us to tear
> > > > something down, let it happen as they are the ones that know best :)
> > > > 
> > > > Seriously, I guess this gets back to the "fail disconnect" idea that the
> > > > ACPI developers keep harping on.  I thought we already resolved this
> > > > properly by having them implement it in their bus code, no reason the
> > > > same thing couldn't happen here, right?
> > > 
> > > Not really. :-)  We haven't ever resolved that particular issue I'm afraid.
> > 
> > Ah, I didn't realize that.
> > 
> > > > I don't think the core needs to do anything special, but if so, I'll be glad
> > > > to review it.
> > > 
> > > OK, so this is the use case.  We have "eject" defined for something like
> > > a container with a number of CPU cores, PCI host bridge, and a memory
> > > controller under it.  And a few pretty much arbitrary I/O devices as a bonus.
> > > 
> > > Now, there's a button on the system case labeled as "Eject" and if that button
> > > is pressed, we're supposed to _try_ to eject all of those things at once.  We
> > > are allowed to fail that request, though, if that's problematic for some
> > > reason, but we're supposed to let the BIOS know about that.
> > > 
> > > Do you seriously think that if that button is pressed, we should just proceed
> > > with removing all that stuff no matter what?  That'd be kind of like Russian
> > > roulette for whoever pressed that button, because s/he could only press it and
> > > wait for the system to either crash or not.  Or maybe to crash a bit later
> > > because of some delayed stuff that would hit one of those devices that had just
> > > gone.  Surely not a situation any admin of a high-availability system would
> > > like to be in. :-)
> > > 
> > > Quite frankly, I have no idea how that can be addressed in a single bus type,
> > > let alone ACPI (which is not even a proper bus type, just something pretending
> > > to be one).
> > 
> > You don't have it as a single bus type, you have a controller somewhere,
> > off of the bus being destroyed, that handles sending remove events to
> > the device and tearing everything down.  PCI does this from the very
> > beginning.
> 
> Yes, but those are just remove events and we can only see how destructive they
> were after the removal.  The point is to be able to figure out whether or not
> we *want* to do the removal in the first place.
> 
> Say you have a computing node which signals a hardware problem in a processor
> package (the container with CPU cores, memory, PCI host bridge etc.).  You
> may want to eject that package, but you don't want to kill the system this
> way.  So if the eject is doable, it is very much desirable to do it, but if it
> is not doable, you'd rather shut the box down and do the replacement afterward.
> That may be costly, however (maybe weeks of computations), so it should be
> avoided if possible, but not at the expense of crashing the box if the eject
> doesn't work out.

It seems to me that we could handle that with the help of a new flag, say
"no_eject", in struct device, a global mutex, and a function that will walk
the given subtree of the device hierarchy and check if "no_eject" is set for
any devices in there.  Plus a global "no_eject" switch, perhaps.

To be more precise, suppose we have a "no_eject" flag that is set for a device
when the kernel is about to start using it in such a way that ejecting it would
lead to serious trouble (i.e. it is a memory module holding the kernel's page
tables or something like that).  Next, suppose that we have a function called,
say, "device_may_eject()" that will walk the subtree of the device hierarchy
starting at the given node and return false if any of the devices in there have
"no_eject" set.  Further, require that device_may_eject() has to be called
under a global mutex called something like "device_eject_lock".  Then, we can
arrange things as follows:

1. When a device is about to be used for such purposes that it shouldn't be
   ejected, the relevant code should:
  (a) Acquire device_eject_lock.
  (b) Check if the device is still there and go to (e) if not.
  (c) Do whatever needs to be done to the device.
  (d) Set the device's no_eject flag.
  (e) Release device_eject_lock.

2. When an eject operation is about to be carried out on a subtree of the device
   hierarchy, the eject code (be it ACPI or something else) should:
  (a) Acquire device_eject_lock.
  (b) Call device_may_eject() on the starting device and go to (d) if it
      returns false.
  (c) Carry out the eject (that includes calling .remove() from all of the
      involved drivers in particular).
  (d) Release device_eject_lock.

3. When it is OK to eject the device again, the relevant code should just clear
   its "no_eject" flag under device_eject_lock.

If we want to synchronize that with such things like boot or system suspend,
a global "no_eject" switch can be used for that (it needs to be manipulated
under device_eject_lock) and one more step (check the global "no_eject") is
needed between 2(a) and 2(b).
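
As a rough illustration of the scheme described above (the no_eject
field, device_may_eject() and device_eject_lock are assumed additions
for this sketch, not existing driver core API, and eject_subtree() is a
placeholder for the bus-specific tear-down), the helpers could look
roughly like this:

#include <linux/device.h>
#include <linux/mutex.h>

/* Global lock serializing "no_eject" updates and eject attempts (assumed). */
static DEFINE_MUTEX(device_eject_lock);

/* Callback for device_for_each_child(); a non-zero return stops the walk. */
static int __device_check_no_eject(struct device *dev, void *data)
{
	if (dev->no_eject)	/* assumed new flag in struct device */
		return -EBUSY;
	return device_for_each_child(dev, NULL, __device_check_no_eject);
}

/*
 * Return true if the subtree starting at @dev may be ejected.
 * Must be called with device_eject_lock held.
 */
static bool device_may_eject(struct device *dev)
{
	return __device_check_no_eject(dev, NULL) == 0;
}

/* Step 2 above, in outline: */
static int try_eject_subtree(struct device *dev)
{
	int ret = -EBUSY;

	mutex_lock(&device_eject_lock);
	if (device_may_eject(dev))
		ret = eject_subtree(dev);	/* hypothetical tear-down */
	mutex_unlock(&device_eject_lock);
	return ret;
}

Code that wants to pin a device (step 1 above) would set dev->no_eject
under the same lock, and clear it again when it is done (step 3).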

Does it look like a reasonable approach?

Rafael
Toshi Kani Feb. 4, 2013, 12:28 a.m. UTC | #25
On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote:
> On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote:
> > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
> > >  > This is already done for PCI host bridges and platform devices and I don't
> > > > > see why we can't do that for the other types of devices too.
> > > > > 
> > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > help from the driver core here.
> > > > 
> > > > There are three different approaches suggested for system device
> > > > hot-plug:
> > > >  A. Proceed within system device bus scan.
> > > >  B. Proceed within ACPI bus scan.
> > > >  C. Proceed with a sequence (as a mini-boot).
> > > > 
> > > > Option A uses system devices as tokens, option B uses acpi devices as
> > > > tokens, and option C uses resource tables as tokens, for their handlers.
> > > > 
> > > > Here is summary of key questions & answers so far.  I hope this
> > > > clarifies why I am suggesting option 3.
> > > > 
> > > > 1. What are the system devices?
> > > > System devices provide system-wide core computing resources, which are
> > > > essential to compose a computer system.  System devices are not
> > > > connected to any particular standard buses.
> > > 
> > > Not a problem, lots of devices are not connected to any "particular
> > > standard busses".  All this means is that system devices are connected
> > > to the "system" bus, nothing more.
> > 
> > Can you give me a few examples of other devices that support hotplug and
> > are not connected to any particular buses?  I will investigate them to
> > see how they are managed to support hotplug.
> 
> Any device that is attached to any bus in the driver model can be
> hotunplugged from userspace by telling it to be "unbound" from the
> driver controlling it.  Try it for any platform device in your system to
> see how it happens.

The unbind operation, as I understand from you, detaches a driver
from a device.  Yes, unbinding can be done for any device.  It is,
however, different from a hot-plug operation, which unplugs a device.

Today, the unbind operation on ACPI cpu/memory devices causes a
hot-unplug (offline) operation on them, which is one of the major issues
for us since unbind cannot fail.  This patchset addresses this issue by
making the unbind operation of ACPI cpu/memory devices do the
unbinding only.  ACPI drivers no longer control cpu and memory, as they
are supposed to be controlled by their own drivers, the cpu and memory
modules.  The current hotplug code requires putting all device control
stuff into ACPI, which this patchset is trying to fix.


> > > > 2. Why are the system devices special?
> > > > The system devices are initialized during early boot-time, by multiple
> > > > subsystems, from the boot-up sequence, in pre-defined order.  They
> > > > provide low-level services to enable other subsystems to come up.
> > > 
> > > Sorry, no, that doesn't mean they are special, nothing here is unique
> > > for the point of view of the driver model from any other device or bus.
> > 
> > I think system devices are unique in a sense that they are initialized
> > before drivers run.
> 
> No, most all devices are "initialized" before a driver runs on it, USB
> is one such example, PCI another, and I'm pretty sure that there are
> others.

USB devices can be initialized after the USB bus driver is initialized.
Similarly, PCI devices can be initialized after the PCI bus driver is
initialized.  However, CPU and memory are initialized without any
dependency on a bus driver, since there is no such thing.

In addition, CPU and memory have two drivers -- their actual
drivers/subsystems and their ACPI drivers.  Their actual
drivers/subsystems initialize cpu and memory.  The ACPI drivers
initialize driver-specific data of their ACPI nodes.  During hot-plug
operation, however, the current code requires these ACPI drivers to do
their hot-plug operations.  This patchset keeps them separated during
hot-plug and lets their actual drivers/subsystems do the job.


> > > If you need to initialize the driver
> > > core earlier, then do so.  Or, even better, just wait until enough of
> > > the system has come up and then go initialize all of the devices you
> > > have found so far as part of your boot process.
> > 
> > They are pseudo drivers that provide sysfs entry points of cpu and
> > memory.  They do not actually initialize cpu and memory.  I do not think
> > initializing cpu and memory fits into the driver model either, since
> > drivers should run after cpu and memory are initialized.
> 
> We already represent CPUs in the sysfs tree, don't represent them in two
> different places with two different structures.  Use the existing ones
> please.

This patchset does not make any changes to sysfs.  It does, however,
make the system drivers control the system devices, and ACPI do the
ACPI stuff only.

The boot sequence runs the following steps to initialize memory and
cpu, as well as their ACPI nodes.  These steps are independent; i.e.,
#1 and #2 run without ACPI.

 1. mm -> initialize memory -> create sysfs system/memory
 2. smp/cpu -> initialize cpu -> create sysfs system/cpu
 3. acpi core -> acpi scan -> create sysfs acpi nodes

While there are 3 separate steps at boot, the current hotplug code tries
to do everything in #3.  Therefore, this patchset provides a sequencer
(which is similar to the boot sequence) to run these steps during a
hot-plug operation as well.  Hence, we have consistent steps and roles
between boot and hot-plug operations.
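
As a purely illustrative sketch (the structure and function names below
are hypothetical, not the actual patchset code), such a sequencer would
keep a list of per-subsystem steps sorted by an order value and run
them in sequence during a hot-add, mirroring the staged ordering above:

#include <linux/list.h>

/* One step of a hot-add sequence (hypothetical structure). */
struct hotplug_step {
	struct list_head node;
	int order;			/* smaller values run earlier */
	int (*execute)(void *arg);	/* e.g. memory online, cpu online */
};

static LIST_HEAD(hotplug_add_steps);	/* kept sorted by ->order */

static int run_hotplug_add_sequence(void *arg)
{
	struct hotplug_step *step;
	int ret;

	list_for_each_entry(step, &hotplug_add_steps, node) {
		ret = step->execute(arg);
		if (ret)
			return ret;	/* caller would have to roll back */
	}
	return 0;
}

Each of the three roles above would register its own execute callback
with an order value reflecting its place in the sequence.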


> > > None of the above things you have stated seem to have anything to do
> > > with your proposed patch, so I don't understand why you have mentioned
> > > them...
> > 
> > You suggested option A before, which uses system bus scan to initialize
> > all system devices at boot time as well as hot-plug.  I tried to say
> > that this option would not be doable.
> 
> I haven't yet been convinced otherwise, sorry.  Please prove me wrong :)

This patchset enables the system drivers (i.e. cpu and memory drivers)
to do the hot-plug operations, and ACPI core to do the ACPI stuff.

We should not go with the system driver only approach since we do need
to use ACPI stuff.  Also, we should not go with the ACPI only approach
since we do need to use the system (and other) drivers.  This patchset
provides a sequencer to manage these steps across multiple subsystems.


Thanks,
-Toshi
Greg Kroah-Hartman Feb. 4, 2013, 1:23 a.m. UTC | #26
On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote:
> On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote:
> > On Fri, Feb 01, 2013 at 11:12:59PM +0100, Rafael J. Wysocki wrote:
> > > On Friday, February 01, 2013 08:23:12 AM Greg KH wrote:
> > > > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote:
> > > > > > > But, again, I'm going to ask why you aren't using the existing cpu /
> > > > > > > memory / bridge / node devices that we have in the kernel.  Please use
> > > > > > > them, or give me a _really_ good reason why they will not work.
> > > > > > 
> > > > > > We cannot use the existing system devices or ACPI devices here.  During
> > > > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory
> > > > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target
> > > > > > device information in a platform-neutral way.  During hot-add, we first
> > > > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices),
> > > > > > but platform-neutral modules cannot use them as they are ACPI-specific.
> > > > > 
> > > > > But suppose we're smart and have ACPI scan handlers that will create
> > > > > "physical" device nodes for those devices during the ACPI namespace scan.
> > > > > Then, the platform-neutral nodes will be able to bind to those "physical"
> > > > > nodes.  Moreover, it should be possible to get a hierarchy of device objects
> > > > > this way that will reflect all of the dependencies we need to take into
> > > > > account during hot-add and hot-remove operations.  That may not be what we
> > > > > have today, but I don't see any *fundamental* obstacles preventing us from
> > > > > using this approach.
> > > > 
> > > > I would _much_ rather see that be the solution here as I think it is the
> > > > proper one.
> > > > 
> > > > > This is already done for PCI host bridges and platform devices and I don't
> > > > > see why we can't do that for the other types of devices too.
> > > > 
> > > > I agree.
> > > > 
> > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > help from the driver core here.
> > > > 
> > > > I say do what we always have done here, if the user asked us to tear
> > > > something down, let it happen as they are the ones that know best :)
> > > > 
> > > > Seriously, I guess this gets back to the "fail disconnect" idea that the
> > > > ACPI developers keep harping on.  I thought we already resolved this
> > > > properly by having them implement it in their bus code, no reason the
> > > > same thing couldn't happen here, right?
> > > 
> > > Not really. :-)  We haven't ever resolved that particular issue I'm afraid.
> > 
> > Ah, I didn't realize that.
> > 
> > > > I don't think the core needs to do anything special, but if so, I'll be glad
> > > > to review it.
> > > 
> > > OK, so this is the use case.  We have "eject" defined for something like
> > > a container with a number of CPU cores, PCI host bridge, and a memory
> > > controller under it.  And a few pretty much arbitrary I/O devices as a bonus.
> > > 
> > > Now, there's a button on the system case labeled as "Eject" and if that button
> > > is pressed, we're supposed to _try_ to eject all of those things at once.  We
> > > are allowed to fail that request, though, if that's problematic for some
> > > reason, but we're supposed to let the BIOS know about that.
> > > 
> > > Do you seriously think that if that button is pressed, we should just proceed
> > > with removing all that stuff no matter what?  That'd be kind of like Russian
> > > roulette for whoever pressed that button, because s/he could only press it and
> > > wait for the system to either crash or not.  Or maybe to crash a bit later
> > > because of some delayed stuff that would hit one of those devices that had just
> > > gone.  Surely not a situation any admin of a high-availability system would
> > > like to be in. :-)
> > > 
> > > Quite frankly, I have no idea how that can be addressed in a single bus type,
> > > let alone ACPI (which is not even a proper bus type, just something pretending
> > > to be one).
> > 
> > You don't have it as a single bus type, you have a controller somewhere,
> > off of the bus being destroyed, that handles sending remove events to
> > the device and tearing everything down.  PCI does this from the very
> > beginning.
> 
> Yes, but those are just remove events and we can only see how destructive they
> were after the removal.  The point is to be able to figure out whether or not
> we *want* to do the removal in the first place.

Yes, but you will always race if you test to see whether you can shut
down a device and then try to do it.  So walking the bus ahead of
time isn't a good idea.

And we really don't have a viable way to recover if disconnect() fails,
do we?  What do we do in that situation?  Restore the other devices we
disconnected successfully?  How do we remember/know what they were?

PCI hotplug almost had this same problem until the designers finally
realized that they just had to accept the fact that removing a PCI
device could happen in one of two ways:
	- a user yanks out the device, at which time the OS had better
	  clean up properly no matter what happens
	- the user asks nicely to remove a device, and the OS can take
	  as long as it wants to complete that action, including
	  stalling for noticeable amounts of time before eventually,
	  always, letting the action succeed.

I think the second thing is what you have to do here.  If a user tells
the OS it wants to remove these devices, you better do it.  If you
can't, because memory is being used by someone else, either move them
off, or just hope that nothing bad happens, before the user gets
frustrated and yanks out the CPU/memory module themselves physically :)

> Say you have a computing node which signals a hardware problem in a processor
> package (the container with CPU cores, memory, PCI host bridge etc.).  You
> may want to eject that package, but you don't want to kill the system this
> way.  So if the eject is doable, it is very much desirable to do it, but if it
> is not doable, you'd rather shut the box down and do the replacement afterward.
> That may be costly, however (maybe weeks of computations), so it should be
> avoided if possible, but not at the expense of crashing the box if the eject
> doesn't work out.

These same "situations" came up for PCI hotplug, and I still say the
same resolution there holds true, as described above.  The user wants to
remove something, so let them do it.  They always know best, and get mad
at us if we think otherwise :)

What does the ACPI spec say about this type of thing?  Surely the same
people that did the PCI Hotplug spec were consulted when doing this part
of the spec, right?  Yeah, I know, I can dream...

> > I know it's more complicated with these types of devices, and I think we
> > are getting closer to the correct solution, I just don't want to ever
> > see duplicate devices in the driver model for the same physical device.
> 
> Do you mean two things based on struct device for the same hardware component?
> That's been happening already pretty much forever for every PCI device known
> to the ACPI layer, for PNP and many others.  However, those ACPI things are (or
> rather should be, but we're going to clean that up) only for convenience (to be
> able to see the namespace structure and related things in sysfs).  So the stuff
> under /sys/devices/LNXSYSTM\:00/ is not "real".

Yes, I've never treated that as a "real" device because they (usually)
never bound to the "real" driver that controlled the device and how
it talked to the rest of the OS (like a USB device, for example).  I
always thought of it just as a "shadow" of the firmware image, nothing
that should be directly operated on if at all possible.

But, as you are pointing out, maybe this needs to be changed.  Having
users have to look in one part of the tree for one interface to a
device, and another totally different part for a different interface to
the same physical device is crazy, don't you agree?

As to how to solve it, I really have no idea, I don't know ACPI that
well at all, and honestly, don't want to, I want to keep what little
hair I have left...

> In my view it shouldn't even
> be under /sys/devices/ (/sys/firmware/acpi/ seems to be a better place for it),

I agree.

> but that may be difficult to change without breaking user space (maybe we can
> just symlink it from /sys/devices/ or something).  And the ACPI bus type
> shouldn't even exist in my opinion.
> 
> There's much confusion in there and much work to clean that up, I agree, but
> that's kind of separate from the hotplug thing.

I agree as well.

Best of luck.

greg k-h
Greg Kroah-Hartman Feb. 4, 2013, 12:46 p.m. UTC | #27
On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote:
> On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote:
> > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote:
> > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
> > > >  > This is already done for PCI host bridges and platform devices and I don't
> > > > > > see why we can't do that for the other types of devices too.
> > > > > > 
> > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > > help from the driver core here.
> > > > > 
> > > > > There are three different approaches suggested for system device
> > > > > hot-plug:
> > > > >  A. Proceed within system device bus scan.
> > > > >  B. Proceed within ACPI bus scan.
> > > > >  C. Proceed with a sequence (as a mini-boot).
> > > > > 
> > > > > Option A uses system devices as tokens, option B uses acpi devices as
> > > > > tokens, and option C uses resource tables as tokens, for their handlers.
> > > > > 
> > > > > Here is summary of key questions & answers so far.  I hope this
> > > > > clarifies why I am suggesting option 3.
> > > > > 
> > > > > 1. What are the system devices?
> > > > > System devices provide system-wide core computing resources, which are
> > > > > essential to compose a computer system.  System devices are not
> > > > > connected to any particular standard buses.
> > > > 
> > > > Not a problem, lots of devices are not connected to any "particular
> > > > standard busses".  All this means is that system devices are connected
> > > > to the "system" bus, nothing more.
> > > 
> > > Can you give me a few examples of other devices that support hotplug and
> > > are not connected to any particular buses?  I will investigate them to
> > > see how they are managed to support hotplug.
> > 
> > Any device that is attached to any bus in the driver model can be
> > hotunplugged from userspace by telling it to be "unbound" from the
> > driver controlling it.  Try it for any platform device in your system to
> > see how it happens.
> 
> The unbind operation, as I understand from you, is to detach a driver
> from a device.  Yes, unbinding can be done for any devices.  It is
> however different from hot-plug operation, which unplugs a device.

Physically, yes, but to the driver involved, and the driver core, there
is no difference.  That was one of the primary goals of the driver core
creation so many years ago.

> Today, the unbind operation to an ACPI cpu/memory devices causes
> hot-unplug (offline) operation to them, which is one of the major issues
> for us since unbind cannot fail.  This patchset addresses this issue by
> making the unbind operation of ACPI cpu/memory devices to do the
> unbinding only.  ACPI drivers no longer control cpu and memory as they
> are supposed to be controlled by their drivers, cpu and memory modules.

I think that's the problem right there, solve that, please.

> > > > > 2. Why are the system devices special?
> > > > > The system devices are initialized during early boot-time, by multiple
> > > > > subsystems, from the boot-up sequence, in pre-defined order.  They
> > > > > provide low-level services to enable other subsystems to come up.
> > > > 
> > > > Sorry, no, that doesn't mean they are special, nothing here is unique
> > > > for the point of view of the driver model from any other device or bus.
> > > 
> > > I think system devices are unique in a sense that they are initialized
> > > before drivers run.
> > 
> > No, most all devices are "initialized" before a driver runs on it, USB
> > is one such example, PCI another, and I'm pretty sure that there are
> > others.
> 
> USB devices can be initialized after the USB bus driver is initialized.
> Similarly, PCI devices can be initialized after the PCI bus driver is
> initialized.  However, CPU and memory are initialized without any
> dependency to their bus driver since there is no such thing.

You can create such a thing if you want :)

> In addition, CPU and memory have two drivers -- their actual
> drivers/subsystems and their ACPI drivers.

Again, I feel that is the root of the problem.  Rafael seems to be
working on solving this, which I think is essential to your work as
well.

thanks,

greg k-h
Greg Kroah-Hartman Feb. 4, 2013, 12:48 p.m. UTC | #28
On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote:
> > Yes, but those are just remove events and we can only see how destructive they
> > were after the removal.  The point is to be able to figure out whether or not
> > we *want* to do the removal in the first place.
> > 
> > Say you have a computing node which signals a hardware problem in a processor
> > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > may want to eject that package, but you don't want to kill the system this
> > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > is not doable, you'd rather shut the box down and do the replacement afterward.
> > That may be costly, however (maybe weeks of computations), so it should be
> > avoided if possible, but not at the expense of crashing the box if the eject
> > doesn't work out.
> 
> It seems to me that we could handle that with the help of a new flag, say
> "no_eject", in struct device, a global mutex, and a function that will walk
> the given subtree of the device hierarchy and check if "no_eject" is set for
> any devices in there.  Plus a global "no_eject" switch, perhaps.

I think this will always be racy, or at worst, slow things down on
normal device operations as you will always be having to grab this flag
whenever you want to do something new.

See my comments earlier about pci hotplug and the design decisions there
about "no eject" capabilities for why.

thanks,

greg k-h
Rafael J. Wysocki Feb. 4, 2013, 1:41 p.m. UTC | #29
On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote:
> On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote:
> > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote:
> > > On Fri, Feb 01, 2013 at 11:12:59PM +0100, Rafael J. Wysocki wrote:
> > > > On Friday, February 01, 2013 08:23:12 AM Greg KH wrote:
> > > > > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote:
> > > > > > > > But, again, I'm going to ask why you aren't using the existing cpu /
> > > > > > > > memory / bridge / node devices that we have in the kernel.  Please use
> > > > > > > > them, or give me a _really_ good reason why they will not work.
> > > > > > > 
> > > > > > > We cannot use the existing system devices or ACPI devices here.  During
> > > > > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory
> > > > > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target
> > > > > > > device information in a platform-neutral way.  During hot-add, we first
> > > > > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices),
> > > > > > > but platform-neutral modules cannot use them as they are ACPI-specific.
> > > > > > 
> > > > > > But suppose we're smart and have ACPI scan handlers that will create
> > > > > > "physical" device nodes for those devices during the ACPI namespace scan.
> > > > > > Then, the platform-neutral nodes will be able to bind to those "physical"
> > > > > > nodes.  Moreover, it should be possible to get a hierarchy of device objects
> > > > > > this way that will reflect all of the dependencies we need to take into
> > > > > > account during hot-add and hot-remove operations.  That may not be what we
> > > > > > have today, but I don't see any *fundamental* obstacles preventing us from
> > > > > > using this approach.
> > > > > 
> > > > > I would _much_ rather see that be the solution here as I think it is the
> > > > > proper one.
> > > > > 
> > > > > > This is already done for PCI host bridges and platform devices and I don't
> > > > > > see why we can't do that for the other types of devices too.
> > > > > 
> > > > > I agree.
> > > > > 
> > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > > help from the driver core here.
> > > > > 
> > > > > I say do what we always have done here, if the user asked us to tear
> > > > > something down, let it happen as they are the ones that know best :)
> > > > > 
> > > > > Seriously, I guess this gets back to the "fail disconnect" idea that the
> > > > > ACPI developers keep harping on.  I thought we already resolved this
> > > > > properly by having them implement it in their bus code, no reason the
> > > > > same thing couldn't happen here, right?
> > > > 
> > > > Not really. :-)  We haven't ever resolved that particular issue I'm afraid.
> > > 
> > > Ah, I didn't realize that.
> > > 
> > > > > I don't think the core needs to do anything special, but if so, I'll be glad
> > > > > to review it.
> > > > 
> > > > OK, so this is the use case.  We have "eject" defined for something like
> > > > a container with a number of CPU cores, PCI host bridge, and a memory
> > > > controller under it.  And a few pretty much arbitrary I/O devices as a bonus.
> > > > 
> > > > Now, there's a button on the system case labeled as "Eject" and if that button
> > > > is pressed, we're supposed to _try_ to eject all of those things at once.  We
> > > > are allowed to fail that request, though, if that's problematic for some
> > > > reason, but we're supposed to let the BIOS know about that.
> > > > 
> > > > Do you seriously think that if that button is pressed, we should just proceed
> > > > with removing all that stuff no matter what?  That'd be kind of like Russian
> > > > roulette for whoever pressed that button, because s/he could only press it and
> > > > wait for the system to either crash or not.  Or maybe to crash a bit later
> > > > because of some delayed stuff that would hit one of those devices that had just
> > > > gone.  Surely not a situation any admin of a high-availability system would
> > > > like to be in. :-)
> > > > 
> > > > Quite frankly, I have no idea how that can be addressed in a single bus type,
> > > > let alone ACPI (which is not even a proper bus type, just something pretending
> > > > to be one).
> > > 
> > > You don't have it as a single bus type, you have a controller somewhere,
> > > off of the bus being destroyed, that handles sending remove events to
> > > the device and tearing everything down.  PCI does this from the very
> > > beginning.
> > 
> > Yes, but those are just remove events and we can only see how destructive they
> > were after the removal.  The point is to be able to figure out whether or not
> > we *want* to do the removal in the first place.
> 
> Yes, but, you will always race if you try to test to see if you can shut
> down a device and then trying to do it.  So walking the bus ahead of
> time isn't a good idea.
>
> And, we really don't have a viable way to recover if disconnect() fails,
> do we.  What do we do in that situation, restore the other devices we
> disconnected successfully?  How do we remember/know what they were?
> 
> PCI hotplug almost had this same problem until the designers finally
> realized that they just had to accept the fact that removing a PCI
> device could either happen by:
> 	- a user yanking out the device, at which time the OS better
> 	  clean up properly no matter what happens
> 	- the user asked nicely to remove a device, and the OS can take
> 	  as long as it wants to complete that action, including
> 	  stalling for noticable amounts of time before eventually,
> 	  always letting the action succeed.
> 
> I think the second thing is what you have to do here.  If a user tells
> the OS it wants to remove these devices, you better do it.  If you
> can't, because memory is being used by someone else, either move them
> off, or just hope that nothing bad happens, before the user gets
> frustrated and yanks out the CPU/memory module themselves physically :)

Well, that we can't help, but sometimes users really *want* the OS to tell them
if it is safe to unplug something at this particular time (think about
Windows' "safe remove" feature for USB sticks, for example; that came out of
users' demand AFAIR).

So in my opinion it would be good to give them an option to do "safe eject" or
"forcible eject", whichever they prefer.

> > Say you have a computing node which signals a hardware problem in a processor
> > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > may want to eject that package, but you don't want to kill the system this
> > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > is not doable, you'd rather shut the box down and do the replacement afterward.
> > That may be costly, however (maybe weeks of computations), so it should be
> > avoided if possible, but not at the expense of crashing the box if the eject
> > doesn't work out.
> 
> These same "situations" came up for PCI hotplug, and I still say the
> same resolution there holds true, as described above.  The user wants to
> remove something, so let them do it.  They always know best, and get mad
> at us if we think otherwise :)

Well, not necessarily.  Users sometimes really don't know what they are doing
and want us to give them a hint.  My opinion is that if we can give them a
hint, there's no reason not to.

> What does the ACPI spec say about this type of thing?  Surely the same
> people that did the PCI Hotplug spec were consulted when doing this part
> of the spec, right?  Yeah, I know, I can dream...

It's not very specific (as usual), but it gives hints. :-)

For example, there is the _OST method (Section 6.3.5 of ACPI 5) that we are
supposed to use to notify the platform of ejection failures and there are
status codes like "0x81: Device in use by application" or "0x82: Device busy"
that can be used in there.  So definitely the authors took ejection failures
for software-related reasons into consideration.
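
Just to illustrate, a notification like that could be sent with something along
the lines of the sketch below.  The function name and where it would be called
from are made up; only the _OST argument layout (source event, status code,
optional status buffer) and the 0x81 status code come from the spec:

#include <acpi/acpi.h>

static void example_report_eject_failure(acpi_handle handle)
{
        union acpi_object args[3];
        struct acpi_object_list arg_list = { .count = 3, .pointer = args };

        args[0].type = ACPI_TYPE_INTEGER;
        args[0].integer.value = 0x03;   /* source event: device eject request */
        args[1].type = ACPI_TYPE_INTEGER;
        args[1].integer.value = 0x81;   /* status: device in use by application */
        args[2].type = ACPI_TYPE_BUFFER;
        args[2].buffer.length = 0;
        args[2].buffer.pointer = NULL;

        /* _OST is optional, so a missing method is not treated as an error */
        acpi_evaluate_object(handle, "_OST", &arg_list, NULL);
}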

> > > I know it's more complicated with these types of devices, and I think we
> > > are getting closer to the correct solution, I just don't want to ever
> > > see duplicate devices in the driver model for the same physical device.
> > 
> > Do you mean two things based on struct device for the same hardware component?
> > That's been happening already pretty much forever for every PCI device known
> > to the ACPI layer, for PNP and many others.  However, those ACPI things are (or
> > rather should be, but we're going to clean that up) only for convenience (to be
> > able to see the namespace structure and related things in sysfs).  So the stuff
> > under /sys/devices/LNXSYSTM\:00/ is not "real".
> 
> Yes, I've never treated that as a "real" device because they (usually)
> didn't ever bind to the "real" driver that controlled the device and how
> it talked to the rest of the os (like a USB device for example.)  I
> always thought just of it as a "shadow" of the firmware image, nothing
> that should be directly operated on if at all possible.

Precisely.  That's why I'd like to move that stuff away from /sys/devices/
and I don't see a reason why these objects should be based on struct device.
They need kobjects to show up in sysfs, but apart from this they don't really
need anything from struct device as far as I can tell.

> But, as you are pointing out, maybe this needs to be changed.  Having
> users have to look in one part of the tree for one interface to a
> device, and another totally different part for a different interface to
> the same physical device is crazy, don't you agree?

Well, it is confusing.  I don't have a problem with exposing the ACPI namespace
in the form of a directory structure in sysfs and I see some benefits from
doing that, but I'd like it to be clear what's represented by that directory
structure and I don't want people to confuse ACPI device objects with devices
(they are abstract interfaces to devices rather than anything else).

> As to how to solve it, I really have no idea, I don't know ACPI that
> well at all, and honestly, don't want to, I want to keep what little
> hair I have left...

I totally understand you. :-)

> > In my view it shouldn't even
> > be under /sys/devices/ (/sys/firmware/acpi/ seems to be a better place for it),
> 
> I agree.
> 
> > but that may be difficult to change without breaking user space (maybe we can
> > just symlink it from /sys/devices/ or something).  And the ACPI bus type
> > shouldn't even exist in my opinion.
> > 
> > There's much confusion in there and much work to clean that up, I agree, but
> > that's kind of separate from the hotplug thing.
> 
> I agree as well.
> 
> Best of luck.

Thanks. :-)

Rafael
Rafael J. Wysocki Feb. 4, 2013, 2:21 p.m. UTC | #30
On Monday, February 04, 2013 04:48:10 AM Greg KH wrote:
> On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote:
> > > Yes, but those are just remove events and we can only see how destructive they
> > > were after the removal.  The point is to be able to figure out whether or not
> > > we *want* to do the removal in the first place.
> > > 
> > > Say you have a computing node which signals a hardware problem in a processor
> > > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > > may want to eject that package, but you don't want to kill the system this
> > > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > > is not doable, you'd rather shut the box down and do the replacement afterward.
> > > That may be costly, however (maybe weeks of computations), so it should be
> > > avoided if possible, but not at the expense of crashing the box if the eject
> > > doesn't work out.
> > 
> > It seems to me that we could handle that with the help of a new flag, say
> > "no_eject", in struct device, a global mutex, and a function that will walk
> > the given subtree of the device hierarchy and check if "no_eject" is set for
> > any devices in there.  Plus a global "no_eject" switch, perhaps.
> 
> I think this will always be racy, or at worst, slow things down on
> normal device operations as you will always be having to grab this flag
> whenever you want to do something new.

I don't see why this particular scheme should be racy, at least I don't see any
obvious races in it (although I'm not that good at race detection in general,
admittedly).

Also, I don't expect that flag to be used for everything, just for things known
to seriously break if a forcible eject is done.  That may not be precise enough,
so that's a matter of defining its purpose more precisely.

We can do something like that on the ACPI level (i.e. introduce a no_eject flag
in struct acpi_device and provide an interface for the layers above ACPI to
manipulate it) but then devices without ACPI namespace objects won't be
covered.  That may not be a big deal, though.

So say dev is about to be used for something incompatible with ejecting, so to
speak.  Then, one would do platform_lock_eject(dev), which would check if dev
has an ACPI handle and then take acpi_eject_lock (if so).  The return value of
platform_lock_eject(dev) would need to be checked to see whether the device is
still there.  If it returns success (0), one would do something to the device and
call platform_no_eject(dev) and then platform_unlock_eject(dev).

To clear no_eject one would just call platform_allow_to_eject(dev), which would
do all of the locking and clearing in one operation.
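
In code, a user of that interface would look roughly like the sketch below.
This is hypothetical, it only restates the proposal above; none of these
helpers exist today:

int example_start_using(struct device *dev)
{
        int error;

        /* takes acpi_eject_lock if dev has an ACPI handle */
        error = platform_lock_eject(dev);
        if (error)
                return error;   /* the device is already gone */

        /* ... set dev up for the eject-incompatible use ... */

        platform_no_eject(dev);         /* dev must not be ejected from now on */
        platform_unlock_eject(dev);
        return 0;
}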

The ACPI eject side would be similar to the thing I described previously,
so it would (1) take acpi_eject_lock, (2) see if any struct acpi_device
involved has no_eject set and if not, then (3) do acpi_bus_trim(), (4)
carry out the eject and (5) release acpi_eject_lock.

Step (2) above might be optional, i.e. if the eject is forcible, we would just
do (3) etc. without (2).

The locking should prevent races from happening (and it should prevent two
ejects from happening at the same time too, which is not a bad thing by itself).
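
Put differently, the eject path could be something along the lines below.  This
is purely illustrative: acpi_bus_trim() is the real interface referred to above
(its exact signature aside), while the subtree check and the other names are
assumptions:

static DEFINE_MUTEX(acpi_eject_lock);

/* hypothetical walk of the subtree looking for no_eject */
static bool example_subtree_has_no_eject(struct acpi_device *adev);

static int example_handle_eject(struct acpi_device *adev, bool forcible)
{
        int error = 0;

        mutex_lock(&acpi_eject_lock);                           /* (1) */

        /* (2) - skipped if the eject is forcible */
        if (!forcible && example_subtree_has_no_eject(adev)) {
                error = -EBUSY;         /* report the failure via _OST */
                goto out;
        }

        acpi_bus_trim(adev);                                    /* (3) */
        /* (4) evaluate _EJ0, power off the slot, and so on */
out:
        mutex_unlock(&acpi_eject_lock);                         /* (5) */
        return error;
}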

> See my comments earlier about pci hotplug and the design decisions there
> about "no eject" capabilities for why.

Well, I replied to that one too. :-)

Thanks,
Rafael
Greg Kroah-Hartman Feb. 4, 2013, 2:33 p.m. UTC | #31
On Mon, Feb 04, 2013 at 03:21:22PM +0100, Rafael J. Wysocki wrote:
> On Monday, February 04, 2013 04:48:10 AM Greg KH wrote:
> > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote:
> > > > Yes, but those are just remove events and we can only see how destructive they
> > > > were after the removal.  The point is to be able to figure out whether or not
> > > > we *want* to do the removal in the first place.
> > > > 
> > > > Say you have a computing node which signals a hardware problem in a processor
> > > > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > > > may want to eject that package, but you don't want to kill the system this
> > > > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > > > is not doable, you'd rather shut the box down and do the replacement afterward.
> > > > That may be costly, however (maybe weeks of computations), so it should be
> > > > avoided if possible, but not at the expense of crashing the box if the eject
> > > > doesn't work out.
> > > 
> > > It seems to me that we could handle that with the help of a new flag, say
> > > "no_eject", in struct device, a global mutex, and a function that will walk
> > > the given subtree of the device hierarchy and check if "no_eject" is set for
> > > any devices in there.  Plus a global "no_eject" switch, perhaps.
> > 
> > I think this will always be racy, or at worst, slow things down on
> > normal device operations as you will always be having to grab this flag
> > whenever you want to do something new.
> 
> I don't see why this particular scheme should be racy, at least I don't see any
> obvious races in it (although I'm not that good at races detection in general,
> admittedly).
> 
> Also, I don't expect that flag to be used for everything, just for things known
> to seriously break if forcible eject is done.  That may be not precise enough,
> so that's a matter of defining its purpose more precisely.
> 
> We can do something like that on the ACPI level (ie. introduce a no_eject flag
> in struct acpi_device and provide an iterface for the layers above ACPI to
> manipulate it) but then devices without ACPI namespace objects won't be
> covered.  That may not be a big deal, though.
> 
> So say dev is about to be used for something incompatible with ejecting, so to
> speak.  Then, one would do platform_lock_eject(dev), which would check if dev
> has an ACPI handle and then take acpi_eject_lock (if so).  The return value of
> platform_lock_eject(dev) would need to be checked to see if the device is not
> gone.  If it returns success (0), one would do something to the device and
> call platform_no_eject(dev) and then platform_unlock_eject(dev).

How does a device "know" it is doing something that is incompatible with
ejecting?  That's a non-trivial task from what I can tell.

What happens if a device wants to set that flag, right after it was told
to eject and the device was in the middle of being removed?  How can you
"fail" the "I can't be removed me now, so don't" requirement that it now
has?

thanks,

greg k-h
Toshi Kani Feb. 4, 2013, 4:02 p.m. UTC | #32
On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote:
> On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote:
> > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote:
> > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote:
  :
> > > Yes, but those are just remove events and we can only see how destructive they
> > > were after the removal.  The point is to be able to figure out whether or not
> > > we *want* to do the removal in the first place.
> > 
> > Yes, but, you will always race if you try to test to see if you can shut
> > down a device and then trying to do it.  So walking the bus ahead of
> > time isn't a good idea.
> >
> > And, we really don't have a viable way to recover if disconnect() fails,
> > do we.  What do we do in that situation, restore the other devices we
> > disconnected successfully?  How do we remember/know what they were?
> > 
> > PCI hotplug almost had this same problem until the designers finally
> > realized that they just had to accept the fact that removing a PCI
> > device could either happen by:
> > 	- a user yanking out the device, at which time the OS better
> > 	  clean up properly no matter what happens
> > 	- the user asked nicely to remove a device, and the OS can take
> > 	  as long as it wants to complete that action, including
> > 	  stalling for noticable amounts of time before eventually,
> > 	  always letting the action succeed.
> > 
> > I think the second thing is what you have to do here.  If a user tells
> > the OS it wants to remove these devices, you better do it.  If you
> > can't, because memory is being used by someone else, either move them
> > off, or just hope that nothing bad happens, before the user gets
> > frustrated and yanks out the CPU/memory module themselves physically :)
> 
> Well, that we can't help, but sometimes users really *want* the OS to tell them
> if it is safe to unplug something at this particualr time (think about the
> Windows' "safe remove" feature for USB sticks, for example; that came out of
> users' demand AFAIR).
> 
> So in my opinion it would be good to give them an option to do "safe eject" or
> "forcible eject", whichever they prefer.

For system device hot-plug, it always needs to be "safe eject".  This
feature will be implemented on mission-critical servers, which are
managed by professional IT folks.  Crashing a server costs the business
serious money.

A user yanking out a system device won't happen, and it immediately
crashes the system if it is done.  So, there is nothing we can do about
this case.  The second case can hang the operation, waiting forever to proceed,
which is still a serious issue for enterprise customers.


> > > Say you have a computing node which signals a hardware problem in a processor
> > > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > > may want to eject that package, but you don't want to kill the system this
> > > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > > is not doable, you'd rather shut the box down and do the replacement afterward.
> > > That may be costly, however (maybe weeks of computations), so it should be
> > > avoided if possible, but not at the expense of crashing the box if the eject
> > > doesn't work out.
> > 
> > These same "situations" came up for PCI hotplug, and I still say the
> > same resolution there holds true, as described above.  The user wants to
> > remove something, so let them do it.  They always know best, and get mad
> > at us if we think otherwise :)
> 
> Well, not necessarily.  Users sometimes really don't know what they are doing
> and want us to give them a hint.  My opinion is that if we can give them a
> hint, there's no reason not to.
> 
> > What does the ACPI spec say about this type of thing?  Surely the same
> > people that did the PCI Hotplug spec were consulted when doing this part
> > of the spec, right?  Yeah, I know, I can dream...
> 
> It's not very specific (as usual), but it gives hints. :-)
> 
> For example, there is the _OST method (Section 6.3.5 of ACPI 5) that we are
> supposed to use to notify the platform of ejection failures and there are
> status codes like "0x81: Device in use by application" or "0x82: Device busy"
> that can be used in there.  So definitely the authors took ejection failures
> for software-related reasons into consideration.

That is correct.  Also, the ACPI spec deliberately does not define
implementation details, so we defined the DIG64 hotplug spec below (which I
contributed to).

http://www.dig64.org/home/DIG64_HPPF_R1_0.pdf

For example, Figure 2 on page 14 shows the memory hot-remove flow.  The
operation needs to either succeed or fail.  A crash or hang is not an
option.
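
In kernel terms the flow boils down to error handling along these lines; the
wrapper below is hypothetical, but offline_pages() is the existing interface
that may legitimately fail when the memory cannot be vacated:

#include <linux/memory_hotplug.h>

static int example_memory_hot_remove(unsigned long start_pfn,
                                     unsigned long nr_pages)
{
        int error;

        /*
         * offline_pages() migrates users off the range and may fail with
         * e.g. -EBUSY.  In that case the hot-remove request simply fails
         * and the failure is reported back to firmware; it does not crash
         * or hang the system.
         */
        error = offline_pages(start_pfn, nr_pages);
        if (error)
                return error;

        /* only after a successful offline is the range actually removed */
        return 0;
}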


Thanks,
-Toshi
Toshi Kani Feb. 4, 2013, 4:19 p.m. UTC | #33
On Mon, 2013-02-04 at 15:21 +0100, Rafael J. Wysocki wrote:
> On Monday, February 04, 2013 04:48:10 AM Greg KH wrote:
> > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote:
> > > > Yes, but those are just remove events and we can only see how destructive they
> > > > were after the removal.  The point is to be able to figure out whether or not
> > > > we *want* to do the removal in the first place.
> > > > 
> > > > Say you have a computing node which signals a hardware problem in a processor
> > > > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > > > may want to eject that package, but you don't want to kill the system this
> > > > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > > > is not doable, you'd rather shut the box down and do the replacement afterward.
> > > > That may be costly, however (maybe weeks of computations), so it should be
> > > > avoided if possible, but not at the expense of crashing the box if the eject
> > > > doesn't work out.
> > > 
> > > It seems to me that we could handle that with the help of a new flag, say
> > > "no_eject", in struct device, a global mutex, and a function that will walk
> > > the given subtree of the device hierarchy and check if "no_eject" is set for
> > > any devices in there.  Plus a global "no_eject" switch, perhaps.
> > 
> > I think this will always be racy, or at worst, slow things down on
> > normal device operations as you will always be having to grab this flag
> > whenever you want to do something new.
> 
> I don't see why this particular scheme should be racy, at least I don't see any
> obvious races in it (although I'm not that good at races detection in general,
> admittedly).
> 
> Also, I don't expect that flag to be used for everything, just for things known
> to seriously break if forcible eject is done.  That may be not precise enough,
> so that's a matter of defining its purpose more precisely.
> 
> We can do something like that on the ACPI level (ie. introduce a no_eject flag
> in struct acpi_device and provide an iterface for the layers above ACPI to
> manipulate it) but then devices without ACPI namespace objects won't be
> covered.  That may not be a big deal, though.

I am afraid that bringing the device status management into the ACPI
level would not be a good idea.  acpi_device should only reflect ACPI
device object information, not how its actual device is being used.

I like your initiative of acpi_scan_driver and I think scanning /
trimming of ACPI object info is what the ACPI drivers should do.
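
In rough code form, that division of labor would look something like the
skeleton below; the struct name and its layout are illustrative only, meant to
mirror the idea discussed here rather than any final interface:

#include <acpi/acpi_bus.h>

/* hypothetical scan-only driver interface */
struct acpi_scan_driver {
        int (*attach)(struct acpi_device *adev);
        void (*detach)(struct acpi_device *adev);
};

static int example_container_attach(struct acpi_device *adev)
{
        /* record ACPI object information only; no device control here */
        return 0;
}

static void example_container_detach(struct acpi_device *adev)
{
        /* drop whatever attach recorded */
}

static struct acpi_scan_driver example_container_scan_driver = {
        .attach = example_container_attach,
        .detach = example_container_detach,
};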


Thanks,
-Toshi
Toshi Kani Feb. 4, 2013, 4:46 p.m. UTC | #34
On Mon, 2013-02-04 at 04:46 -0800, Greg KH wrote:
> On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote:
> > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote:
> > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote:
> > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
> > > > >  > This is already done for PCI host bridges and platform devices and I don't
> > > > > > > see why we can't do that for the other types of devices too.
> > > > > > > 
> > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > > > help from the driver core here.
> > > > > > 
> > > > > > There are three different approaches suggested for system device
> > > > > > hot-plug:
> > > > > >  A. Proceed within system device bus scan.
> > > > > >  B. Proceed within ACPI bus scan.
> > > > > >  C. Proceed with a sequence (as a mini-boot).
> > > > > > 
> > > > > > Option A uses system devices as tokens, option B uses acpi devices as
> > > > > > tokens, and option C uses resource tables as tokens, for their handlers.
> > > > > > 
> > > > > > Here is summary of key questions & answers so far.  I hope this
> > > > > > clarifies why I am suggesting option 3.
> > > > > > 
> > > > > > 1. What are the system devices?
> > > > > > System devices provide system-wide core computing resources, which are
> > > > > > essential to compose a computer system.  System devices are not
> > > > > > connected to any particular standard buses.
> > > > > 
> > > > > Not a problem, lots of devices are not connected to any "particular
> > > > > standard busses".  All this means is that system devices are connected
> > > > > to the "system" bus, nothing more.
> > > > 
> > > > Can you give me a few examples of other devices that support hotplug and
> > > > are not connected to any particular buses?  I will investigate them to
> > > > see how they are managed to support hotplug.
> > > 
> > > Any device that is attached to any bus in the driver model can be
> > > hotunplugged from userspace by telling it to be "unbound" from the
> > > driver controlling it.  Try it for any platform device in your system to
> > > see how it happens.
> > 
> > The unbind operation, as I understand from you, is to detach a driver
> > from a device.  Yes, unbinding can be done for any devices.  It is
> > however different from hot-plug operation, which unplugs a device.
> 
> Physically, yes, but to the driver involved, and the driver core, there
> is no difference.  That was one of the primary goals of the driver core
> creation so many years ago.
> 
> > Today, the unbind operation to an ACPI cpu/memory devices causes
> > hot-unplug (offline) operation to them, which is one of the major issues
> > for us since unbind cannot fail.  This patchset addresses this issue by
> > making the unbind operation of ACPI cpu/memory devices to do the
> > unbinding only.  ACPI drivers no longer control cpu and memory as they
> > are supposed to be controlled by their drivers, cpu and memory modules.
> 
> I think that's the problem right there, solve that, please.

We cannot eliminate the ACPI drivers since we have to scan ACPI.  But we
can limit the ACPI drivers to do the scanning stuff only.  This is
precisely the intent of this patchset.  The real stuff, removing actual
devices, is done by the system device drivers/modules.


> > > > > > 2. Why are the system devices special?
> > > > > > The system devices are initialized during early boot-time, by multiple
> > > > > > subsystems, from the boot-up sequence, in pre-defined order.  They
> > > > > > provide low-level services to enable other subsystems to come up.
> > > > > 
> > > > > Sorry, no, that doesn't mean they are special, nothing here is unique
> > > > > for the point of view of the driver model from any other device or bus.
> > > > 
> > > > I think system devices are unique in a sense that they are initialized
> > > > before drivers run.
> > > 
> > > No, most all devices are "initialized" before a driver runs on it, USB
> > > is one such example, PCI another, and I'm pretty sure that there are
> > > others.
> > 
> > USB devices can be initialized after the USB bus driver is initialized.
> > Similarly, PCI devices can be initialized after the PCI bus driver is
> > initialized.  However, CPU and memory are initialized without any
> > dependency to their bus driver since there is no such thing.
> 
> You can create such a thing if you want :)

Well, a pseudo driver could be created for it, but it does not make any
difference.  Access to CPU and memory does not go through any bus
controller visible to the OS.  CPU and memory are connected with links
(which are up from the beginning) and no longer have a bus structure.


> > In addition, CPU and memory have two drivers -- their actual
> > drivers/subsystems and their ACPI drivers.
> 
> Again, I feel that is the root of the problem.  Rafael seems to be
> working on solving this, which I think is essential to your work as
> well.

Yes, Rafael is doing excellent work to turn ACPI drivers into ACPI
"scan" drivers, removing device driver portion, and keeping them as
attach / detach operation to ACPI device object.  My patchset is very
much aligned with this direction. :)


Thanks,
-Toshi
Rafael J. Wysocki Feb. 4, 2013, 7:43 p.m. UTC | #35
On Monday, February 04, 2013 09:19:09 AM Toshi Kani wrote:
> On Mon, 2013-02-04 at 15:21 +0100, Rafael J. Wysocki wrote:
> > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote:
> > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote:
> > > > > Yes, but those are just remove events and we can only see how destructive they
> > > > > were after the removal.  The point is to be able to figure out whether or not
> > > > > we *want* to do the removal in the first place.
> > > > > 
> > > > > Say you have a computing node which signals a hardware problem in a processor
> > > > > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > > > > may want to eject that package, but you don't want to kill the system this
> > > > > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > > > > is not doable, you'd rather shut the box down and do the replacement afterward.
> > > > > That may be costly, however (maybe weeks of computations), so it should be
> > > > > avoided if possible, but not at the expense of crashing the box if the eject
> > > > > doesn't work out.
> > > > 
> > > > It seems to me that we could handle that with the help of a new flag, say
> > > > "no_eject", in struct device, a global mutex, and a function that will walk
> > > > the given subtree of the device hierarchy and check if "no_eject" is set for
> > > > any devices in there.  Plus a global "no_eject" switch, perhaps.
> > > 
> > > I think this will always be racy, or at worst, slow things down on
> > > normal device operations as you will always be having to grab this flag
> > > whenever you want to do something new.
> > 
> > I don't see why this particular scheme should be racy, at least I don't see any
> > obvious races in it (although I'm not that good at races detection in general,
> > admittedly).
> > 
> > Also, I don't expect that flag to be used for everything, just for things known
> > to seriously break if forcible eject is done.  That may be not precise enough,
> > so that's a matter of defining its purpose more precisely.
> > 
> > We can do something like that on the ACPI level (ie. introduce a no_eject flag
> > in struct acpi_device and provide an iterface for the layers above ACPI to
> > manipulate it) but then devices without ACPI namespace objects won't be
> > covered.  That may not be a big deal, though.
> 
> I am afraid that bringing the device status management into the ACPI
> level would not be a good idea.  acpi_device should only reflect ACPI
> device object information, not how its actual device is being used.
> 
> I like your initiative of acpi_scan_driver and I think scanning /
> trimming of ACPI object info is what the ACPI drivers should do.

ACPI drivers, yes, but the users of ACPI already rely on information
in struct acpi_device.  Like ACPI device power states, for example.

So platform_no_eject(dev) is not much different in that respect from
platform_pci_set_power_state(pci_dev).

The whole "eject" concept is somewhat ACPI-specific, though, and the eject
notifications come from ACPI, so I don't have a problem with limiting it to
ACPI-backed devices for the time being.

If it turns out to be useful outside of ACPI, then we can move it up to the
driver core.  For now I don't see a compelling reason to do that.

Thanks,
Rafael
Rafael J. Wysocki Feb. 4, 2013, 7:45 p.m. UTC | #36
On Monday, February 04, 2013 09:46:18 AM Toshi Kani wrote:
> On Mon, 2013-02-04 at 04:46 -0800, Greg KH wrote:
> > On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote:
> > > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote:
> > > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote:
> > > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> > > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
> > > > > >  > This is already done for PCI host bridges and platform devices and I don't
> > > > > > > > see why we can't do that for the other types of devices too.
> > > > > > > > 
> > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > > > > help from the driver core here.
> > > > > > > 
> > > > > > > There are three different approaches suggested for system device
> > > > > > > hot-plug:
> > > > > > >  A. Proceed within system device bus scan.
> > > > > > >  B. Proceed within ACPI bus scan.
> > > > > > >  C. Proceed with a sequence (as a mini-boot).
> > > > > > > 
> > > > > > > Option A uses system devices as tokens, option B uses acpi devices as
> > > > > > > tokens, and option C uses resource tables as tokens, for their handlers.
> > > > > > > 
> > > > > > > Here is summary of key questions & answers so far.  I hope this
> > > > > > > clarifies why I am suggesting option 3.
> > > > > > > 
> > > > > > > 1. What are the system devices?
> > > > > > > System devices provide system-wide core computing resources, which are
> > > > > > > essential to compose a computer system.  System devices are not
> > > > > > > connected to any particular standard buses.
> > > > > > 
> > > > > > Not a problem, lots of devices are not connected to any "particular
> > > > > > standard busses".  All this means is that system devices are connected
> > > > > > to the "system" bus, nothing more.
> > > > > 
> > > > > Can you give me a few examples of other devices that support hotplug and
> > > > > are not connected to any particular buses?  I will investigate them to
> > > > > see how they are managed to support hotplug.
> > > > 
> > > > Any device that is attached to any bus in the driver model can be
> > > > hotunplugged from userspace by telling it to be "unbound" from the
> > > > driver controlling it.  Try it for any platform device in your system to
> > > > see how it happens.
> > > 
> > > The unbind operation, as I understand from you, is to detach a driver
> > > from a device.  Yes, unbinding can be done for any devices.  It is
> > > however different from hot-plug operation, which unplugs a device.
> > 
> > Physically, yes, but to the driver involved, and the driver core, there
> > is no difference.  That was one of the primary goals of the driver core
> > creation so many years ago.
> > 
> > > Today, the unbind operation to an ACPI cpu/memory devices causes
> > > hot-unplug (offline) operation to them, which is one of the major issues
> > > for us since unbind cannot fail.  This patchset addresses this issue by
> > > making the unbind operation of ACPI cpu/memory devices to do the
> > > unbinding only.  ACPI drivers no longer control cpu and memory as they
> > > are supposed to be controlled by their drivers, cpu and memory modules.
> > 
> > I think that's the problem right there, solve that, please.
> 
> We cannot eliminate the ACPI drivers since we have to scan ACPI.  But we
> can limit the ACPI drivers to do the scanning stuff only.   This is
> precisely the intend of this patchset.  The real stuff, removing actual
> devices, is done by the system device drivers/modules.

In case you haven't realized that yet, the $subject patchset has no future.

Let's just talk about how we can get what we need in more general terms.

Thanks,
Rafael
Toshi Kani Feb. 4, 2013, 7:46 p.m. UTC | #37
On Mon, 2013-02-04 at 20:48 +0100, Rafael J. Wysocki wrote:
> On Monday, February 04, 2013 09:02:46 AM Toshi Kani wrote:
> > On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote:
> > > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote:
> > > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote:
> > > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote:
> >   :
> > > > > Yes, but those are just remove events and we can only see how destructive they
> > > > > were after the removal.  The point is to be able to figure out whether or not
> > > > > we *want* to do the removal in the first place.
> > > > 
> > > > Yes, but, you will always race if you try to test to see if you can shut
> > > > down a device and then trying to do it.  So walking the bus ahead of
> > > > time isn't a good idea.
> > > >
> > > > And, we really don't have a viable way to recover if disconnect() fails,
> > > > do we.  What do we do in that situation, restore the other devices we
> > > > disconnected successfully?  How do we remember/know what they were?
> > > > 
> > > > PCI hotplug almost had this same problem until the designers finally
> > > > realized that they just had to accept the fact that removing a PCI
> > > > device could either happen by:
> > > > 	- a user yanking out the device, at which time the OS better
> > > > 	  clean up properly no matter what happens
> > > > 	- the user asked nicely to remove a device, and the OS can take
> > > > 	  as long as it wants to complete that action, including
> > > > 	  stalling for noticable amounts of time before eventually,
> > > > 	  always letting the action succeed.
> > > > 
> > > > I think the second thing is what you have to do here.  If a user tells
> > > > the OS it wants to remove these devices, you better do it.  If you
> > > > can't, because memory is being used by someone else, either move them
> > > > off, or just hope that nothing bad happens, before the user gets
> > > > frustrated and yanks out the CPU/memory module themselves physically :)
> > > 
> > > Well, that we can't help, but sometimes users really *want* the OS to tell them
> > > if it is safe to unplug something at this particualr time (think about the
> > > Windows' "safe remove" feature for USB sticks, for example; that came out of
> > > users' demand AFAIR).
> > > 
> > > So in my opinion it would be good to give them an option to do "safe eject" or
> > > "forcible eject", whichever they prefer.
> > 
> > For system device hot-plug, it always needs to be "safe eject".  This
> > feature will be implemented on mission critical servers, which are
> > managed by professional IT folks.  Crashing a server causes serious
> > money to the business.
> 
> Well, "always" is a bit too strong a word as far as human behavior is concerned
> in my opinion.
> 
> That said I would be perfectly fine with not supporting the "forcible eject" to
> start with and waiting for the first request to add support for it.  I also
> would be fine with taking bets on how much time it's going to take for such a
> request to appear. :-)

Sounds good.  In my experience, though, it actually takes a LONG time to
convince customers that "safe eject" is actually safe.  Enterprise
customers are so afraid of doing anything risky that might cause the
system to crash or hang due to some defect.  I would be very surprised
to see a customer asking for a force operation when we do not guarantee
its outcome.  I have not seen such enterprise customers yet.

Thanks,
-Toshi
Rafael J. Wysocki Feb. 4, 2013, 7:48 p.m. UTC | #38
On Monday, February 04, 2013 09:02:46 AM Toshi Kani wrote:
> On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote:
> > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote:
> > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote:
> > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote:
>   :
> > > > Yes, but those are just remove events and we can only see how destructive they
> > > > were after the removal.  The point is to be able to figure out whether or not
> > > > we *want* to do the removal in the first place.
> > > 
> > > Yes, but, you will always race if you try to test to see if you can shut
> > > down a device and then trying to do it.  So walking the bus ahead of
> > > time isn't a good idea.
> > >
> > > And, we really don't have a viable way to recover if disconnect() fails,
> > > do we.  What do we do in that situation, restore the other devices we
> > > disconnected successfully?  How do we remember/know what they were?
> > > 
> > > PCI hotplug almost had this same problem until the designers finally
> > > realized that they just had to accept the fact that removing a PCI
> > > device could either happen by:
> > > 	- a user yanking out the device, at which time the OS better
> > > 	  clean up properly no matter what happens
> > > 	- the user asked nicely to remove a device, and the OS can take
> > > 	  as long as it wants to complete that action, including
> > > 	  stalling for noticable amounts of time before eventually,
> > > 	  always letting the action succeed.
> > > 
> > > I think the second thing is what you have to do here.  If a user tells
> > > the OS it wants to remove these devices, you better do it.  If you
> > > can't, because memory is being used by someone else, either move them
> > > off, or just hope that nothing bad happens, before the user gets
> > > frustrated and yanks out the CPU/memory module themselves physically :)
> > 
> > Well, that we can't help, but sometimes users really *want* the OS to tell them
> > if it is safe to unplug something at this particualr time (think about the
> > Windows' "safe remove" feature for USB sticks, for example; that came out of
> > users' demand AFAIR).
> > 
> > So in my opinion it would be good to give them an option to do "safe eject" or
> > "forcible eject", whichever they prefer.
> 
> For system device hot-plug, it always needs to be "safe eject".  This
> feature will be implemented on mission critical servers, which are
> managed by professional IT folks.  Crashing a server causes serious
> money to the business.

Well, "always" is a bit too strong a word as far as human behavior is concerned
in my opinion.

That said I would be perfectly fine with not supporting the "forcible eject" to
start with and waiting for the first request to add support for it.  I also
would be fine with taking bets on how much time it's going to take for such a
request to appear. :-)

Thanks,
Rafael
Rafael J. Wysocki Feb. 4, 2013, 8:07 p.m. UTC | #39
On Monday, February 04, 2013 06:33:52 AM Greg KH wrote:
> On Mon, Feb 04, 2013 at 03:21:22PM +0100, Rafael J. Wysocki wrote:
> > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote:
> > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote:
> > > > > Yes, but those are just remove events and we can only see how destructive they
> > > > > were after the removal.  The point is to be able to figure out whether or not
> > > > > we *want* to do the removal in the first place.
> > > > > 
> > > > > Say you have a computing node which signals a hardware problem in a processor
> > > > > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > > > > may want to eject that package, but you don't want to kill the system this
> > > > > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > > > > is not doable, you'd rather shut the box down and do the replacement afterward.
> > > > > That may be costly, however (maybe weeks of computations), so it should be
> > > > > avoided if possible, but not at the expense of crashing the box if the eject
> > > > > doesn't work out.
> > > > 
> > > > It seems to me that we could handle that with the help of a new flag, say
> > > > "no_eject", in struct device, a global mutex, and a function that will walk
> > > > the given subtree of the device hierarchy and check if "no_eject" is set for
> > > > any devices in there.  Plus a global "no_eject" switch, perhaps.
> > > 
> > > I think this will always be racy, or at worst, slow things down on
> > > normal device operations as you will always be having to grab this flag
> > > whenever you want to do something new.
> > 
> > I don't see why this particular scheme should be racy, at least I don't see any
> > obvious races in it (although I'm not that good at races detection in general,
> > admittedly).
> > 
> > Also, I don't expect that flag to be used for everything, just for things known
> > to seriously break if forcible eject is done.  That may be not precise enough,
> > so that's a matter of defining its purpose more precisely.
> > 
> > We can do something like that on the ACPI level (ie. introduce a no_eject flag
> > in struct acpi_device and provide an iterface for the layers above ACPI to
> > manipulate it) but then devices without ACPI namespace objects won't be
> > covered.  That may not be a big deal, though.
> > 
> > So say dev is about to be used for something incompatible with ejecting, so to
> > speak.  Then, one would do platform_lock_eject(dev), which would check if dev
> > has an ACPI handle and then take acpi_eject_lock (if so).  The return value of
> > platform_lock_eject(dev) would need to be checked to see if the device is not
> > gone.  If it returns success (0), one would do something to the device and
> > call platform_no_eject(dev) and then platform_unlock_eject(dev).
> 
> How does a device "know" it is doing something that is incompatible with
> ejecting?  That's a non-trivial task from what I can tell.

I agree that this is complicated in general.  But.

There are devices known to have software "offline" and "online" operations
such that after the "offline" the given device is guaranteed not to be used
until "online".  We have that for CPU cores, for example, and user space can
do it via /sys/devices/system/cpu/cpuX/online.  So, why don't we make the
"online" set the no_eject flag (under the lock as appropriate) and the
"offline" clear it?  And why don't we define such "online" and "offline" for
all of the other "system" stuff, like memory, PCI host bridges etc. and make it
behave analogously?

Then, it is quite simple to say which devices should use the no_eject flag:
devices that have "online" and "offline" exported to user space.  And guess
who's responsible for "offlining" all of those things before trying to eject
them: user space is.  From the kernel's point of view it is all clear.  Hands
clean. :-)
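
As a sketch, the store side of such an "online" attribute could then look like
this; the platform_* helpers are the hypothetical ones from earlier in the
thread, and the sysfs attribute wiring is left out:

static ssize_t example_online_store(struct device *dev,
                                    struct device_attribute *attr,
                                    const char *buf, size_t count)
{
        bool online;
        int error;

        error = strtobool(buf, &online);
        if (error)
                return error;

        if (online) {
                error = platform_lock_eject(dev);       /* fails if dev is gone */
                if (error)
                        return error;
                /* ... bring the device online ... */
                platform_no_eject(dev);         /* it must not be ejected now */
                platform_unlock_eject(dev);
        } else {
                /* ... take the device offline ... */
                platform_allow_to_eject(dev);   /* eject may proceed again */
        }
        return count;
}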

Now, there's a different problem of how to expose all of the relevant information
to user space so that it knows what to "offline" for the specific eject
operation to succeed, but that's kind of separate and worth addressing
anyway.

> What happens if a device wants to set that flag, right after it was told
> to eject and the device was in the middle of being removed?  How can you
> "fail" the "I can't be removed me now, so don't" requirement that it now
> has?

This one is easy. :-)

If platform_lock_eject() is called when an eject is under way, it will block
on acpi_eject_lock until the eject is complete and if the device is gone as
a result of the eject, it will return an error code.

In turn, if an eject happens after platform_lock_eject(), it will block until
platform_unlock_eject() and if platform_no_eject() is called in between the
lock and unlock, it will notice the device with no_eject set and bail out.

Quite obviously, it would be a bug to call platform_lock_eject() from within an
eject code path.

Thanks,
Rafael
Rafael J. Wysocki Feb. 4, 2013, 8:12 p.m. UTC | #40
On Monday, February 04, 2013 12:46:24 PM Toshi Kani wrote:
> On Mon, 2013-02-04 at 20:48 +0100, Rafael J. Wysocki wrote:
> > On Monday, February 04, 2013 09:02:46 AM Toshi Kani wrote:
> > > On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote:
> > > > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote:
> > > > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote:
> > > > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote:
> > >   :
> > > > > > Yes, but those are just remove events and we can only see how destructive they
> > > > > > were after the removal.  The point is to be able to figure out whether or not
> > > > > > we *want* to do the removal in the first place.
> > > > > 
> > > > > Yes, but, you will always race if you try to test to see if you can shut
> > > > > down a device and then trying to do it.  So walking the bus ahead of
> > > > > time isn't a good idea.
> > > > >
> > > > > And, we really don't have a viable way to recover if disconnect() fails,
> > > > > do we.  What do we do in that situation, restore the other devices we
> > > > > disconnected successfully?  How do we remember/know what they were?
> > > > > 
> > > > > PCI hotplug almost had this same problem until the designers finally
> > > > > realized that they just had to accept the fact that removing a PCI
> > > > > device could either happen by:
> > > > > 	- a user yanking out the device, at which time the OS better
> > > > > 	  clean up properly no matter what happens
> > > > > 	- the user asked nicely to remove a device, and the OS can take
> > > > > 	  as long as it wants to complete that action, including
> > > > > 	  stalling for noticable amounts of time before eventually,
> > > > > 	  always letting the action succeed.
> > > > > 
> > > > > I think the second thing is what you have to do here.  If a user tells
> > > > > the OS it wants to remove these devices, you better do it.  If you
> > > > > can't, because memory is being used by someone else, either move them
> > > > > off, or just hope that nothing bad happens, before the user gets
> > > > > frustrated and yanks out the CPU/memory module themselves physically :)
> > > > 
> > > > Well, that we can't help, but sometimes users really *want* the OS to tell them
> > > > if it is safe to unplug something at this particualr time (think about the
> > > > Windows' "safe remove" feature for USB sticks, for example; that came out of
> > > > users' demand AFAIR).
> > > > 
> > > > So in my opinion it would be good to give them an option to do "safe eject" or
> > > > "forcible eject", whichever they prefer.
> > > 
> > > For system device hot-plug, it always needs to be "safe eject".  This
> > > feature will be implemented on mission critical servers, which are
> > > managed by professional IT folks.  Crashing a server causes serious
> > > money to the business.
> > 
> > Well, "always" is a bit too strong a word as far as human behavior is concerned
> > in my opinion.
> > 
> > That said I would be perfectly fine with not supporting the "forcible eject" to
> > start with and waiting for the first request to add support for it.  I also
> > would be fine with taking bets on how much time it's going to take for such a
> > request to appear. :-)
> 
> Sounds good.  In my experience, though, it actually takes a LONG time to
> convince customers that "safe eject" is actually safe.  Enterprise
> customers are so afraid of doing anything risky that might cause the
> system to crash or hang due to some defect.  I would be very surprised
> to see a customer asking for a force operation when we do not guarantee
> its outcome.  I have not seen such enterprise customers yet.

But we're talking about a kernel that is supposed to run on mobile phones too,
among other things.

Thanks,
Rafael
Toshi Kani Feb. 4, 2013, 8:34 p.m. UTC | #41
On Mon, 2013-02-04 at 21:12 +0100, Rafael J. Wysocki wrote:
> On Monday, February 04, 2013 12:46:24 PM Toshi Kani wrote:
> > On Mon, 2013-02-04 at 20:48 +0100, Rafael J. Wysocki wrote:
> > > On Monday, February 04, 2013 09:02:46 AM Toshi Kani wrote:
> > > > On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote:
> > > > > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote:
> > > > > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote:
> > > > > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote:
> > > >   :
> > > > > > > Yes, but those are just remove events and we can only see how destructive they
> > > > > > > were after the removal.  The point is to be able to figure out whether or not
> > > > > > > we *want* to do the removal in the first place.
> > > > > > 
> > > > > > Yes, but, you will always race if you try to test to see if you can shut
> > > > > > down a device and then trying to do it.  So walking the bus ahead of
> > > > > > time isn't a good idea.
> > > > > >
> > > > > > And, we really don't have a viable way to recover if disconnect() fails,
> > > > > > do we.  What do we do in that situation, restore the other devices we
> > > > > > disconnected successfully?  How do we remember/know what they were?
> > > > > > 
> > > > > > PCI hotplug almost had this same problem until the designers finally
> > > > > > realized that they just had to accept the fact that removing a PCI
> > > > > > device could either happen by:
> > > > > > 	- a user yanking out the device, at which time the OS better
> > > > > > 	  clean up properly no matter what happens
> > > > > > 	- the user asked nicely to remove a device, and the OS can take
> > > > > > 	  as long as it wants to complete that action, including
> > > > > > 	  stalling for noticable amounts of time before eventually,
> > > > > > 	  always letting the action succeed.
> > > > > > 
> > > > > > I think the second thing is what you have to do here.  If a user tells
> > > > > > the OS it wants to remove these devices, you better do it.  If you
> > > > > > can't, because memory is being used by someone else, either move them
> > > > > > off, or just hope that nothing bad happens, before the user gets
> > > > > > frustrated and yanks out the CPU/memory module themselves physically :)
> > > > > 
> > > > > Well, that we can't help, but sometimes users really *want* the OS to tell them
> > > > > if it is safe to unplug something at this particualr time (think about the
> > > > > Windows' "safe remove" feature for USB sticks, for example; that came out of
> > > > > users' demand AFAIR).
> > > > > 
> > > > > So in my opinion it would be good to give them an option to do "safe eject" or
> > > > > "forcible eject", whichever they prefer.
> > > > 
> > > > For system device hot-plug, it always needs to be "safe eject".  This
> > > > feature will be implemented on mission critical servers, which are
> > > > managed by professional IT folks.  Crashing a server causes serious
> > > > money to the business.
> > > 
> > > Well, "always" is a bit too strong a word as far as human behavior is concerned
> > > in my opinion.
> > > 
> > > That said I would be perfectly fine with not supporting the "forcible eject" to
> > > start with and waiting for the first request to add support for it.  I also
> > > would be fine with taking bets on how much time it's going to take for such a
> > > request to appear. :-)
> > 
> > Sounds good.  In my experience, though, it actually takes a LONG time to
> > convince customers that "safe eject" is actually safe.  Enterprise
> > customers are so afraid of doing anything risky that might cause the
> > system to crash or hang due to some defect.  I would be very surprised
> > to see a customer asking for a force operation when we do not guarantee
> > its outcome.  I have not seen such enterprise customers yet.
> 
> But we're talking about a kernel that is supposed to run on mobile phones too,
> among other things.

I think the use of this feature for RAS, i.e. replacing a faulty device
on-line, will continue to be limited to high-end systems.  For low-end
systems, it does not make sense for customers to pay much $$ for this
feature.  They can just shut the system down for replacement, or they
can simply buy a new system instead of repairing it.

That said, using this feature on a VM for workload balancing does not
require any special hardware.  So, I can see someone trying out a force
option on a VM for personal use just to see how it goes.

Thanks,
-Toshi
Toshi Kani Feb. 4, 2013, 8:59 p.m. UTC | #42
On Mon, 2013-02-04 at 20:45 +0100, Rafael J. Wysocki wrote:
> On Monday, February 04, 2013 09:46:18 AM Toshi Kani wrote:
> > On Mon, 2013-02-04 at 04:46 -0800, Greg KH wrote:
> > > On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote:
> > > > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote:
> > > > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote:
> > > > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> > > > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
> > > > > > >  > This is already done for PCI host bridges and platform devices and I don't
> > > > > > > > > see why we can't do that for the other types of devices too.
> > > > > > > > > 
> > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > > > > > help from the driver core here.
> > > > > > > > 
> > > > > > > > There are three different approaches suggested for system device
> > > > > > > > hot-plug:
> > > > > > > >  A. Proceed within system device bus scan.
> > > > > > > >  B. Proceed within ACPI bus scan.
> > > > > > > >  C. Proceed with a sequence (as a mini-boot).
> > > > > > > > 
> > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as
> > > > > > > > tokens, and option C uses resource tables as tokens, for their handlers.
> > > > > > > > 
> > > > > > > > Here is summary of key questions & answers so far.  I hope this
> > > > > > > > clarifies why I am suggesting option 3.
> > > > > > > > 
> > > > > > > > 1. What are the system devices?
> > > > > > > > System devices provide system-wide core computing resources, which are
> > > > > > > > essential to compose a computer system.  System devices are not
> > > > > > > > connected to any particular standard buses.
> > > > > > > 
> > > > > > > Not a problem, lots of devices are not connected to any "particular
> > > > > > > standard busses".  All this means is that system devices are connected
> > > > > > > to the "system" bus, nothing more.
> > > > > > 
> > > > > > Can you give me a few examples of other devices that support hotplug and
> > > > > > are not connected to any particular buses?  I will investigate them to
> > > > > > see how they are managed to support hotplug.
> > > > > 
> > > > > Any device that is attached to any bus in the driver model can be
> > > > > hotunplugged from userspace by telling it to be "unbound" from the
> > > > > driver controlling it.  Try it for any platform device in your system to
> > > > > see how it happens.
> > > > 
> > > > The unbind operation, as I understand from you, is to detach a driver
> > > > from a device.  Yes, unbinding can be done for any devices.  It is
> > > > however different from hot-plug operation, which unplugs a device.
> > > 
> > > Physically, yes, but to the driver involved, and the driver core, there
> > > is no difference.  That was one of the primary goals of the driver core
> > > creation so many years ago.
> > > 
> > > > Today, the unbind operation to an ACPI cpu/memory devices causes
> > > > hot-unplug (offline) operation to them, which is one of the major issues
> > > > for us since unbind cannot fail.  This patchset addresses this issue by
> > > > making the unbind operation of ACPI cpu/memory devices to do the
> > > > unbinding only.  ACPI drivers no longer control cpu and memory as they
> > > > are supposed to be controlled by their drivers, cpu and memory modules.
> > > 
> > > I think that's the problem right there, solve that, please.
> > 
> > We cannot eliminate the ACPI drivers since we have to scan ACPI.  But we
> > can limit the ACPI drivers to do the scanning stuff only.   This is
> > precisely the intend of this patchset.  The real stuff, removing actual
> > devices, is done by the system device drivers/modules.
> 
> In case you haven't realized that yet, the $subject patchset has no future.

That's really disappointing, especially given that this basic approach
has been proven to work on other OSes for years...


> Let's just talk about how we can get what we need in more general terms.

So, are we heading to an approach of doing everything in ACPI?  I am not
clear on which direction we have agreed or disagreed.

As for the eject flag approach, I agree with Greg.


Thanks,
-Toshi
Toshi Kani Feb. 4, 2013, 10:13 p.m. UTC | #43
On Mon, 2013-02-04 at 21:07 +0100, Rafael J. Wysocki wrote:
> On Monday, February 04, 2013 06:33:52 AM Greg KH wrote:
> > On Mon, Feb 04, 2013 at 03:21:22PM +0100, Rafael J. Wysocki wrote:
> > > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote:
> > > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote:
> > > > > > Yes, but those are just remove events and we can only see how destructive they
> > > > > > were after the removal.  The point is to be able to figure out whether or not
> > > > > > we *want* to do the removal in the first place.
> > > > > > 
> > > > > > Say you have a computing node which signals a hardware problem in a processor
> > > > > > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > > > > > may want to eject that package, but you don't want to kill the system this
> > > > > > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > > > > > is not doable, you'd rather shut the box down and do the replacement afterward.
> > > > > > That may be costly, however (maybe weeks of computations), so it should be
> > > > > > avoided if possible, but not at the expense of crashing the box if the eject
> > > > > > doesn't work out.
> > > > > 
> > > > > It seems to me that we could handle that with the help of a new flag, say
> > > > > "no_eject", in struct device, a global mutex, and a function that will walk
> > > > > the given subtree of the device hierarchy and check if "no_eject" is set for
> > > > > any devices in there.  Plus a global "no_eject" switch, perhaps.
> > > > 
> > > > I think this will always be racy, or at worst, slow things down on
> > > > normal device operations as you will always be having to grab this flag
> > > > whenever you want to do something new.
> > > 
> > > I don't see why this particular scheme should be racy, at least I don't see any
> > > obvious races in it (although I'm not that good at races detection in general,
> > > admittedly).
> > > 
> > > Also, I don't expect that flag to be used for everything, just for things known
> > > to seriously break if forcible eject is done.  That may be not precise enough,
> > > so that's a matter of defining its purpose more precisely.
> > > 
> > > We can do something like that on the ACPI level (ie. introduce a no_eject flag
> > > in struct acpi_device and provide an iterface for the layers above ACPI to
> > > manipulate it) but then devices without ACPI namespace objects won't be
> > > covered.  That may not be a big deal, though.
> > > 
> > > So say dev is about to be used for something incompatible with ejecting, so to
> > > speak.  Then, one would do platform_lock_eject(dev), which would check if dev
> > > has an ACPI handle and then take acpi_eject_lock (if so).  The return value of
> > > platform_lock_eject(dev) would need to be checked to see if the device is not
> > > gone.  If it returns success (0), one would do something to the device and
> > > call platform_no_eject(dev) and then platform_unlock_eject(dev).
> > 
> > How does a device "know" it is doing something that is incompatible with
> > ejecting?  That's a non-trivial task from what I can tell.
> 
> I agree that this is complicated in general.  But.
> 
> There are devices known to have software "offline" and "online" operations
> such that after the "offline" the given device is guaranteed to be not used
> until "online".  We have that for CPU cores, for example, and user space can
> do it via /sys/devices/system/cpu/cpuX/online .  So, why don't we make the
> "online" set the no_eject flag (under the lock as appropriate) and the
> "offline" clear it?  And why don't we define such "online" and "offline" for
> all of the other "system" stuff, like memory, PCI host bridges etc. and make it
> behave analogously?
> 
> Then, it is quite simple to say which devices should use the no_eject flag:
> devices that have "online" and "offline" exported to user space.  And guess
> who's responsible for "offlining" all of those things before trying to eject
> them: user space is.  From the kernel's point of view it is all clear.  Hands
> clean. :-)
> 
> Now, there's a different problem how to expose all of the relevant information
> to user space so that it knows what to "offline" for the specific eject
> operation to succeed, but that's kind of separate and worth addressing
> anyway.

So, the idea is to run a user space program that off-lines all relevant
devices before trimming ACPI devices.  Is that right?  That sounds like
an idea worth considering.  It basically moves the "sequencer" part
into user space instead of the kernel space in my proposal.  I agree
that how to expose all of the relevant info to user space is an issue.
Also, we will need to make sure that the user program always runs on a
kernel request and then reports its result back to the kernel, so that
the kernel can do the rest of the operation and report the result to FW
with _OST or _EJ0.  This loop has to close.  I think it is going to be
more complicated than the kernel-only approach.

In addition, I am not sure if the "no_eject" flag in acpi_device is
really necessary here, since the user program will inform the kernel
whether all devices are off-line.  Also, the kernel will likely need to
expose the device info to the user program to tell it which devices
need to be off-lined.  At that point, the kernel already knows whether
there is any on-line device in the scope.
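
(For illustration only -- a minimal sketch of the scheme being discussed,
using the names proposed in this thread: a no_eject flag plus
platform_lock_eject()/platform_no_eject()/platform_unlock_eject().  None
of these are existing kernel interfaces, and the wrapper struct below
merely stands in for the field that would live in struct device or
struct acpi_device.)

/* Hypothetical sketch: the no_eject flag and the platform_*_eject()
 * helpers are proposals from this discussion, not existing kernel API. */
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(acpi_eject_lock);

struct eject_ctl {
	struct device *dev;	/* NULL once the device is gone */
	bool no_eject;		/* set on "online", cleared on "offline" */
};

/* Serialize against a concurrent eject; fail if the device went away. */
static int platform_lock_eject(struct eject_ctl *ec)
{
	mutex_lock(&acpi_eject_lock);
	if (!ec->dev) {
		mutex_unlock(&acpi_eject_lock);
		return -ENODEV;
	}
	return 0;
}

/* Mark the device as in use so that a later eject will bail out. */
static void platform_no_eject(struct eject_ctl *ec)
{
	ec->no_eject = true;
}

static void platform_unlock_eject(struct eject_ctl *ec)
{
	mutex_unlock(&acpi_eject_lock);
}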


> > What happens if a device wants to set that flag, right after it was told
> > to eject and the device was in the middle of being removed?  How can you
> > "fail" the "I can't be removed me now, so don't" requirement that it now
> > has?
> 
> This one is easy. :-)
> 
> If platform_lock_eject() is called when an eject is under way, it will block
> on acpi_eject_lock until the eject is complete and if the device is gone as
> a result of the eject, it will return an error code.

In this case, we really do need to make sure that the user program does
not get killed in the middle of its operation, since the kernel is
holding a lock while the operation is under way.


Thanks,
-Toshi


> In turn, if an eject happens after platform_lock_eject(), it will block until
> platform_unlock_eject() and if platform_no_eject() is called in between the
> lock and unlock, it will notice the device with no_eject set and bail out.
> 
> Quite obviously, it would be a bug to call platform_lock_eject() from within an
> eject code path.
> 
> Thanks,
> Rafael
> 
>
Rafael J. Wysocki Feb. 4, 2013, 11:19 p.m. UTC | #44
On Monday, February 04, 2013 01:34:18 PM Toshi Kani wrote:
> On Mon, 2013-02-04 at 21:12 +0100, Rafael J. Wysocki wrote:
> > On Monday, February 04, 2013 12:46:24 PM Toshi Kani wrote:
> > > On Mon, 2013-02-04 at 20:48 +0100, Rafael J. Wysocki wrote:
> > > > On Monday, February 04, 2013 09:02:46 AM Toshi Kani wrote:
> > > > > On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote:
> > > > > > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote:
> > > > > > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote:
> > > > > > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote:
> > > > >   :
> > > > > > > > Yes, but those are just remove events and we can only see how destructive they
> > > > > > > > were after the removal.  The point is to be able to figure out whether or not
> > > > > > > > we *want* to do the removal in the first place.
> > > > > > > 
> > > > > > > Yes, but, you will always race if you try to test to see if you can shut
> > > > > > > down a device and then trying to do it.  So walking the bus ahead of
> > > > > > > time isn't a good idea.
> > > > > > >
> > > > > > > And, we really don't have a viable way to recover if disconnect() fails,
> > > > > > > do we.  What do we do in that situation, restore the other devices we
> > > > > > > disconnected successfully?  How do we remember/know what they were?
> > > > > > > 
> > > > > > > PCI hotplug almost had this same problem until the designers finally
> > > > > > > realized that they just had to accept the fact that removing a PCI
> > > > > > > device could either happen by:
> > > > > > > 	- a user yanking out the device, at which time the OS better
> > > > > > > 	  clean up properly no matter what happens
> > > > > > > 	- the user asked nicely to remove a device, and the OS can take
> > > > > > > 	  as long as it wants to complete that action, including
> > > > > > > 	  stalling for noticable amounts of time before eventually,
> > > > > > > 	  always letting the action succeed.
> > > > > > > 
> > > > > > > I think the second thing is what you have to do here.  If a user tells
> > > > > > > the OS it wants to remove these devices, you better do it.  If you
> > > > > > > can't, because memory is being used by someone else, either move them
> > > > > > > off, or just hope that nothing bad happens, before the user gets
> > > > > > > frustrated and yanks out the CPU/memory module themselves physically :)
> > > > > > 
> > > > > > Well, that we can't help, but sometimes users really *want* the OS to tell them
> > > > > > if it is safe to unplug something at this particualr time (think about the
> > > > > > Windows' "safe remove" feature for USB sticks, for example; that came out of
> > > > > > users' demand AFAIR).
> > > > > > 
> > > > > > So in my opinion it would be good to give them an option to do "safe eject" or
> > > > > > "forcible eject", whichever they prefer.
> > > > > 
> > > > > For system device hot-plug, it always needs to be "safe eject".  This
> > > > > feature will be implemented on mission critical servers, which are
> > > > > managed by professional IT folks.  Crashing a server causes serious
> > > > > money to the business.
> > > > 
> > > > Well, "always" is a bit too strong a word as far as human behavior is concerned
> > > > in my opinion.
> > > > 
> > > > That said I would be perfectly fine with not supporting the "forcible eject" to
> > > > start with and waiting for the first request to add support for it.  I also
> > > > would be fine with taking bets on how much time it's going to take for such a
> > > > request to appear. :-)
> > > 
> > > Sounds good.  In my experience, though, it actually takes a LONG time to
> > > convince customers that "safe eject" is actually safe.  Enterprise
> > > customers are so afraid of doing anything risky that might cause the
> > > system to crash or hang due to some defect.  I would be very surprised
> > > to see a customer asking for a force operation when we do not guarantee
> > > its outcome.  I have not seen such enterprise customers yet.
> > 
> > But we're talking about a kernel that is supposed to run on mobile phones too,
> > among other things.
> 
> I think using this feature for RAS i.e. replacing a faulty device
> on-line, will continue to be limited for high-end systems.  For low-end
> systems, it does not make sense for customers to pay much $$ for this
> feature.  They can just shut the system down for replacement, or they
> can simply buy a new system instead of repairing.
> 
> That said, using this feature on VM for workload balancing does not
> require any special hardware.  So, I can see someone willing to try out
> to see how it goes with a force option on VM for personal use.   

Besides, SMP was a $$ "enterprise" feature not so long ago, so things tend to
change. :-)

Thanks,
Rafael
Rafael J. Wysocki Feb. 4, 2013, 11:23 p.m. UTC | #45
On Monday, February 04, 2013 01:59:27 PM Toshi Kani wrote:
> On Mon, 2013-02-04 at 20:45 +0100, Rafael J. Wysocki wrote:
> > On Monday, February 04, 2013 09:46:18 AM Toshi Kani wrote:
> > > On Mon, 2013-02-04 at 04:46 -0800, Greg KH wrote:
> > > > On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote:
> > > > > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote:
> > > > > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote:
> > > > > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> > > > > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
> > > > > > > >  > This is already done for PCI host bridges and platform devices and I don't
> > > > > > > > > > see why we can't do that for the other types of devices too.
> > > > > > > > > > 
> > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > > > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > > > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > > > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > > > > > > help from the driver core here.
> > > > > > > > > 
> > > > > > > > > There are three different approaches suggested for system device
> > > > > > > > > hot-plug:
> > > > > > > > >  A. Proceed within system device bus scan.
> > > > > > > > >  B. Proceed within ACPI bus scan.
> > > > > > > > >  C. Proceed with a sequence (as a mini-boot).
> > > > > > > > > 
> > > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as
> > > > > > > > > tokens, and option C uses resource tables as tokens, for their handlers.
> > > > > > > > > 
> > > > > > > > > Here is summary of key questions & answers so far.  I hope this
> > > > > > > > > clarifies why I am suggesting option 3.
> > > > > > > > > 
> > > > > > > > > 1. What are the system devices?
> > > > > > > > > System devices provide system-wide core computing resources, which are
> > > > > > > > > essential to compose a computer system.  System devices are not
> > > > > > > > > connected to any particular standard buses.
> > > > > > > > 
> > > > > > > > Not a problem, lots of devices are not connected to any "particular
> > > > > > > > standard busses".  All this means is that system devices are connected
> > > > > > > > to the "system" bus, nothing more.
> > > > > > > 
> > > > > > > Can you give me a few examples of other devices that support hotplug and
> > > > > > > are not connected to any particular buses?  I will investigate them to
> > > > > > > see how they are managed to support hotplug.
> > > > > > 
> > > > > > Any device that is attached to any bus in the driver model can be
> > > > > > hotunplugged from userspace by telling it to be "unbound" from the
> > > > > > driver controlling it.  Try it for any platform device in your system to
> > > > > > see how it happens.
> > > > > 
> > > > > The unbind operation, as I understand from you, is to detach a driver
> > > > > from a device.  Yes, unbinding can be done for any devices.  It is
> > > > > however different from hot-plug operation, which unplugs a device.
> > > > 
> > > > Physically, yes, but to the driver involved, and the driver core, there
> > > > is no difference.  That was one of the primary goals of the driver core
> > > > creation so many years ago.
> > > > 
> > > > > Today, the unbind operation to an ACPI cpu/memory devices causes
> > > > > hot-unplug (offline) operation to them, which is one of the major issues
> > > > > for us since unbind cannot fail.  This patchset addresses this issue by
> > > > > making the unbind operation of ACPI cpu/memory devices to do the
> > > > > unbinding only.  ACPI drivers no longer control cpu and memory as they
> > > > > are supposed to be controlled by their drivers, cpu and memory modules.
> > > > 
> > > > I think that's the problem right there, solve that, please.
> > > 
> > > We cannot eliminate the ACPI drivers since we have to scan ACPI.  But we
> > > can limit the ACPI drivers to do the scanning stuff only.   This is
> > > precisely the intend of this patchset.  The real stuff, removing actual
> > > devices, is done by the system device drivers/modules.
> > 
> > In case you haven't realized that yet, the $subject patchset has no future.
> 
> That's really disappointing, esp. the fact that this basic approach has
> been proven to work on other OS for years...
> 
> 
> > Let's just talk about how we can get what we need in more general terms.
> 
> So, are we heading to an approach of doing everything in ACPI?  I am not
> clear about which direction we have agreed with or disagreed with.
> 
> As for the eject flag approach, I agree with Greg.

Well, I'm not sure which of Greg's thoughts you agree with. :-)

Thanks,
Rafael
Toshi Kani Feb. 4, 2013, 11:33 p.m. UTC | #46
On Tue, 2013-02-05 at 00:23 +0100, Rafael J. Wysocki wrote:
> On Monday, February 04, 2013 01:59:27 PM Toshi Kani wrote:
> > On Mon, 2013-02-04 at 20:45 +0100, Rafael J. Wysocki wrote:
> > > On Monday, February 04, 2013 09:46:18 AM Toshi Kani wrote:
> > > > On Mon, 2013-02-04 at 04:46 -0800, Greg KH wrote:
> > > > > On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote:
> > > > > > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote:
> > > > > > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote:
> > > > > > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote:
> > > > > > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote:
> > > > > > > > >  > This is already done for PCI host bridges and platform devices and I don't
> > > > > > > > > > > see why we can't do that for the other types of devices too.
> > > > > > > > > > > 
> > > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e.
> > > > > > > > > > > when we try do eject a device at the top of a subtree and need to tear down
> > > > > > > > > > > the entire subtree below it, but if that's going to lead to a system crash,
> > > > > > > > > > > for example, we want to cancel the eject.  It seems to me that we'll need some
> > > > > > > > > > > help from the driver core here.
> > > > > > > > > > 
> > > > > > > > > > There are three different approaches suggested for system device
> > > > > > > > > > hot-plug:
> > > > > > > > > >  A. Proceed within system device bus scan.
> > > > > > > > > >  B. Proceed within ACPI bus scan.
> > > > > > > > > >  C. Proceed with a sequence (as a mini-boot).
> > > > > > > > > > 
> > > > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as
> > > > > > > > > > tokens, and option C uses resource tables as tokens, for their handlers.
> > > > > > > > > > 
> > > > > > > > > > Here is summary of key questions & answers so far.  I hope this
> > > > > > > > > > clarifies why I am suggesting option 3.
> > > > > > > > > > 
> > > > > > > > > > 1. What are the system devices?
> > > > > > > > > > System devices provide system-wide core computing resources, which are
> > > > > > > > > > essential to compose a computer system.  System devices are not
> > > > > > > > > > connected to any particular standard buses.
> > > > > > > > > 
> > > > > > > > > Not a problem, lots of devices are not connected to any "particular
> > > > > > > > > standard busses".  All this means is that system devices are connected
> > > > > > > > > to the "system" bus, nothing more.
> > > > > > > > 
> > > > > > > > Can you give me a few examples of other devices that support hotplug and
> > > > > > > > are not connected to any particular buses?  I will investigate them to
> > > > > > > > see how they are managed to support hotplug.
> > > > > > > 
> > > > > > > Any device that is attached to any bus in the driver model can be
> > > > > > > hotunplugged from userspace by telling it to be "unbound" from the
> > > > > > > driver controlling it.  Try it for any platform device in your system to
> > > > > > > see how it happens.
> > > > > > 
> > > > > > The unbind operation, as I understand from you, is to detach a driver
> > > > > > from a device.  Yes, unbinding can be done for any devices.  It is
> > > > > > however different from hot-plug operation, which unplugs a device.
> > > > > 
> > > > > Physically, yes, but to the driver involved, and the driver core, there
> > > > > is no difference.  That was one of the primary goals of the driver core
> > > > > creation so many years ago.
> > > > > 
> > > > > > Today, the unbind operation to an ACPI cpu/memory devices causes
> > > > > > hot-unplug (offline) operation to them, which is one of the major issues
> > > > > > for us since unbind cannot fail.  This patchset addresses this issue by
> > > > > > making the unbind operation of ACPI cpu/memory devices to do the
> > > > > > unbinding only.  ACPI drivers no longer control cpu and memory as they
> > > > > > are supposed to be controlled by their drivers, cpu and memory modules.
> > > > > 
> > > > > I think that's the problem right there, solve that, please.
> > > > 
> > > > We cannot eliminate the ACPI drivers since we have to scan ACPI.  But we
> > > > can limit the ACPI drivers to do the scanning stuff only.   This is
> > > > precisely the intend of this patchset.  The real stuff, removing actual
> > > > devices, is done by the system device drivers/modules.
> > > 
> > > In case you haven't realized that yet, the $subject patchset has no future.
> > 
> > That's really disappointing, esp. the fact that this basic approach has
> > been proven to work on other OS for years...
> > 
> > 
> > > Let's just talk about how we can get what we need in more general terms.
> > 
> > So, are we heading to an approach of doing everything in ACPI?  I am not
> > clear about which direction we have agreed with or disagreed with.
> > 
> > As for the eject flag approach, I agree with Greg.
> 
> Well, I'm not sure which of the Greg's thoughts you agree with. :-)

Sorry, that was Greg's comment below.  But then I saw your other email
clarifying that the no_eject flag only reflects online/offline status,
not how the device is being used.  So I replied with my thoughts in a
separate email. :)

===
How does a device "know" it is doing something that is incompatible with
ejecting?  That's a non-trivial task from what I can tell.
===

Thanks,
-Toshi
Rafael J. Wysocki Feb. 4, 2013, 11:52 p.m. UTC | #47
On Monday, February 04, 2013 03:13:29 PM Toshi Kani wrote:
> On Mon, 2013-02-04 at 21:07 +0100, Rafael J. Wysocki wrote:
> > On Monday, February 04, 2013 06:33:52 AM Greg KH wrote:
> > > On Mon, Feb 04, 2013 at 03:21:22PM +0100, Rafael J. Wysocki wrote:
> > > > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote:
> > > > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote:
> > > > > > > Yes, but those are just remove events and we can only see how destructive they
> > > > > > > were after the removal.  The point is to be able to figure out whether or not
> > > > > > > we *want* to do the removal in the first place.
> > > > > > > 
> > > > > > > Say you have a computing node which signals a hardware problem in a processor
> > > > > > > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > > > > > > may want to eject that package, but you don't want to kill the system this
> > > > > > > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > > > > > > is not doable, you'd rather shut the box down and do the replacement afterward.
> > > > > > > That may be costly, however (maybe weeks of computations), so it should be
> > > > > > > avoided if possible, but not at the expense of crashing the box if the eject
> > > > > > > doesn't work out.
> > > > > > 
> > > > > > It seems to me that we could handle that with the help of a new flag, say
> > > > > > "no_eject", in struct device, a global mutex, and a function that will walk
> > > > > > the given subtree of the device hierarchy and check if "no_eject" is set for
> > > > > > any devices in there.  Plus a global "no_eject" switch, perhaps.
> > > > > 
> > > > > I think this will always be racy, or at worst, slow things down on
> > > > > normal device operations as you will always be having to grab this flag
> > > > > whenever you want to do something new.
> > > > 
> > > > I don't see why this particular scheme should be racy, at least I don't see any
> > > > obvious races in it (although I'm not that good at races detection in general,
> > > > admittedly).
> > > > 
> > > > Also, I don't expect that flag to be used for everything, just for things known
> > > > to seriously break if forcible eject is done.  That may be not precise enough,
> > > > so that's a matter of defining its purpose more precisely.
> > > > 
> > > > We can do something like that on the ACPI level (ie. introduce a no_eject flag
> > > > in struct acpi_device and provide an iterface for the layers above ACPI to
> > > > manipulate it) but then devices without ACPI namespace objects won't be
> > > > covered.  That may not be a big deal, though.
> > > > 
> > > > So say dev is about to be used for something incompatible with ejecting, so to
> > > > speak.  Then, one would do platform_lock_eject(dev), which would check if dev
> > > > has an ACPI handle and then take acpi_eject_lock (if so).  The return value of
> > > > platform_lock_eject(dev) would need to be checked to see if the device is not
> > > > gone.  If it returns success (0), one would do something to the device and
> > > > call platform_no_eject(dev) and then platform_unlock_eject(dev).
> > > 
> > > How does a device "know" it is doing something that is incompatible with
> > > ejecting?  That's a non-trivial task from what I can tell.
> > 
> > I agree that this is complicated in general.  But.
> > 
> > There are devices known to have software "offline" and "online" operations
> > such that after the "offline" the given device is guaranteed to be not used
> > until "online".  We have that for CPU cores, for example, and user space can
> > do it via /sys/devices/system/cpu/cpuX/online .  So, why don't we make the
> > "online" set the no_eject flag (under the lock as appropriate) and the
> > "offline" clear it?  And why don't we define such "online" and "offline" for
> > all of the other "system" stuff, like memory, PCI host bridges etc. and make it
> > behave analogously?
> > 
> > Then, it is quite simple to say which devices should use the no_eject flag:
> > devices that have "online" and "offline" exported to user space.  And guess
> > who's responsible for "offlining" all of those things before trying to eject
> > them: user space is.  From the kernel's point of view it is all clear.  Hands
> > clean. :-)
> > 
> > Now, there's a different problem how to expose all of the relevant information
> > to user space so that it knows what to "offline" for the specific eject
> > operation to succeed, but that's kind of separate and worth addressing
> > anyway.
> 
> So, the idea is to run a user space program that off-lines all relevant
> devices before trimming ACPI devices.  Is that right?  That sounds like
> a worth idea to consider with.  This basically moves the "sequencer"
> part into user space instead of the kernel space in my proposal.  I
> agree that how to expose all of the relevant info to user space is an
> issue.  Also, we will need to make sure that the user program always
> runs per a kernel request and then informs a result back to the kernel,
> so that the kernel can do the rest of an operation and inform a result
> to FW with _OST or _EJ0.  This loop has to close.  I think it is going
> to be more complicated than the kernel-only approach.

I actually didn't think about that.  The point is that trying to offline
everything *synchronously* may just be pointless, because it may be
offlined upfront, before the eject is even requested.  So the sequence
would be to first offline things that we'll want to eject from user space
and then to send the eject request (e.g. via sysfs too).
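
(Roughly, that sequence might look like the user-space sketch below:
offline first, then request the eject via sysfs.  The paths are only
examples -- cpu1 and the ACPI container name ACPI0004:00 are assumptions,
and error handling is minimal.)

/* Illustrative user-space sketch: offline a CPU, then request the eject
 * of its container via sysfs.  Paths and device names are examples. */
#include <stdio.h>

static int sysfs_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	/* 1. Offline the devices in the scope of the planned eject. */
	if (sysfs_write("/sys/devices/system/cpu/cpu1/online", "0"))
		return 1;

	/* 2. Then send the eject request for the containing device. */
	if (sysfs_write("/sys/bus/acpi/devices/ACPI0004:00/eject", "1"))
		return 1;

	return 0;
}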

Eject requests from eject buttons and things like that may just fail if
some of the components involved, which should be offline, are still
online.  The fact that we might be able to offline them synchronously
if we tried doesn't matter, pretty much as it doesn't matter for
hot-swappable disks.

You'd probably never try to hot-remove a disk before unmounting filesystems
mounted from it or failing it as a RAID component and nobody sane wants the
kernel to do things like that automatically when the user presses the eject
button.  In my opinion we should treat memory eject, or CPU package eject, or
PCI host bridge eject in exactly the same way: Don't eject if it is not
prepared for ejecting in the first place.

And if you think about it, that makes things *massively* simpler, because now
the kernel doesn't need to worry about all of those "synchronous removal"
scenarios that very well may involve every single device in the system, and
the whole problem is nicely split into several separate "implement
offline/online" problems that are subsystem-specific and a single
"eject if everything relevant is offline" problem which is kind of trivial.
Plus the one of exposing information to user space, which is separate too.
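
(Again only a sketch: the "eject if everything relevant is offline" check
could be little more than a recursive walk over the subtree, here using
the existing device_for_each_child() iterator.  shp_dev_is_offline() is a
hypothetical, subsystem-provided predicate, not an existing helper.)

/* Sketch: refuse an eject unless the whole subtree has been offlined. */
#include <linux/device.h>
#include <linux/errno.h>

static bool shp_dev_is_offline(struct device *dev)
{
	/* Placeholder: each subsystem (CPU, memory, ...) would report its
	 * own software offline state here, cf. cpuX/online in sysfs. */
	return false;
}

static int shp_check_offline(struct device *dev, void *data)
{
	if (!shp_dev_is_offline(dev))
		return -EBUSY;	/* something in the scope is still online */

	return device_for_each_child(dev, NULL, shp_check_offline);
}

/* "Eject if everything relevant is offline", otherwise bail out. */
static int shp_try_eject(struct device *top)
{
	int ret = shp_check_offline(top, NULL);

	if (ret)
		return ret;

	/* ... proceed with the actual removal and _EJ0/_OST here ... */
	return 0;
}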

Now, each of them can be worked on separately, *tested* separately and
debugged separately if need be and it is much easier to isolate failures
and so on.

> In addition, I am not sure if the "no_eject" flag in acpi_device is
> really necessary here since the user program will inform the kernel if
> all devices are off-line.  Also, the kernel will likely need to expose
> the device info to the user program to tell which devices need to be
> off-lined.  At that time, the kernel already knows if there is any
> on-line device in the scope.

Well, that depends on what "the kernel" means and how it knows that.  Surely
the "online" components have to be marked somehow so that it is easy to check
whether they are in the scope in a subsystem-independent way, so why don't we
use something like the no_eject flag for that?

Rafael
Greg Kroah-Hartman Feb. 5, 2013, 12:04 a.m. UTC | #48
On Tue, Feb 05, 2013 at 12:52:30AM +0100, Rafael J. Wysocki wrote:
> You'd probably never try to hot-remove a disk before unmounting filesystems
> mounted from it or failing it as a RAID component and nobody sane wants the
> kernel to do things like that automatically when the user presses the eject
> button.  In my opinion we should treat memory eject, or CPU package eject, or
> PCI host bridge eject in exactly the same way: Don't eject if it is not
> prepared for ejecting in the first place.

Bad example, we have disks hot-removed all the time without any
filesystems being unmounted, and have supported this since the 2.2 days
(although we didn't get it "right" until 2.6.)

PCI host bridge eject is the same as PCI eject today: the user asks us
to do it, and we cannot stop it from happening.  We can also have them
removed without being told about it in the first place, and can
properly clean up from it all.

> And if you think about it, that makes things *massively* simpler, because now
> the kernel doesn't heed to worry about all of those "synchronous removal"
> scenarions that very well may involve every single device in the system and
> the whole problem is nicely split into several separate "implement
> offline/online" problems that are subsystem-specific and a single
> "eject if everything relevant is offline" problem which is kind of trivial.
> Plus the one of exposing information to user space, which is separate too.
> 
> Now, each of them can be worked on separately, *tested* separately and
> debugged separately if need be and it is much easier to isolate failures
> and so on.

So you are agreeing with me in that we can not fail hot removing any
device, nice :)

greg k-h
Toshi Kani Feb. 5, 2013, 12:55 a.m. UTC | #49
On Tue, 2013-02-05 at 00:52 +0100, Rafael J. Wysocki wrote:
> On Monday, February 04, 2013 03:13:29 PM Toshi Kani wrote:
> > On Mon, 2013-02-04 at 21:07 +0100, Rafael J. Wysocki wrote:
> > > On Monday, February 04, 2013 06:33:52 AM Greg KH wrote:
> > > > On Mon, Feb 04, 2013 at 03:21:22PM +0100, Rafael J. Wysocki wrote:
> > > > > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote:
> > > > > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote:
> > > > > > > > Yes, but those are just remove events and we can only see how destructive they
> > > > > > > > were after the removal.  The point is to be able to figure out whether or not
> > > > > > > > we *want* to do the removal in the first place.
> > > > > > > > 
> > > > > > > > Say you have a computing node which signals a hardware problem in a processor
> > > > > > > > package (the container with CPU cores, memory, PCI host bridge etc.).  You
> > > > > > > > may want to eject that package, but you don't want to kill the system this
> > > > > > > > way.  So if the eject is doable, it is very much desirable to do it, but if it
> > > > > > > > is not doable, you'd rather shut the box down and do the replacement afterward.
> > > > > > > > That may be costly, however (maybe weeks of computations), so it should be
> > > > > > > > avoided if possible, but not at the expense of crashing the box if the eject
> > > > > > > > doesn't work out.
> > > > > > > 
> > > > > > > It seems to me that we could handle that with the help of a new flag, say
> > > > > > > "no_eject", in struct device, a global mutex, and a function that will walk
> > > > > > > the given subtree of the device hierarchy and check if "no_eject" is set for
> > > > > > > any devices in there.  Plus a global "no_eject" switch, perhaps.
> > > > > > 
> > > > > > I think this will always be racy, or at worst, slow things down on
> > > > > > normal device operations as you will always be having to grab this flag
> > > > > > whenever you want to do something new.
> > > > > 
> > > > > I don't see why this particular scheme should be racy, at least I don't see any
> > > > > obvious races in it (although I'm not that good at races detection in general,
> > > > > admittedly).
> > > > > 
> > > > > Also, I don't expect that flag to be used for everything, just for things known
> > > > > to seriously break if forcible eject is done.  That may be not precise enough,
> > > > > so that's a matter of defining its purpose more precisely.
> > > > > 
> > > > > We can do something like that on the ACPI level (ie. introduce a no_eject flag
> > > > > in struct acpi_device and provide an iterface for the layers above ACPI to
> > > > > manipulate it) but then devices without ACPI namespace objects won't be
> > > > > covered.  That may not be a big deal, though.
> > > > > 
> > > > > So say dev is about to be used for something incompatible with ejecting, so to
> > > > > speak.  Then, one would do platform_lock_eject(dev), which would check if dev
> > > > > has an ACPI handle and then take acpi_eject_lock (if so).  The return value of
> > > > > platform_lock_eject(dev) would need to be checked to see if the device is not
> > > > > gone.  If it returns success (0), one would do something to the device and
> > > > > call platform_no_eject(dev) and then platform_unlock_eject(dev).
> > > > 
> > > > How does a device "know" it is doing something that is incompatible with
> > > > ejecting?  That's a non-trivial task from what I can tell.
> > > 
> > > I agree that this is complicated in general.  But.
> > > 
> > > There are devices known to have software "offline" and "online" operations
> > > such that after the "offline" the given device is guaranteed to be not used
> > > until "online".  We have that for CPU cores, for example, and user space can
> > > do it via /sys/devices/system/cpu/cpuX/online .  So, why don't we make the
> > > "online" set the no_eject flag (under the lock as appropriate) and the
> > > "offline" clear it?  And why don't we define such "online" and "offline" for
> > > all of the other "system" stuff, like memory, PCI host bridges etc. and make it
> > > behave analogously?
> > > 
> > > Then, it is quite simple to say which devices should use the no_eject flag:
> > > devices that have "online" and "offline" exported to user space.  And guess
> > > who's responsible for "offlining" all of those things before trying to eject
> > > them: user space is.  From the kernel's point of view it is all clear.  Hands
> > > clean. :-)
> > > 
> > > Now, there's a different problem how to expose all of the relevant information
> > > to user space so that it knows what to "offline" for the specific eject
> > > operation to succeed, but that's kind of separate and worth addressing
> > > anyway.
> > 
> > So, the idea is to run a user space program that off-lines all relevant
> > devices before trimming ACPI devices.  Is that right?  That sounds like
> > a worth idea to consider with.  This basically moves the "sequencer"
> > part into user space instead of the kernel space in my proposal.  I
> > agree that how to expose all of the relevant info to user space is an
> > issue.  Also, we will need to make sure that the user program always
> > runs per a kernel request and then informs a result back to the kernel,
> > so that the kernel can do the rest of an operation and inform a result
> > to FW with _OST or _EJ0.  This loop has to close.  I think it is going
> > to be more complicated than the kernel-only approach.
> 
> I actually didn't think about that.  The point is that trying to offline
> everything *synchronously* may just be pointless, because it may be
> offlined upfront, before the eject is even requested.  So the sequence
> would be to first offline things that we'll want to eject from user space
> and then to send the eject request (e.g. via sysfs too).
> 
> Eject requests from eject buttons and things like that may just fail if
> some components involved that should be offline are online.  The fact that
> we might be able to offline them synchronously if we tried doesn't matter,
> pretty much as it doesn't matter for hot-swappable disks.
> 
> You'd probably never try to hot-remove a disk before unmounting filesystems
> mounted from it or failing it as a RAID component and nobody sane wants the
> kernel to do things like that automatically when the user presses the eject
> button.  In my opinion we should treat memory eject, or CPU package eject, or
> PCI host bridge eject in exactly the same way: Don't eject if it is not
> prepared for ejecting in the first place.
> 
> And if you think about it, that makes things *massively* simpler, because now
> the kernel doesn't heed to worry about all of those "synchronous removal"
> scenarions that very well may involve every single device in the system and
> the whole problem is nicely split into several separate "implement
> offline/online" problems that are subsystem-specific and a single
> "eject if everything relevant is offline" problem which is kind of trivial.
> Plus the one of exposing information to user space, which is separate too.

Oh, I see.  Yes, it certainly makes things much simpler.  It puts more
burden on the user, but that could be solved with proper tools.  I
totally agree that I/O devices should be removed beforehand.  For CPUs
and memory, asking a user to find the right set of devices to off-line
would be a poor experience, but this too could be addressed with proper
tools.  I think we need to check whether a memory block (the unit of
sysfs memory online/offline) and an ACPI memory object actually
correspond nicely.  But at a high level, this sounds like a workable plan.


> Now, each of them can be worked on separately, *tested* separately and
> debugged separately if need be and it is much easier to isolate failures
> and so on.

Right, but that is also the case with "synchronous removal" as long as
we have the sysfs online interface.  The difference is that this
approach only supports the sysfs interface for off-lining.


> > In addition, I am not sure if the "no_eject" flag in acpi_device is
> > really necessary here since the user program will inform the kernel if
> > all devices are off-line.  Also, the kernel will likely need to expose
> > the device info to the user program to tell which devices need to be
> > off-lined.  At that time, the kernel already knows if there is any
> > on-line device in the scope.
> 
> Well, that depends no what "the kernel" means and how it knows that.  Surely
> the "online" components have to be marked somehow so that it is easy to check
> if they are in the scope in the subsystem-independent way, so why don't we use
> something like the no_eject flag for that?

Yes, I see your point.  My previous comment assumed that the kernel
would have to obtain the device info and tell a user program to
off-line the devices.  In that case, I thought we would have to walk
through the actual device tree and check online/offline state anyway.
But since we are not doing anything like that, having the flag in
acpi_device seems to be a reasonable way to avoid dealing with the
actual device tree.


Thanks,
-Toshi
Rafael J. Wysocki Feb. 5, 2013, 1:02 a.m. UTC | #50
On Monday, February 04, 2013 04:04:47 PM Greg KH wrote:
> On Tue, Feb 05, 2013 at 12:52:30AM +0100, Rafael J. Wysocki wrote:
> > You'd probably never try to hot-remove a disk before unmounting filesystems
> > mounted from it or failing it as a RAID component and nobody sane wants the
> > kernel to do things like that automatically when the user presses the eject
> > button.  In my opinion we should treat memory eject, or CPU package eject, or
> > PCI host bridge eject in exactly the same way: Don't eject if it is not
> > prepared for ejecting in the first place.
> 
> Bad example, we have disks hot-removed all the time without any
> filesystems being unmounted, and have supported this since the 2.2 days
> (although we didn't get it "right" until 2.6.)

Well, that wasn't my point.

My point was that we have tools for unmounting filesystems from disks that
the user wants to hot-remove and the user is supposed to use those tools
before hot-removing the disks.  At least I wouldn't recommend anyone to
do otherwise. :-)

Now, for memory hot-removal we don't have anything like that, as far as I
can tell, so my point was: why don't we add a memory "offline" that can be
done and tested separately from hot-removal and use that before we go and
hot-remove stuff?  And analogously for PCI host bridges etc.?

[Now, there's a question whether an "eject" button on the system case, if
there is one, should *always* cause the eject to happen even though things are
not "offline".  In my opinion, not necessarily, because users may not be aware
that they are doing something wrong.

Quite analogously, does the power button always cause the system to shut down?
No.  So why the heck should an eject button always cause an eject to happen?
I see no reason.

That said, the most straightforward approach may be simply to let user space
disable eject events for specific devices when it wants and only enable them
when it knows that the given devices are ready for removal.

But I'm digressing.]

> PCI Host bridge eject is the same as PCI eject today, the user asks us
> to do it, and we can not fail it from happening.  We also can have them
> removed without us being told about it in the first place, and can
> properly clean up from it all.

Well, are you sure we'll always clean up?  I kind of have my doubts. :-)

> > And if you think about it, that makes things *massively* simpler, because now
> > the kernel doesn't heed to worry about all of those "synchronous removal"
> > scenarions that very well may involve every single device in the system and
> > the whole problem is nicely split into several separate "implement
> > offline/online" problems that are subsystem-specific and a single
> > "eject if everything relevant is offline" problem which is kind of trivial.
> > Plus the one of exposing information to user space, which is separate too.
> > 
> > Now, each of them can be worked on separately, *tested* separately and
> > debugged separately if need be and it is much easier to isolate failures
> > and so on.
> 
> So you are agreeing with me in that we can not fail hot removing any
> device, nice :)

That depends on how you define hot-removing.  If you regard the "offline"
as a separate operation that can be carried out independently and hot-remove
as the last step causing the device to actually go away, then I agree that
it can't fail.  The "offline" itself, however, is a different matter (pretty
much like unmounting a file system).

Thanks,
Rafael
Rafael J. Wysocki Feb. 5, 2013, 11:11 a.m. UTC | #51
On Monday, February 04, 2013 04:04:47 PM Greg KH wrote:
> On Tue, Feb 05, 2013 at 12:52:30AM +0100, Rafael J. Wysocki wrote:
> > You'd probably never try to hot-remove a disk before unmounting filesystems
> > mounted from it or failing it as a RAID component and nobody sane wants the
> > kernel to do things like that automatically when the user presses the eject
> > button.  In my opinion we should treat memory eject, or CPU package eject, or
> > PCI host bridge eject in exactly the same way: Don't eject if it is not
> > prepared for ejecting in the first place.
> 
> Bad example, we have disks hot-removed all the time without any
> filesystems being unmounted, and have supported this since the 2.2 days
> (although we didn't get it "right" until 2.6.)

I actually don't think it is really bad, because it exposes the problem nicely.

Namely, there are two arguments that can be made here.  The first one is the
usability argument: Users should always be allowed to do what they want,
because it is [explicit content] annoying if software pretends to know better
what to do than the user (it is a convenience argument too, because usually
it's *easier* to allow users to do what they want).  The second one is the
data integrity argument: Operations that may lead to data loss should never
be carried out, because it is [explicit content] disappointing to lose valuable
stuff by a stupid mistake if software allows that mistake to be made (that also
may be costly in terms of real money).

You seem to believe that we should always follow the usability argument, while
Toshi seems to be thinking that (at least in the case of the "system" devices),
the data integrity argument is more important.  They are both valid arguments,
however, and they are in conflict, so this is a matter of balance.

You're saying that in the case of disks we always follow the usability argument
entirely.  I'm fine with that, although I suspect that some people may not be
considering this as the right balance.

Toshi seems to be thinking that for the hotplug of memory/CPUs/host bridges we
should always follow the data integrity argument entirely, because the users of
that feature value their data so much that they pretty much don't care about
usability.  That very well may be the case, so I'm fine with that too, although
I'm sure there are people who'll argue that this is not the right balance
either.

Now, the point is that we *can* do what Toshi is arguing for and that doesn't
seem to be overly complicated, so my question is: Why don't we do that, at
least to start with?  If it turns out eventually that the users care about
usability too, after all, we can add a switch to adjust things more to their
liking.  Still, we can very well do that later.

Thanks,
Rafael
Greg Kroah-Hartman Feb. 5, 2013, 6:39 p.m. UTC | #52
On Tue, Feb 05, 2013 at 12:11:17PM +0100, Rafael J. Wysocki wrote:
> On Monday, February 04, 2013 04:04:47 PM Greg KH wrote:
> > On Tue, Feb 05, 2013 at 12:52:30AM +0100, Rafael J. Wysocki wrote:
> > > You'd probably never try to hot-remove a disk before unmounting filesystems
> > > mounted from it or failing it as a RAID component and nobody sane wants the
> > > kernel to do things like that automatically when the user presses the eject
> > > button.  In my opinion we should treat memory eject, or CPU package eject, or
> > > PCI host bridge eject in exactly the same way: Don't eject if it is not
> > > prepared for ejecting in the first place.
> > 
> > Bad example, we have disks hot-removed all the time without any
> > filesystems being unmounted, and have supported this since the 2.2 days
> > (although we didn't get it "right" until 2.6.)
> 
> I actually don't think it is really bad, because it exposes the problem nicely.
> 
> Namely, there are two arguments that can be made here.  The first one is the
> usability argument: Users should always be allowed to do what they want,
> because it is [explicit content] annoying if software pretends to know better
> what to do than the user (it is a convenience argument too, because usually
> it's *easier* to allow users to do what they want).  The second one is the
> data integrity argument: Operations that may lead to data loss should never
> be carried out, because it is [explicit content] disappointing to lose valuable
> stuff by a stupid mistake if software allows that mistake to be made (that also
> may be costly in terms of real money).
> 
> You seem to believe that we should always follow the usability argument, while
> Toshi seems to be thinking that (at least in the case of the "system" devices),
> the data integrity argument is more important.  They are both valid arguments,
> however, and they are in conflict, so this is a matter of balance.
> 
> You're saying that in the case of disks we always follow the usability argument
> entirely.  I'm fine with that, although I suspect that some people may not be
> considering this as the right balance.
> 
> Toshi seems to be thinking that for the hotplug of memory/CPUs/host bridges we
> should always follow the data integrity argument entirely, because the users of
> that feature value their data so much that they pretty much don't care about
> usability.  That very well may be the case, so I'm fine with that too, although
> I'm sure there are people who'll argue that this is not the right balance
> either.
> 
> Now, the point is that we *can* do what Toshi is arguing for and that doesn't
> seem to be overly complicated, so my question is: Why don't we do that, at
> least to start with?  If it turns out eventually that the users care about
> usability too, after all, we can add a switch to adjust things more to their
> liking.  Still, we can very well do that later.

Ok, I'd much rather deal with reviewing actual implementations than
talking about theory at this point in time, so let's see what you all
can come up with next and I'll be glad to review it.

thanks,

greg k-h
Rafael J. Wysocki Feb. 5, 2013, 9:13 p.m. UTC | #53
On Tuesday, February 05, 2013 10:39:48 AM Greg KH wrote:
> On Tue, Feb 05, 2013 at 12:11:17PM +0100, Rafael J. Wysocki wrote:
> > On Monday, February 04, 2013 04:04:47 PM Greg KH wrote:
> > > On Tue, Feb 05, 2013 at 12:52:30AM +0100, Rafael J. Wysocki wrote:
> > > > You'd probably never try to hot-remove a disk before unmounting filesystems
> > > > mounted from it or failing it as a RAID component and nobody sane wants the
> > > > kernel to do things like that automatically when the user presses the eject
> > > > button.  In my opinion we should treat memory eject, or CPU package eject, or
> > > > PCI host bridge eject in exactly the same way: Don't eject if it is not
> > > > prepared for ejecting in the first place.
> > > 
> > > Bad example, we have disks hot-removed all the time without any
> > > filesystems being unmounted, and have supported this since the 2.2 days
> > > (although we didn't get it "right" until 2.6.)
> > 
> > I actually don't think it is really bad, because it exposes the problem nicely.
> > 
> > Namely, there are two arguments that can be made here.  The first one is the
> > usability argument: Users should always be allowed to do what they want,
> > because it is [explicit content] annoying if software pretends to know better
> > what to do than the user (it is a convenience argument too, because usually
> > it's *easier* to allow users to do what they want).  The second one is the
> > data integrity argument: Operations that may lead to data loss should never
> > be carried out, because it is [explicit content] disappointing to lose valuable
> > stuff by a stupid mistake if software allows that mistake to be made (that also
> > may be costly in terms of real money).
> > 
> > You seem to believe that we should always follow the usability argument, while
> > Toshi seems to be thinking that (at least in the case of the "system" devices),
> > the data integrity argument is more important.  They are both valid arguments,
> > however, and they are in conflict, so this is a matter of balance.
> > 
> > You're saying that in the case of disks we always follow the usability argument
> > entirely.  I'm fine with that, although I suspect that some people may not be
> > considering this as the right balance.
> > 
> > Toshi seems to be thinking that for the hotplug of memory/CPUs/host bridges we
> > should always follow the data integrity argument entirely, because the users of
> > that feature value their data so much that they pretty much don't care about
> > usability.  That very well may be the case, so I'm fine with that too, although
> > I'm sure there are people who'll argue that this is not the right balance
> > either.
> > 
> > Now, the point is that we *can* do what Toshi is arguing for and that doesn't
> > seem to be overly complicated, so my question is: Why don't we do that, at
> > least to start with?  If it turns out eventually that the users care about
> > usability too, after all, we can add a switch to adjust things more to their
> > liking.  Still, we can very well do that later.
> 
> Ok, I'd much rather deal with reviewing actual implementations than
> talking about theory at this point in time, so let's see what you all
> can come up with next and I'll be glad to review it.

Sure, thanks a lot for your comments so far!

Rafael
diff mbox

Patch

diff --git a/include/linux/sys_hotplug.h b/include/linux/sys_hotplug.h
new file mode 100644
index 0000000..86674dd
--- /dev/null
+++ b/include/linux/sys_hotplug.h
@@ -0,0 +1,181 @@ 
+/*
+ * sys_hotplug.h - System device hot-plug framework
+ *
+ * Copyright (C) 2012 Hewlett-Packard Development Company, L.P.
+ *	Toshi Kani <toshi.kani@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_SYS_HOTPLUG_H
+#define _LINUX_SYS_HOTPLUG_H
+
+#include <linux/list.h>
+#include <linux/device.h>
+
+/*
+ * System device hot-plug operation proceeds in the following order.
+ *   Validate phase -> Execute phase -> Commit phase
+ *
+ * The order values below define the calling sequence of platform
+ * neutral handlers for each phase in ascending order.  The order
+ * values of firmware-specific handlers are defined in sys_hotplug.h
+ * under firmware specific directories.
+ */
+
+/* All order values must be smaller than this value */
+#define SHP_ORDER_MAX				0xffffff
+
+/* Add Validate order values */
+
+/* Add Execute order values */
+#define SHP_MEM_ADD_EXECUTE_ORDER		100
+#define SHP_CPU_ADD_EXECUTE_ORDER		110
+
+/* Add Commit order values */
+
+/* Delete Validate order values */
+#define SHP_CPU_DEL_VALIDATE_ORDER		100
+#define SHP_MEM_DEL_VALIDATE_ORDER		110
+
+/* Delete Execute order values */
+#define SHP_CPU_DEL_EXECUTE_ORDER		10
+#define SHP_MEM_DEL_EXECUTE_ORDER		20
+
+/* Delete Commit order values */
+
+/*
+ * Hot-plug request types
+ */
+#define SHP_REQ_ADD		0x000000
+#define SHP_REQ_DELETE		0x000001
+#define SHP_REQ_MASK		0x0000ff
+
+/*
+ * Hot-plug phase types
+ */
+#define SHP_PH_VALIDATE		0x000000
+#define SHP_PH_EXECUTE		0x000100
+#define SHP_PH_COMMIT		0x000200
+#define SHP_PH_MASK		0x00ff00
+
+/*
+ * Hot-plug operation types
+ */
+#define SHP_OP_HOTPLUG		0x000000
+#define SHP_OP_ONLINE		0x010000
+#define SHP_OP_MASK		0xff0000
+
+/*
+ * Hot-plug phases
+ */
+enum shp_phase {
+	SHP_ADD_VALIDATE	= (SHP_REQ_ADD|SHP_PH_VALIDATE),
+	SHP_ADD_EXECUTE		= (SHP_REQ_ADD|SHP_PH_EXECUTE),
+	SHP_ADD_COMMIT		= (SHP_REQ_ADD|SHP_PH_COMMIT),
+	SHP_DEL_VALIDATE	= (SHP_REQ_DELETE|SHP_PH_VALIDATE),
+	SHP_DEL_EXECUTE		= (SHP_REQ_DELETE|SHP_PH_EXECUTE),
+	SHP_DEL_COMMIT		= (SHP_REQ_DELETE|SHP_PH_COMMIT)
+};
+
+/*
+ * Hot-plug operations
+ */
+enum shp_operation {
+	SHP_HOTPLUG_ADD		= (SHP_OP_HOTPLUG|SHP_REQ_ADD),
+	SHP_HOTPLUG_DEL		= (SHP_OP_HOTPLUG|SHP_REQ_DELETE),
+	SHP_ONLINE_ADD		= (SHP_OP_ONLINE|SHP_REQ_ADD),
+	SHP_ONLINE_DEL		= (SHP_OP_ONLINE|SHP_REQ_DELETE)
+};
+
+/*
+ * Hot-plug device classes
+ */
+enum shp_class {
+	SHP_CLS_INVALID		= 0,
+	SHP_CLS_CPU		= 1,
+	SHP_CLS_MEMORY		= 2,
+	SHP_CLS_HOSTBRIDGE	= 3,
+	SHP_CLS_CONTAINER	= 4,
+};
+
+/*
+ * Hot-plug device information
+ */
+union shp_dev_info {
+	struct shp_cpu {
+		u32		cpu_id;
+	} cpu;
+
+	struct shp_memory {
+		int		node;
+		u64		start_addr;
+		u64		length;
+	} mem;
+
+	struct shp_hostbridge {
+	} hb;
+
+	struct shp_node {
+	} node;
+};
+
+struct shp_device {
+	struct list_head	list;
+	struct device		*device;
+	enum shp_class		class;
+	union shp_dev_info	info;
+};
+
+/*
+ * Hot-plug request
+ */
+struct shp_request {
+	/* common info */
+	enum shp_operation	operation;	/* operation */
+
+	/* hot-plug event info: only valid for hot-plug operations */
+	void			*handle;	/* FW handle */
+	u32			event;		/* FW event */
+
+	/* device resource info */
+	struct list_head	dev_list;	/* shp_device list */
+};
+
+/*
+ * Inline Utility Functions
+ */
+static inline bool shp_is_hotplug_op(enum shp_operation operation)
+{
+	return (operation & SHP_OP_MASK) == SHP_OP_HOTPLUG;
+}
+
+static inline bool shp_is_online_op(enum shp_operation operation)
+{
+	return (operation & SHP_OP_MASK) == SHP_OP_ONLINE;
+}
+
+static inline bool shp_is_add_op(enum shp_operation operation)
+{
+	return (operation & SHP_REQ_MASK) == SHP_REQ_ADD;
+}
+
+static inline bool shp_is_add_phase(enum shp_phase phase)
+{
+	return (phase & SHP_REQ_MASK) == SHP_REQ_ADD;
+}
+
+/*
+ * Externs
+ */
+typedef int (*shp_func)(struct shp_request *req, int rollback);
+extern int shp_register_handler(enum shp_phase phase, shp_func func, u32 order);
+extern int shp_unregister_handler(enum shp_phase phase, shp_func func);
+extern int shp_submit_req(struct shp_request *req);
+extern struct shp_request *shp_alloc_request(enum shp_operation operation);
+extern void shp_add_dev_info(struct shp_request *shp_req,
+		struct shp_device *shp_dev);
+
+#endif	/* _LINUX_SYS_HOTPLUG_H */
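
For illustration only (not part of the patch): a minimal sketch, assuming the interfaces above land as posted, of how a platform-neutral memory handler might register for the Add Execute phase and walk a request's device list. The handler, its body, and example_init() are hypothetical; only the types, order values, helpers, and externs come from sys_hotplug.h.

/*
 * Hypothetical example (not part of the patch): a platform-neutral
 * memory handler registering for the Add Execute phase.  Only the
 * types, order values, and externs come from sys_hotplug.h; the
 * handler itself and example_init() are made up for illustration.
 */
#include <linux/init.h>
#include <linux/list.h>
#include <linux/sys_hotplug.h>

static int example_mem_add_execute(struct shp_request *req, int rollback)
{
	struct shp_device *shp_dev;

	/* Only act on hot-plug requests; online requests need no FW work. */
	if (!shp_is_hotplug_op(req->operation))
		return 0;

	/* A single request may target multiple devices. */
	list_for_each_entry(shp_dev, &req->dev_list, list) {
		if (shp_dev->class != SHP_CLS_MEMORY)
			continue;

		/*
		 * shp_dev->info.mem carries node, start_addr and length;
		 * add the range here, or tear it down again if rollback
		 * is set because a later handler failed.
		 */
	}

	return 0;
}

static int __init example_init(void)
{
	/*
	 * SHP_MEM_ADD_EXECUTE_ORDER (100) sorts before
	 * SHP_CPU_ADD_EXECUTE_ORDER (110), so this handler would run
	 * before the CPU handler in the Add Execute phase.
	 */
	return shp_register_handler(SHP_ADD_EXECUTE,
				    example_mem_add_execute,
				    SHP_MEM_ADD_EXECUTE_ORDER);
}

This ordering matches the memory-before-CPU sequence described in the changelog, so threads on newly added CPUs can start using their local memory; the delete phases use the symmetric order values defined above.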