diff mbox series

powerpc/pseries: add lparctl driver for platform-specific functions

Message ID 20220730000458.130938-1-nathanl@linux.ibm.com (mailing list archive)
State New
Headers show
Series powerpc/pseries: add lparctl driver for platform-specific functions | expand

Checks

Context Check Description
snowpatch_ozlabs/github-powerpc_ppctests success Successfully ran 10 jobs.
snowpatch_ozlabs/github-powerpc_selftests success Successfully ran 10 jobs.
snowpatch_ozlabs/github-powerpc_sparse success Successfully ran 4 jobs.
snowpatch_ozlabs/github-powerpc_kernel_qemu success Successfully ran 23 jobs.

Commit Message

Nathan Lynch July 30, 2022, 12:04 a.m. UTC
This adds a chardev+ioctl-based interface for user space to access pseries
platform-specific functions which don't easily fit elsewhere.

The immediate motivation is to unbreak librtas[1] with kernel lockdown
enabled. librtas provides a thin root-only user-space API, allowing nearly
direct access to RTAS functions. Since its inception, some of librtas's
APIs have used /dev/mem to allocate buffers that are addressable by RTAS
for use with the powerpc-specific rtas() syscall. This design likely would
not be our first choice today, but it has served adequately for about two
decades without much change, and librtas has a non-negligible number of
existing users, including powerpc-utils, ppc64-diag, lsvpd, lscpu
(util-linux), and several non-OSS programs. With lockdown enabled, /dev/mem
access is prohibited, preventing communication with the management console
and breaking associated functions such as DLPAR and partition migration.

So the task is to provide new lockdown-compatible interfaces for librtas to
prefer over the legacy /dev/mem+sys_rtas(), allowing it to maintain its own
user-facing ABIs as much as possible. This means that we make different
interface choices than we would were we writing everything from scratch. In
the ioctl commands added here, we do not map RTAS error statuses to Linux
errno values, because the existing users of librtas's system parameter APIs
expect the RTAS status codes. Instead, the ioctl succeeds if the kernel
manages to call the RTAS function at all, and passes the RTAS status back
to user space in the argument buffer.

Add the ability to retrieve and change system parameters as defined by
PAPR. Along with proposed implementation changes to librtas[2], this
effectively fixes librtas's rtas_get_sysparm() and rtas_set_sysparm() APIs
for existing users with lockdown. This is enough to get HMC communication
working via the proprietary RSCT stack, enabling LPM, processor DLPAR,
memory DLPAR, and various other use cases.

While this starts with RTAS-implemented functions, there's no reason it
couldn't host facilities that rely on hcalls or other PAPR-specified
interfaces. It could be an alternative to adding more key=value lines to
/proc/powrpc/lparcfg. Hence the generic name 'lparctl'.

Utilities tested (ppc64le kernel and user space):
* ppc64_cpu --run-mode (powerpc-utils, gets/sets processor diag run mode)
* serv_config (powerpc-utils, gets/sets various system and LPAR policies)
* lscpu (util-linux, retrieves processor topology)
* RSCT (proprietary, retrieves HMC connection details)

Future work to unbreak more librtas APIs may include:
* VPD retrieval via ibm,get-vpd
* RTAS error injection
* indicator query/manipulation for diagnostics

[1] https://github.com/ibm-power-utilities/librtas
[2] https://github.com/ibm-power-utilities/librtas/pull/26

Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
---
 .../userspace-api/ioctl/ioctl-number.rst      |   1 +
 arch/powerpc/include/uapi/asm/lparctl.h       |  63 +++++++
 arch/powerpc/platforms/pseries/Makefile       |   2 +-
 arch/powerpc/platforms/pseries/lparctl.c      | 171 ++++++++++++++++++
 4 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/uapi/asm/lparctl.h
 create mode 100644 arch/powerpc/platforms/pseries/lparctl.c

Comments

Laurent Dufour Aug. 1, 2022, 4:40 p.m. UTC | #1
Le 30/07/2022 à 02:04, Nathan Lynch a écrit :
> This adds a chardev+ioctl-based interface for user space to access pseries
> platform-specific functions which don't easily fit elsewhere.
> 
> The immediate motivation is to unbreak librtas[1] with kernel lockdown
> enabled. librtas provides a thin root-only user-space API, allowing nearly
> direct access to RTAS functions. Since its inception, some of librtas's
> APIs have used /dev/mem to allocate buffers that are addressable by RTAS
> for use with the powerpc-specific rtas() syscall. This design likely would
> not be our first choice today, but it has served adequately for about two
> decades without much change, and librtas has a non-negligible number of
> existing users, including powerpc-utils, ppc64-diag, lsvpd, lscpu
> (util-linux), and several non-OSS programs. With lockdown enabled, /dev/mem
> access is prohibited, preventing communication with the management console
> and breaking associated functions such as DLPAR and partition migration.
> 
> So the task is to provide new lockdown-compatible interfaces for librtas to
> prefer over the legacy /dev/mem+sys_rtas(), allowing it to maintain its own
> user-facing ABIs as much as possible. This means that we make different
> interface choices than we would were we writing everything from scratch. In
> the ioctl commands added here, we do not map RTAS error statuses to Linux
> errno values, because the existing users of librtas's system parameter APIs
> expect the RTAS status codes. Instead, the ioctl succeeds if the kernel
> manages to call the RTAS function at all, and passes the RTAS status back
> to user space in the argument buffer.
> 
> Add the ability to retrieve and change system parameters as defined by
> PAPR. Along with proposed implementation changes to librtas[2], this
> effectively fixes librtas's rtas_get_sysparm() and rtas_set_sysparm() APIs
> for existing users with lockdown. This is enough to get HMC communication
> working via the proprietary RSCT stack, enabling LPM, processor DLPAR,
> memory DLPAR, and various other use cases.
> 
> While this starts with RTAS-implemented functions, there's no reason it
> couldn't host facilities that rely on hcalls or other PAPR-specified
> interfaces. It could be an alternative to adding more key=value lines to
> /proc/powrpc/lparcfg. Hence the generic name 'lparctl'.
> 
> Utilities tested (ppc64le kernel and user space):
> * ppc64_cpu --run-mode (powerpc-utils, gets/sets processor diag run mode)
> * serv_config (powerpc-utils, gets/sets various system and LPAR policies)
> * lscpu (util-linux, retrieves processor topology)
> * RSCT (proprietary, retrieves HMC connection details)
> 
> Future work to unbreak more librtas APIs may include:
> * VPD retrieval via ibm,get-vpd
> * RTAS error injection
> * indicator query/manipulation for diagnostics
> 
> [1] https://github.com/ibm-power-utilities/librtas
> [2] https://github.com/ibm-power-utilities/librtas/pull/26
> 
> Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
> ---
>  .../userspace-api/ioctl/ioctl-number.rst      |   1 +
>  arch/powerpc/include/uapi/asm/lparctl.h       |  63 +++++++
>  arch/powerpc/platforms/pseries/Makefile       |   2 +-
>  arch/powerpc/platforms/pseries/lparctl.c      | 171 ++++++++++++++++++
>  4 files changed, 236 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/include/uapi/asm/lparctl.h
>  create mode 100644 arch/powerpc/platforms/pseries/lparctl.c
> 
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index fcab013e47c9..029de1b7ebdb 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -349,6 +349,7 @@ Code  Seq#    Include File                                           Comments
>                                                                       <mailto:vgo@ratio.de>
>  0xB1  00-1F                                                          PPPoX
>                                                                       <mailto:mostrows@styx.uwaterloo.ca>
> +0xB2  01-02  arch/powerpc/include/uapi/asm/lparctl.h                 powerpc/pseries lparctl API
>  0xB3  00     linux/mmc/ioctl.h
>  0xB4  00-0F  linux/gpio.h                                            <mailto:linux-gpio@vger.kernel.org>
>  0xB5  00-0F  uapi/linux/rpmsg.h                                      <mailto:linux-remoteproc@vger.kernel.org>
> diff --git a/arch/powerpc/include/uapi/asm/lparctl.h b/arch/powerpc/include/uapi/asm/lparctl.h
> new file mode 100644
> index 000000000000..42e1ee5fe3c8
> --- /dev/null
> +++ b/arch/powerpc/include/uapi/asm/lparctl.h
> @@ -0,0 +1,63 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef POWERPC_UAPI_LPARCTL_H
> +#define POWERPC_UAPI_LPARCTL_H
> +
> +#include <linux/ioctl.h>
> +#include <linux/types.h>
> +
> +#define LPARCTL_IOCTL_BASE 0xb2
> +
> +#define LPARCTL_IO(nr)         _IO(LPARCTL_IOCTL_BASE, nr)
> +#define LPARCTL_IOR(nr, type)  _IOR(LPARCTL_IOCTL_BASE, nr, type)
> +#define LPARCTL_IOW(nr, type)  _IOW(LPARCTL_IOCTL_BASE, nr, type)
> +#define LPARCTL_IOWR(nr, type) _IOWR(LPARCTL_IOCTL_BASE, nr, type)
> +
> +/**
> + * struct lparctl_get_system_parameter - System parameter retrieval.
> + *
> + * @rtas_status: (output) The result of the ibm,get-system-parameter RTAS
> + *               call attempted by the kernel.
> + * @token: (input) System parameter token as specified in PAPR+ 7.3.16
> + *         "System Parameters Option".
> + * @data: (input and output) If applicable, any required input data ("OS
> + *        Service Entitlement Status" appears to be the only system
> + *        parameter that requires this). If @rtas_status is zero on return
> + *        from ioctl(), contains the value of the specified parameter. The
> + *        first two bytes contain the (big-endian) length of valid data in
> + *        both cases. If @rtas_status is not zero the contents are
> + *        indeterminate.
> + */
> +struct lparctl_get_system_parameter {
> +	__s32 rtas_status;
> +	__u16 token;
> +	union {
> +		__be16 length;
> +		__u8   data[4002];
> +	};
> +};
> +
> +#define LPARCTL_GET_SYSPARM LPARCTL_IOWR(0x01, struct lparctl_get_system_parameter)
> +
> +/**
> + * struct lparctl_set_system_parameter - System parameter update.
> + *
> + * @rtas_status: (output) The result of the ibm,get-system-parameter RTAS
> + *               call attempted by the kernel.
> + * @token: (input) System parameter token as specified in PAPR+ 7.3.16
> + *         "System Parameters Option".
> + * @data: (input) The desired value for the parameter corresponding to
> + *        @token. The first two bytes contain the (big-endian) length of
> + *        valid data.
> + */
> +struct lparctl_set_system_parameter {
> +	__s32 rtas_status;
> +	__u16 token;
> +	union {
> +		__be16 length;
> +		__u8   data[1026];
> +	};
> +};
> +
> +#define LPARCTL_SET_SYSPARM LPARCTL_IOWR(0x02, struct lparctl_set_system_parameter)
> +
> +#endif /* POWERPC_UAPI_LPARCTL_H */
> diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
> index 14e143b946a3..8fff7ed10d7c 100644
> --- a/arch/powerpc/platforms/pseries/Makefile
> +++ b/arch/powerpc/platforms/pseries/Makefile
> @@ -3,7 +3,7 @@ ccflags-$(CONFIG_PPC64)			:= $(NO_MINIMAL_TOC)
>  ccflags-$(CONFIG_PPC_PSERIES_DEBUG)	+= -DDEBUG
>  
>  obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
> -			   of_helpers.o \
> +			   of_helpers.o lparctl.o \
>  			   setup.o iommu.o event_sources.o ras.o \
>  			   firmware.o power.o dlpar.o mobility.o rng.o \
>  			   pci.o pci_dlpar.o eeh_pseries.o msi.o \
> diff --git a/arch/powerpc/platforms/pseries/lparctl.c b/arch/powerpc/platforms/pseries/lparctl.c
> new file mode 100644
> index 000000000000..11a33b8d9234
> --- /dev/null
> +++ b/arch/powerpc/platforms/pseries/lparctl.c
> @@ -0,0 +1,171 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Character device for accessing pseries/PAPR platform-specific
> + * facilities.
> + */
> +#define pr_fmt(fmt) "lparctl: " fmt
> +
> +#include <linux/build_bug.h>
> +#include <linux/fs.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/miscdevice.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +#include <asm/lparctl.h>
> +#include <asm/machdep.h>
> +#include <asm/rtas.h>
> +
> +/**
> + * lparctl_get_sysparm() - Query a PAPR system parameter.
> + *
> + * Retrieve the value of the parameter indicated by the @token member of
> + * the &struct lparctl_get_system_parameter at @arg. If available and
> + * accessible, the value of the parameter is copied to the @data member of
> + * the &struct lparctl_get_system_parameter at @arg, and its @rtas_status
> + * field is set to zero. Otherwise, the @rtas_status member reflects the
> + * most recent RTAS call status, and the contents of @data are
> + * indeterminate.
> + *
> + * Non-zero RTAS call statuses are not translated to conventional errno
> + * values. Only kernel issues or API misuse result in an error at the
> + * syscall level. This is to serve the needs of legacy software which
> + * historically has accessed system parameters via the rtas() syscall,
> + * which has similar behavior.
> + *
> + * Return:
> + * * 0 - OK. Caller must examine the @rtas_status member of the returned
> + *       &struct lparctl_get_system_parameter to determine whether a parameter
> + *       value was copied out.
> + * * -EINVAL - The copied-in &struct lparctl_get_system_parameter.rtas_status
> + *             is non-zero.
> + * * -EFAULT - The supplied @arg is a bad address.
> + * * -ENOMEM - Allocation failure.
> + */
> +static long lparctl_get_sysparm(struct lparctl_get_system_parameter __user *argp)
> +{
> +	struct lparctl_get_system_parameter *gsp;
> +	long ret;
> +	int fwrc;
> +
> +	/*
> +	 * Special case to allow user space to probe the command.
> +	 */
> +	if (argp == NULL)
> +		return 0;
> +
> +	gsp = memdup_user(argp, sizeof(*gsp));
> +	if (IS_ERR(gsp)) {
> +		ret = PTR_ERR(gsp);
> +		goto err_return;
> +	}
> +
> +	ret = -EINVAL;
> +	if (gsp->rtas_status != 0)
> +		goto err_free;
> +
> +	do {
> +		static_assert(sizeof(gsp->data) <= sizeof(rtas_data_buf));
> +
> +		spin_lock(&rtas_data_buf_lock);
> +		memset(rtas_data_buf, 0, sizeof(rtas_data_buf));
> +		memcpy(rtas_data_buf, gsp->data, sizeof(gsp->data));
> +		fwrc = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
> +				 NULL, gsp->token, __pa(rtas_data_buf),
> +				 sizeof(gsp->data));
> +		if (fwrc == 0)
> +			memcpy(gsp->data, rtas_data_buf, sizeof(gsp->data));

May be the amount of data copied out to the user space could be
gsp->length. This would prevent copying 4K bytes all the time.

In a more general way, the size of the RTAS buffer is quite big, and I'm
wondering if all the data need to be copied back and forth to the kernel.

Unless there are a high frequency of calls this doesn't make sense, and
keeping the code simple might be the best way. Otherwise limiting the bytes
copied could help a bit.

> +		spin_unlock(&rtas_data_buf_lock);
> +	} while (rtas_busy_delay(fwrc));
> +
> +	gsp->rtas_status = fwrc;
> +	ret = 0;
> +	if (copy_to_user(argp, gsp, sizeof(*gsp)))
> +		ret = -EFAULT;
> +err_free:
> +	kfree(gsp);
> +err_return:
> +	return ret;
> +}
> +
> +static long lparctl_set_sysparm(struct lparctl_set_system_parameter __user *argp)
> +{
> +	struct lparctl_set_system_parameter *ssp;
> +	long ret;
> +	int fwrc;
> +
> +	/*
> +	 * Special case to allow user space to probe the command.
> +	 */
> +	if (argp == NULL)
> +		return 0;
> +
> +	ssp = memdup_user(argp, sizeof(*ssp));

As for the get case, would it be nice to limit the amount of bytes copied
to the interesting "length" ?

> +	if (IS_ERR(ssp)) {
> +		ret = PTR_ERR(ssp);
> +		goto err_return;
> +	}
> +
> +	ret = -EINVAL;
> +	if (ssp->rtas_status != 0)
> +		goto err_free;
> +
> +	do {
> +		static_assert(sizeof(ssp->data) <= sizeof(rtas_data_buf));
> +
> +		spin_lock(&rtas_data_buf_lock);
> +		memset(rtas_data_buf, 0, sizeof(rtas_data_buf));
> +		memcpy(rtas_data_buf, ssp->data, sizeof(ssp->data));
> +		fwrc = rtas_call(rtas_token("ibm,set-system-parameter"), 2, 1,
> +				 NULL, ssp->token, __pa(rtas_data_buf));
> +		spin_unlock(&rtas_data_buf_lock);
> +	} while (rtas_busy_delay(fwrc));
> +
> +	ret = 0;
> +	if (copy_to_user(&argp->rtas_status, &fwrc, sizeof(argp->rtas_status)))
> +		ret = -EFAULT;
> +err_free:
> +	kfree(ssp);
> +err_return:
> +	return ret;
> +}
> +
> +static long lparctl_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
> +{
> +	void __user *argp = (void __user *)arg;
> +	long ret;
> +
> +	switch (ioctl) {
> +	case LPARCTL_GET_SYSPARM:
> +		ret = lparctl_get_sysparm(argp);
> +		break;
> +	case LPARCTL_SET_SYSPARM:
> +		ret = lparctl_set_sysparm(argp);
> +		break;
> +	default:
> +		ret = -ENOIOCTLCMD;
> +		break;
> +	}
> +	return ret;
> +}
> +
> +static const struct file_operations lparctl_ops = {
> +	.unlocked_ioctl = lparctl_dev_ioctl,
> +	.llseek		= noop_llseek,
> +};
> +
> +static struct miscdevice lparctl_dev = {
> +	MISC_DYNAMIC_MINOR,
> +	"lparctl",
> +	&lparctl_ops,
> +};
> +
> +static __init int lparctl_init(void)
> +{
> +	int ret;
> +
> +	ret = misc_register(&lparctl_dev);
> +
> +	return ret;
> +}
> +machine_device_initcall(pseries, lparctl_init);
Nathan Lynch Aug. 12, 2022, 7:14 p.m. UTC | #2
Laurent Dufour <ldufour@linux.ibm.com> writes:
> Le 30/07/2022 à 02:04, Nathan Lynch a écrit :
>> +static long lparctl_get_sysparm(struct lparctl_get_system_parameter __user *argp)
>> +{
>> +	struct lparctl_get_system_parameter *gsp;
>> +	long ret;
>> +	int fwrc;
>> +
>> +	/*
>> +	 * Special case to allow user space to probe the command.
>> +	 */
>> +	if (argp == NULL)
>> +		return 0;
>> +
>> +	gsp = memdup_user(argp, sizeof(*gsp));
>> +	if (IS_ERR(gsp)) {
>> +		ret = PTR_ERR(gsp);
>> +		goto err_return;
>> +	}
>> +
>> +	ret = -EINVAL;
>> +	if (gsp->rtas_status != 0)
>> +		goto err_free;
>> +
>> +	do {
>> +		static_assert(sizeof(gsp->data) <= sizeof(rtas_data_buf));
>> +
>> +		spin_lock(&rtas_data_buf_lock);
>> +		memset(rtas_data_buf, 0, sizeof(rtas_data_buf));
>> +		memcpy(rtas_data_buf, gsp->data, sizeof(gsp->data));
>> +		fwrc = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
>> +				 NULL, gsp->token, __pa(rtas_data_buf),
>> +				 sizeof(gsp->data));
>> +		if (fwrc == 0)
>> +			memcpy(gsp->data, rtas_data_buf, sizeof(gsp->data));
>
> May be the amount of data copied out to the user space could be
> gsp->length. This would prevent copying 4K bytes all the time.
>
> In a more general way, the size of the RTAS buffer is quite big, and I'm
> wondering if all the data need to be copied back and forth to the kernel.
>
> Unless there are a high frequency of calls this doesn't make sense, and
> keeping the code simple might be the best way. Otherwise limiting the bytes
> copied could help a bit.

This is not intended to be a high-bandwidth interface and I don't think
there's much of a performance concern here, so I'd rather just keep the
copy sizes involved constant.

>> +static long lparctl_set_sysparm(struct lparctl_set_system_parameter __user *argp)
>> +{
>> +	struct lparctl_set_system_parameter *ssp;
>> +	long ret;
>> +	int fwrc;
>> +
>> +	/*
>> +	 * Special case to allow user space to probe the command.
>> +	 */
>> +	if (argp == NULL)
>> +		return 0;
>> +
>> +	ssp = memdup_user(argp, sizeof(*ssp));
>
> As for the get case, would it be nice to limit the amount of bytes copied
> to the interesting "length" ?

No, the intent is to pass the buffer contents straight to RTAS without
any validation by the kernel, which would duplicate work already
performed by firmware.
Michal Suchánek Sept. 13, 2022, 9:13 a.m. UTC | #3
On Fri, Aug 12, 2022 at 02:14:21PM -0500, Nathan Lynch wrote:
> Laurent Dufour <ldufour@linux.ibm.com> writes:
> > Le 30/07/2022 à 02:04, Nathan Lynch a écrit :
> >> +static long lparctl_get_sysparm(struct lparctl_get_system_parameter __user *argp)
> >> +{
> >> +	struct lparctl_get_system_parameter *gsp;
> >> +	long ret;
> >> +	int fwrc;
> >> +
> >> +	/*
> >> +	 * Special case to allow user space to probe the command.
> >> +	 */
> >> +	if (argp == NULL)
> >> +		return 0;
> >> +
> >> +	gsp = memdup_user(argp, sizeof(*gsp));
> >> +	if (IS_ERR(gsp)) {
> >> +		ret = PTR_ERR(gsp);
> >> +		goto err_return;
> >> +	}
> >> +
> >> +	ret = -EINVAL;
> >> +	if (gsp->rtas_status != 0)
> >> +		goto err_free;
> >> +
> >> +	do {
> >> +		static_assert(sizeof(gsp->data) <= sizeof(rtas_data_buf));
> >> +
> >> +		spin_lock(&rtas_data_buf_lock);
> >> +		memset(rtas_data_buf, 0, sizeof(rtas_data_buf));
> >> +		memcpy(rtas_data_buf, gsp->data, sizeof(gsp->data));
> >> +		fwrc = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
> >> +				 NULL, gsp->token, __pa(rtas_data_buf),
> >> +				 sizeof(gsp->data));
> >> +		if (fwrc == 0)
> >> +			memcpy(gsp->data, rtas_data_buf, sizeof(gsp->data));
> >
> > May be the amount of data copied out to the user space could be
> > gsp->length. This would prevent copying 4K bytes all the time.
> >
> > In a more general way, the size of the RTAS buffer is quite big, and I'm
> > wondering if all the data need to be copied back and forth to the kernel.
> >
> > Unless there are a high frequency of calls this doesn't make sense, and
> > keeping the code simple might be the best way. Otherwise limiting the bytes
> > copied could help a bit.
> 
> This is not intended to be a high-bandwidth interface and I don't think
> there's much of a performance concern here, so I'd rather just keep the
> copy sizes involved constant.

But that's absolutely horrible!

The user wants the VPD data, all of it. And you only give one page with
this interface.

Worse, the call is not reentrant so you need to lock against other users
calling the call while the current caller is retrieving the inidividual
pagaes.

You could do that per process, but then processes with userspace
threading would want the data as well so you would have to save the
arguments of the last call, and compare to arguments of any subsequent
call to determine if you can let it pass or block.

And when you do all that there will be a process that retrieves a couple
of pages and goes out for lunch or loses interest completely, blocking
out everyone from accessing the interface at all.

Thanks

Michal
Nathan Lynch Sept. 13, 2022, 3:59 p.m. UTC | #4
Michal Suchánek <msuchanek@suse.de> writes:

> On Fri, Aug 12, 2022 at 02:14:21PM -0500, Nathan Lynch wrote:
>> Laurent Dufour <ldufour@linux.ibm.com> writes:
>> > Le 30/07/2022 à 02:04, Nathan Lynch a écrit :
>> >> +static long lparctl_get_sysparm(struct lparctl_get_system_parameter __user *argp)
>> >> +{
>> >> +	struct lparctl_get_system_parameter *gsp;
>> >> +	long ret;
>> >> +	int fwrc;
>> >> +
>> >> +	/*
>> >> +	 * Special case to allow user space to probe the command.
>> >> +	 */
>> >> +	if (argp == NULL)
>> >> +		return 0;
>> >> +
>> >> +	gsp = memdup_user(argp, sizeof(*gsp));
>> >> +	if (IS_ERR(gsp)) {
>> >> +		ret = PTR_ERR(gsp);
>> >> +		goto err_return;
>> >> +	}
>> >> +
>> >> +	ret = -EINVAL;
>> >> +	if (gsp->rtas_status != 0)
>> >> +		goto err_free;
>> >> +
>> >> +	do {
>> >> +		static_assert(sizeof(gsp->data) <= sizeof(rtas_data_buf));
>> >> +
>> >> +		spin_lock(&rtas_data_buf_lock);
>> >> +		memset(rtas_data_buf, 0, sizeof(rtas_data_buf));
>> >> +		memcpy(rtas_data_buf, gsp->data, sizeof(gsp->data));
>> >> +		fwrc = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
>> >> +				 NULL, gsp->token, __pa(rtas_data_buf),
>> >> +				 sizeof(gsp->data));
>> >> +		if (fwrc == 0)
>> >> +			memcpy(gsp->data, rtas_data_buf, sizeof(gsp->data));
>> >
>> > May be the amount of data copied out to the user space could be
>> > gsp->length. This would prevent copying 4K bytes all the time.
>> >
>> > In a more general way, the size of the RTAS buffer is quite big, and I'm
>> > wondering if all the data need to be copied back and forth to the kernel.
>> >
>> > Unless there are a high frequency of calls this doesn't make sense, and
>> > keeping the code simple might be the best way. Otherwise limiting the bytes
>> > copied could help a bit.
>> 
>> This is not intended to be a high-bandwidth interface and I don't think
>> there's much of a performance concern here, so I'd rather just keep the
>> copy sizes involved constant.
>
> But that's absolutely horrible!

?

> The user wants the VPD data, all of it. And you only give one page with
> this interface.

The code here is for system parameters, which have a known maximum size,
unlike VPD. There's no code for VPD retrieval in this patch.

But I'm happy to constructively discuss how a VPD ioctl interface should
work.

> Worse, the call is not reentrant so you need to lock against other users
> calling the call while the current caller is retrieving the inidividual
> pagaes.
>
> You could do that per process, but then processes with userspace
> threading would want the data as well so you would have to save the
> arguments of the last call, and compare to arguments of any subsequent
> call to determine if you can let it pass or block.
>
> And when you do all that there will be a process that retrieves a couple
> of pages and goes out for lunch or loses interest completely, blocking
> out everyone from accessing the interface at all.

Right, the ibm,get-vpd RTAS function is tricky to expose to user space.

It needs to be called repeatedly until all data has been returned, 4KB
at a time.

Only one ibm,get-vpd sequence can be in progress at any time. If an
ibm,get-vpd sequence is begun while another sequence is already
outstanding, the first one is invalidated -- I would guess -1 or some
other error is returned on its next call.

So a new system-call level interface for VPD retrieval probably should
not expose the repeating sequence-based nature of the RTAS function to
user space, to prevent concurrent clients from interfering with each
other. That implies that the kernel should buffer the VPD results
internally; at least that's the only idea I've had so far. Open to
other suggestions.
Michal Suchánek Sept. 13, 2022, 4:33 p.m. UTC | #5
On Tue, Sep 13, 2022 at 10:59:56AM -0500, Nathan Lynch wrote:
> Michal Suchánek <msuchanek@suse.de> writes:
> 
> > On Fri, Aug 12, 2022 at 02:14:21PM -0500, Nathan Lynch wrote:
> >> Laurent Dufour <ldufour@linux.ibm.com> writes:
> >> > Le 30/07/2022 à 02:04, Nathan Lynch a écrit :
> >> >> +static long lparctl_get_sysparm(struct lparctl_get_system_parameter __user *argp)
> >> >> +{
> >> >> +	struct lparctl_get_system_parameter *gsp;
> >> >> +	long ret;
> >> >> +	int fwrc;
> >> >> +
> >> >> +	/*
> >> >> +	 * Special case to allow user space to probe the command.
> >> >> +	 */
> >> >> +	if (argp == NULL)
> >> >> +		return 0;
> >> >> +
> >> >> +	gsp = memdup_user(argp, sizeof(*gsp));
> >> >> +	if (IS_ERR(gsp)) {
> >> >> +		ret = PTR_ERR(gsp);
> >> >> +		goto err_return;
> >> >> +	}
> >> >> +
> >> >> +	ret = -EINVAL;
> >> >> +	if (gsp->rtas_status != 0)
> >> >> +		goto err_free;
> >> >> +
> >> >> +	do {
> >> >> +		static_assert(sizeof(gsp->data) <= sizeof(rtas_data_buf));
> >> >> +
> >> >> +		spin_lock(&rtas_data_buf_lock);
> >> >> +		memset(rtas_data_buf, 0, sizeof(rtas_data_buf));
> >> >> +		memcpy(rtas_data_buf, gsp->data, sizeof(gsp->data));
> >> >> +		fwrc = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
> >> >> +				 NULL, gsp->token, __pa(rtas_data_buf),
> >> >> +				 sizeof(gsp->data));
> >> >> +		if (fwrc == 0)
> >> >> +			memcpy(gsp->data, rtas_data_buf, sizeof(gsp->data));
> >> >
> >> > May be the amount of data copied out to the user space could be
> >> > gsp->length. This would prevent copying 4K bytes all the time.
> >> >
> >> > In a more general way, the size of the RTAS buffer is quite big, and I'm
> >> > wondering if all the data need to be copied back and forth to the kernel.
> >> >
> >> > Unless there are a high frequency of calls this doesn't make sense, and
> >> > keeping the code simple might be the best way. Otherwise limiting the bytes
> >> > copied could help a bit.
> >> 
> >> This is not intended to be a high-bandwidth interface and I don't think
> >> there's much of a performance concern here, so I'd rather just keep the
> >> copy sizes involved constant.
> >
> > But that's absolutely horrible!
> 
> ?
> 
> > The user wants the VPD data, all of it. And you only give one page with
> > this interface.
> 
> The code here is for system parameters, which have a known maximum size,
> unlike VPD. There's no code for VPD retrieval in this patch.

But we do need to support the calls that return multiple pages of data.

If the new driver supports only the simple calls it's a failure.

> 
> But I'm happy to constructively discuss how a VPD ioctl interface should
> work.
> 
> > Worse, the call is not reentrant so you need to lock against other users
> > calling the call while the current caller is retrieving the inidividual
> > pagaes.
> >
> > You could do that per process, but then processes with userspace
> > threading would want the data as well so you would have to save the
> > arguments of the last call, and compare to arguments of any subsequent
> > call to determine if you can let it pass or block.
> >
> > And when you do all that there will be a process that retrieves a couple
> > of pages and goes out for lunch or loses interest completely, blocking
> > out everyone from accessing the interface at all.
> 
> Right, the ibm,get-vpd RTAS function is tricky to expose to user space.
> 
> It needs to be called repeatedly until all data has been returned, 4KB
> at a time.
> 
> Only one ibm,get-vpd sequence can be in progress at any time. If an
> ibm,get-vpd sequence is begun while another sequence is already
> outstanding, the first one is invalidated -- I would guess -1 or some
> other error is returned on its next call.
> 
> So a new system-call level interface for VPD retrieval probably should
> not expose the repeating sequence-based nature of the RTAS function to
> user space, to prevent concurrent clients from interfering with each
> other. That implies that the kernel should buffer the VPD results
> internally; at least that's the only idea I've had so far. Open to
> other suggestions.

It can save the data to an user-supplied buffer until all data is
transferred or the buffer space runs out.

Thanks

Michal
Nathan Lynch Sept. 13, 2022, 5:02 p.m. UTC | #6
Michal Suchánek <msuchanek@suse.de> writes:
> On Tue, Sep 13, 2022 at 10:59:56AM -0500, Nathan Lynch wrote:
>> Michal Suchánek <msuchanek@suse.de> writes:
>> 
>> > On Fri, Aug 12, 2022 at 02:14:21PM -0500, Nathan Lynch wrote:
>> >> Laurent Dufour <ldufour@linux.ibm.com> writes:
>> >> > Le 30/07/2022 à 02:04, Nathan Lynch a écrit :
>> >> >> +static long lparctl_get_sysparm(struct lparctl_get_system_parameter __user *argp)
>> >> >> +{
>> >> >> +	struct lparctl_get_system_parameter *gsp;
>> >> >> +	long ret;
>> >> >> +	int fwrc;
>> >> >> +
>> >> >> +	/*
>> >> >> +	 * Special case to allow user space to probe the command.
>> >> >> +	 */
>> >> >> +	if (argp == NULL)
>> >> >> +		return 0;
>> >> >> +
>> >> >> +	gsp = memdup_user(argp, sizeof(*gsp));
>> >> >> +	if (IS_ERR(gsp)) {
>> >> >> +		ret = PTR_ERR(gsp);
>> >> >> +		goto err_return;
>> >> >> +	}
>> >> >> +
>> >> >> +	ret = -EINVAL;
>> >> >> +	if (gsp->rtas_status != 0)
>> >> >> +		goto err_free;
>> >> >> +
>> >> >> +	do {
>> >> >> +		static_assert(sizeof(gsp->data) <= sizeof(rtas_data_buf));
>> >> >> +
>> >> >> +		spin_lock(&rtas_data_buf_lock);
>> >> >> +		memset(rtas_data_buf, 0, sizeof(rtas_data_buf));
>> >> >> +		memcpy(rtas_data_buf, gsp->data, sizeof(gsp->data));
>> >> >> +		fwrc = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
>> >> >> +				 NULL, gsp->token, __pa(rtas_data_buf),
>> >> >> +				 sizeof(gsp->data));
>> >> >> +		if (fwrc == 0)
>> >> >> +			memcpy(gsp->data, rtas_data_buf, sizeof(gsp->data));
>> >> >
>> >> > May be the amount of data copied out to the user space could be
>> >> > gsp->length. This would prevent copying 4K bytes all the time.
>> >> >
>> >> > In a more general way, the size of the RTAS buffer is quite big, and I'm
>> >> > wondering if all the data need to be copied back and forth to the kernel.
>> >> >
>> >> > Unless there are a high frequency of calls this doesn't make sense, and
>> >> > keeping the code simple might be the best way. Otherwise limiting the bytes
>> >> > copied could help a bit.
>> >> 
>> >> This is not intended to be a high-bandwidth interface and I don't think
>> >> there's much of a performance concern here, so I'd rather just keep the
>> >> copy sizes involved constant.
>> >
>> > But that's absolutely horrible!
>> 
>> ?
>> 
>> > The user wants the VPD data, all of it. And you only give one page with
>> > this interface.
>> 
>> The code here is for system parameters, which have a known maximum size,
>> unlike VPD. There's no code for VPD retrieval in this patch.
>
> But we do need to support the calls that return multiple pages of data.
>
> If the new driver supports only the simple calls it's a failure.

Michal, will you please moderate your tone? I think you can communicate
your concerns without calling my work "absolutely horrible" or a
"failure". Thanks.

Anyway, of course I intend to support the more complex calls, but
supporting the simple calls actually unbreaks a lot of stuff.

>> But I'm happy to constructively discuss how a VPD ioctl interface should
>> work.
>> 
>> > Worse, the call is not reentrant so you need to lock against other users
>> > calling the call while the current caller is retrieving the inidividual
>> > pagaes.
>> >
>> > You could do that per process, but then processes with userspace
>> > threading would want the data as well so you would have to save the
>> > arguments of the last call, and compare to arguments of any subsequent
>> > call to determine if you can let it pass or block.
>> >
>> > And when you do all that there will be a process that retrieves a couple
>> > of pages and goes out for lunch or loses interest completely, blocking
>> > out everyone from accessing the interface at all.
>> 
>> Right, the ibm,get-vpd RTAS function is tricky to expose to user space.
>> 
>> It needs to be called repeatedly until all data has been returned, 4KB
>> at a time.
>> 
>> Only one ibm,get-vpd sequence can be in progress at any time. If an
>> ibm,get-vpd sequence is begun while another sequence is already
>> outstanding, the first one is invalidated -- I would guess -1 or some
>> other error is returned on its next call.
>> 
>> So a new system-call level interface for VPD retrieval probably should
>> not expose the repeating sequence-based nature of the RTAS function to
>> user space, to prevent concurrent clients from interfering with each
>> other. That implies that the kernel should buffer the VPD results
>> internally; at least that's the only idea I've had so far. Open to
>> other suggestions.
>
> It can save the data to an user-supplied buffer until all data is
> transferred or the buffer space runs out.

Yes, of course, thanks. Assuming user space can discover the appropriate
buffer size, which should be possible.
Michal Suchánek Sept. 14, 2022, 8:14 a.m. UTC | #7
On Tue, Sep 13, 2022 at 12:02:42PM -0500, Nathan Lynch wrote:
> Michal Suchánek <msuchanek@suse.de> writes:
> > On Tue, Sep 13, 2022 at 10:59:56AM -0500, Nathan Lynch wrote:
> >> Michal Suchánek <msuchanek@suse.de> writes:
> >> 
> >> > On Fri, Aug 12, 2022 at 02:14:21PM -0500, Nathan Lynch wrote:
> >> >> Laurent Dufour <ldufour@linux.ibm.com> writes:
> >> >> > Le 30/07/2022 à 02:04, Nathan Lynch a écrit :
> >> >> >> +static long lparctl_get_sysparm(struct lparctl_get_system_parameter __user *argp)
> >> >> >> +{
> >> >> >> +	struct lparctl_get_system_parameter *gsp;
> >> >> >> +	long ret;
> >> >> >> +	int fwrc;
> >> >> >> +
> >> >> >> +	/*
> >> >> >> +	 * Special case to allow user space to probe the command.
> >> >> >> +	 */
> >> >> >> +	if (argp == NULL)
> >> >> >> +		return 0;
> >> >> >> +
> >> >> >> +	gsp = memdup_user(argp, sizeof(*gsp));
> >> >> >> +	if (IS_ERR(gsp)) {
> >> >> >> +		ret = PTR_ERR(gsp);
> >> >> >> +		goto err_return;
> >> >> >> +	}
> >> >> >> +
> >> >> >> +	ret = -EINVAL;
> >> >> >> +	if (gsp->rtas_status != 0)
> >> >> >> +		goto err_free;
> >> >> >> +
> >> >> >> +	do {
> >> >> >> +		static_assert(sizeof(gsp->data) <= sizeof(rtas_data_buf));
> >> >> >> +
> >> >> >> +		spin_lock(&rtas_data_buf_lock);
> >> >> >> +		memset(rtas_data_buf, 0, sizeof(rtas_data_buf));
> >> >> >> +		memcpy(rtas_data_buf, gsp->data, sizeof(gsp->data));
> >> >> >> +		fwrc = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
> >> >> >> +				 NULL, gsp->token, __pa(rtas_data_buf),
> >> >> >> +				 sizeof(gsp->data));
> >> >> >> +		if (fwrc == 0)
> >> >> >> +			memcpy(gsp->data, rtas_data_buf, sizeof(gsp->data));
> >> >> >
> >> >> > May be the amount of data copied out to the user space could be
> >> >> > gsp->length. This would prevent copying 4K bytes all the time.
> >> >> >
> >> >> > In a more general way, the size of the RTAS buffer is quite big, and I'm
> >> >> > wondering if all the data need to be copied back and forth to the kernel.
> >> >> >
> >> >> > Unless there are a high frequency of calls this doesn't make sense, and
> >> >> > keeping the code simple might be the best way. Otherwise limiting the bytes
> >> >> > copied could help a bit.
> >> >> 
> >> >> This is not intended to be a high-bandwidth interface and I don't think
> >> >> there's much of a performance concern here, so I'd rather just keep the
> >> >> copy sizes involved constant.
> >> >
> >> > But that's absolutely horrible!
> >> 
> >> ?
> >> 
> >> > The user wants the VPD data, all of it. And you only give one page with
> >> > this interface.
> >> 
> >> The code here is for system parameters, which have a known maximum size,
> >> unlike VPD. There's no code for VPD retrieval in this patch.
> >
> > But we do need to support the calls that return multiple pages of data.
> >
> > If the new driver supports only the simple calls it's a failure.
> 
> Michal, will you please moderate your tone? I think you can communicate
> your concerns without calling my work "absolutely horrible" or a
> "failure". Thanks.

Sorry, it's not a good wording.

> Anyway, of course I intend to support the more complex calls, but
> supporting the simple calls actually unbreaks a lot of stuff.

The thing is that supporting calls that return more than one page of
data is absolutely required, and this interface built around fixed size
data transfer can't do it.

So it sounds like a ticked for redoing the driver right after it's
implemented, or ending up with two subtly different interfaces - one for
the calls that can return multiple pages of data, and one for the simple
calls.

That does not sound like a good idea at all to me.

> 
> >> But I'm happy to constructively discuss how a VPD ioctl interface should
> >> work.
> >> 
> >> > Worse, the call is not reentrant so you need to lock against other users
> >> > calling the call while the current caller is retrieving the inidividual
> >> > pagaes.
> >> >
> >> > You could do that per process, but then processes with userspace
> >> > threading would want the data as well so you would have to save the
> >> > arguments of the last call, and compare to arguments of any subsequent
> >> > call to determine if you can let it pass or block.
> >> >
> >> > And when you do all that there will be a process that retrieves a couple
> >> > of pages and goes out for lunch or loses interest completely, blocking
> >> > out everyone from accessing the interface at all.
> >> 
> >> Right, the ibm,get-vpd RTAS function is tricky to expose to user space.
> >> 
> >> It needs to be called repeatedly until all data has been returned, 4KB
> >> at a time.
> >> 
> >> Only one ibm,get-vpd sequence can be in progress at any time. If an
> >> ibm,get-vpd sequence is begun while another sequence is already
> >> outstanding, the first one is invalidated -- I would guess -1 or some
> >> other error is returned on its next call.
> >> 
> >> So a new system-call level interface for VPD retrieval probably should
> >> not expose the repeating sequence-based nature of the RTAS function to
> >> user space, to prevent concurrent clients from interfering with each
> >> other. That implies that the kernel should buffer the VPD results
> >> internally; at least that's the only idea I've had so far. Open to
> >> other suggestions.
> >
> > It can save the data to an user-supplied buffer until all data is
> > transferred or the buffer space runs out.
> 
> Yes, of course, thanks. Assuming user space can discover the appropriate
> buffer size, which should be possible.

It will not be entirely reliable because the data size may change over
time but assuming the performance is not an issue the caller can just
call again with a bigger buffer if the data hapens to grow at the very
moment they tried to retrieve it.

Thanks

Michal
Nathan Lynch Sept. 15, 2022, 1:43 p.m. UTC | #8
Michal Suchánek <msuchanek@suse.de> writes:
> On Tue, Sep 13, 2022 at 12:02:42PM -0500, Nathan Lynch wrote:
>> Anyway, of course I intend to support the more complex calls, but
>> supporting the simple calls actually unbreaks a lot of stuff.
>
> The thing is that supporting calls that return more than one page of
> data is absolutely required, and this interface built around fixed size
> data transfer can't do it.

Again, it is appropriate for the system parameter commands and handlers
to deal in small fixed size buffers. Code for VPD retrieval will have to
work differently.

> So it sounds like a ticked for redoing the driver right after it's
> implemented, or ending up with two subtly different interfaces - one for
> the calls that can return multiple pages of data, and one for the simple
> calls.
>
> That does not sound like a good idea at all to me.

That's not my plan, and I won't be trying to get anything merged without
supporting some of the more complex cases. OK?
diff mbox series

Patch

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index fcab013e47c9..029de1b7ebdb 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -349,6 +349,7 @@  Code  Seq#    Include File                                           Comments
                                                                      <mailto:vgo@ratio.de>
 0xB1  00-1F                                                          PPPoX
                                                                      <mailto:mostrows@styx.uwaterloo.ca>
+0xB2  01-02  arch/powerpc/include/uapi/asm/lparctl.h                 powerpc/pseries lparctl API
 0xB3  00     linux/mmc/ioctl.h
 0xB4  00-0F  linux/gpio.h                                            <mailto:linux-gpio@vger.kernel.org>
 0xB5  00-0F  uapi/linux/rpmsg.h                                      <mailto:linux-remoteproc@vger.kernel.org>
diff --git a/arch/powerpc/include/uapi/asm/lparctl.h b/arch/powerpc/include/uapi/asm/lparctl.h
new file mode 100644
index 000000000000..42e1ee5fe3c8
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/lparctl.h
@@ -0,0 +1,63 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef POWERPC_UAPI_LPARCTL_H
+#define POWERPC_UAPI_LPARCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define LPARCTL_IOCTL_BASE 0xb2
+
+#define LPARCTL_IO(nr)         _IO(LPARCTL_IOCTL_BASE, nr)
+#define LPARCTL_IOR(nr, type)  _IOR(LPARCTL_IOCTL_BASE, nr, type)
+#define LPARCTL_IOW(nr, type)  _IOW(LPARCTL_IOCTL_BASE, nr, type)
+#define LPARCTL_IOWR(nr, type) _IOWR(LPARCTL_IOCTL_BASE, nr, type)
+
+/**
+ * struct lparctl_get_system_parameter - System parameter retrieval.
+ *
+ * @rtas_status: (output) The result of the ibm,get-system-parameter RTAS
+ *               call attempted by the kernel.
+ * @token: (input) System parameter token as specified in PAPR+ 7.3.16
+ *         "System Parameters Option".
+ * @data: (input and output) If applicable, any required input data ("OS
+ *        Service Entitlement Status" appears to be the only system
+ *        parameter that requires this). If @rtas_status is zero on return
+ *        from ioctl(), contains the value of the specified parameter. The
+ *        first two bytes contain the (big-endian) length of valid data in
+ *        both cases. If @rtas_status is not zero the contents are
+ *        indeterminate.
+ */
+struct lparctl_get_system_parameter {
+	__s32 rtas_status;
+	__u16 token;
+	union {
+		__be16 length;
+		__u8   data[4002];
+	};
+};
+
+#define LPARCTL_GET_SYSPARM LPARCTL_IOWR(0x01, struct lparctl_get_system_parameter)
+
+/**
+ * struct lparctl_set_system_parameter - System parameter update.
+ *
+ * @rtas_status: (output) The result of the ibm,get-system-parameter RTAS
+ *               call attempted by the kernel.
+ * @token: (input) System parameter token as specified in PAPR+ 7.3.16
+ *         "System Parameters Option".
+ * @data: (input) The desired value for the parameter corresponding to
+ *        @token. The first two bytes contain the (big-endian) length of
+ *        valid data.
+ */
+struct lparctl_set_system_parameter {
+	__s32 rtas_status;
+	__u16 token;
+	union {
+		__be16 length;
+		__u8   data[1026];
+	};
+};
+
+#define LPARCTL_SET_SYSPARM LPARCTL_IOWR(0x02, struct lparctl_set_system_parameter)
+
+#endif /* POWERPC_UAPI_LPARCTL_H */
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 14e143b946a3..8fff7ed10d7c 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -3,7 +3,7 @@  ccflags-$(CONFIG_PPC64)			:= $(NO_MINIMAL_TOC)
 ccflags-$(CONFIG_PPC_PSERIES_DEBUG)	+= -DDEBUG
 
 obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
-			   of_helpers.o \
+			   of_helpers.o lparctl.o \
 			   setup.o iommu.o event_sources.o ras.o \
 			   firmware.o power.o dlpar.o mobility.o rng.o \
 			   pci.o pci_dlpar.o eeh_pseries.o msi.o \
diff --git a/arch/powerpc/platforms/pseries/lparctl.c b/arch/powerpc/platforms/pseries/lparctl.c
new file mode 100644
index 000000000000..11a33b8d9234
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/lparctl.c
@@ -0,0 +1,171 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Character device for accessing pseries/PAPR platform-specific
+ * facilities.
+ */
+#define pr_fmt(fmt) "lparctl: " fmt
+
+#include <linux/build_bug.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <asm/lparctl.h>
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+
+/**
+ * lparctl_get_sysparm() - Query a PAPR system parameter.
+ *
+ * Retrieve the value of the parameter indicated by the @token member of
+ * the &struct lparctl_get_system_parameter at @arg. If available and
+ * accessible, the value of the parameter is copied to the @data member of
+ * the &struct lparctl_get_system_parameter at @arg, and its @rtas_status
+ * field is set to zero. Otherwise, the @rtas_status member reflects the
+ * most recent RTAS call status, and the contents of @data are
+ * indeterminate.
+ *
+ * Non-zero RTAS call statuses are not translated to conventional errno
+ * values. Only kernel issues or API misuse result in an error at the
+ * syscall level. This is to serve the needs of legacy software which
+ * historically has accessed system parameters via the rtas() syscall,
+ * which has similar behavior.
+ *
+ * Return:
+ * * 0 - OK. Caller must examine the @rtas_status member of the returned
+ *       &struct lparctl_get_system_parameter to determine whether a parameter
+ *       value was copied out.
+ * * -EINVAL - The copied-in &struct lparctl_get_system_parameter.rtas_status
+ *             is non-zero.
+ * * -EFAULT - The supplied @arg is a bad address.
+ * * -ENOMEM - Allocation failure.
+ */
+static long lparctl_get_sysparm(struct lparctl_get_system_parameter __user *argp)
+{
+	struct lparctl_get_system_parameter *gsp;
+	long ret;
+	int fwrc;
+
+	/*
+	 * Special case to allow user space to probe the command.
+	 */
+	if (argp == NULL)
+		return 0;
+
+	gsp = memdup_user(argp, sizeof(*gsp));
+	if (IS_ERR(gsp)) {
+		ret = PTR_ERR(gsp);
+		goto err_return;
+	}
+
+	ret = -EINVAL;
+	if (gsp->rtas_status != 0)
+		goto err_free;
+
+	do {
+		static_assert(sizeof(gsp->data) <= sizeof(rtas_data_buf));
+
+		spin_lock(&rtas_data_buf_lock);
+		memset(rtas_data_buf, 0, sizeof(rtas_data_buf));
+		memcpy(rtas_data_buf, gsp->data, sizeof(gsp->data));
+		fwrc = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+				 NULL, gsp->token, __pa(rtas_data_buf),
+				 sizeof(gsp->data));
+		if (fwrc == 0)
+			memcpy(gsp->data, rtas_data_buf, sizeof(gsp->data));
+		spin_unlock(&rtas_data_buf_lock);
+	} while (rtas_busy_delay(fwrc));
+
+	gsp->rtas_status = fwrc;
+	ret = 0;
+	if (copy_to_user(argp, gsp, sizeof(*gsp)))
+		ret = -EFAULT;
+err_free:
+	kfree(gsp);
+err_return:
+	return ret;
+}
+
+static long lparctl_set_sysparm(struct lparctl_set_system_parameter __user *argp)
+{
+	struct lparctl_set_system_parameter *ssp;
+	long ret;
+	int fwrc;
+
+	/*
+	 * Special case to allow user space to probe the command.
+	 */
+	if (argp == NULL)
+		return 0;
+
+	ssp = memdup_user(argp, sizeof(*ssp));
+	if (IS_ERR(ssp)) {
+		ret = PTR_ERR(ssp);
+		goto err_return;
+	}
+
+	ret = -EINVAL;
+	if (ssp->rtas_status != 0)
+		goto err_free;
+
+	do {
+		static_assert(sizeof(ssp->data) <= sizeof(rtas_data_buf));
+
+		spin_lock(&rtas_data_buf_lock);
+		memset(rtas_data_buf, 0, sizeof(rtas_data_buf));
+		memcpy(rtas_data_buf, ssp->data, sizeof(ssp->data));
+		fwrc = rtas_call(rtas_token("ibm,set-system-parameter"), 2, 1,
+				 NULL, ssp->token, __pa(rtas_data_buf));
+		spin_unlock(&rtas_data_buf_lock);
+	} while (rtas_busy_delay(fwrc));
+
+	ret = 0;
+	if (copy_to_user(&argp->rtas_status, &fwrc, sizeof(argp->rtas_status)))
+		ret = -EFAULT;
+err_free:
+	kfree(ssp);
+err_return:
+	return ret;
+}
+
+static long lparctl_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	long ret;
+
+	switch (ioctl) {
+	case LPARCTL_GET_SYSPARM:
+		ret = lparctl_get_sysparm(argp);
+		break;
+	case LPARCTL_SET_SYSPARM:
+		ret = lparctl_set_sysparm(argp);
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+	return ret;
+}
+
+static const struct file_operations lparctl_ops = {
+	.unlocked_ioctl = lparctl_dev_ioctl,
+	.llseek		= noop_llseek,
+};
+
+static struct miscdevice lparctl_dev = {
+	MISC_DYNAMIC_MINOR,
+	"lparctl",
+	&lparctl_ops,
+};
+
+static __init int lparctl_init(void)
+{
+	int ret;
+
+	ret = misc_register(&lparctl_dev);
+
+	return ret;
+}
+machine_device_initcall(pseries, lparctl_init);