diff mbox series

[RFC,1/1] s390/pci: expose UID checking state in sysfs

Message ID 20210111093857.24070-2-schnelle@linux.ibm.com
State New
Headers show
Series PCI: s390 global attribute "UID Checking" | expand

Commit Message

Niklas Schnelle Jan. 11, 2021, 9:38 a.m. UTC
We use the UID of a zPCI adapter, or the UID of the function zero if
there are multiple functions in an adapter, as PCI domain if and only if
UID Checking is turned on.
Otherwise we automatically generate domains as devices appear.

The state of UID Checking is thus essential to know if the PCI domain
will be stable, yet currently there is no way to access this information
from userspace.
So let's solve this by showing the state of UID checking as a sysfs
attribute in /sys/bus/pci/uid_checking

Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
---
 Documentation/ABI/testing/sysfs-bus-pci | 11 ++++++++
 arch/s390/include/asm/pci.h             |  3 +++
 arch/s390/pci/pci.c                     |  4 +++
 arch/s390/pci/pci_sysfs.c               | 34 +++++++++++++++++++++++++
 4 files changed, 52 insertions(+)

Comments

Bjorn Helgaas Jan. 12, 2021, 9:50 p.m. UTC | #1
On Mon, Jan 11, 2021 at 10:38:57AM +0100, Niklas Schnelle wrote:
> We use the UID of a zPCI adapter, or the UID of the function zero if
> there are multiple functions in an adapter, as PCI domain if and only if
> UID Checking is turned on.
> Otherwise we automatically generate domains as devices appear.
> 
> The state of UID Checking is thus essential to know if the PCI domain
> will be stable, yet currently there is no way to access this information
> from userspace.
> So let's solve this by showing the state of UID checking as a sysfs
> attribute in /sys/bus/pci/uid_checking

Cosmetic: can't tell if the above is two paragraphs separated by blank
lines or four separated by either blank lines or short last lines.
Please separate or reflow to avoid the ambiguity.

I don't have any input on the s390 issues, and I assume this will go
via the s390 tree.

> Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
> ---
>  Documentation/ABI/testing/sysfs-bus-pci | 11 ++++++++
>  arch/s390/include/asm/pci.h             |  3 +++
>  arch/s390/pci/pci.c                     |  4 +++
>  arch/s390/pci/pci_sysfs.c               | 34 +++++++++++++++++++++++++
>  4 files changed, 52 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> index 25c9c39770c6..a174aac0ebb0 100644
> --- a/Documentation/ABI/testing/sysfs-bus-pci
> +++ b/Documentation/ABI/testing/sysfs-bus-pci
> @@ -375,3 +375,14 @@ Description:
>  		The value comes from the PCI kernel device state and can be one
>  		of: "unknown", "error", "D0", D1", "D2", "D3hot", "D3cold".
>  		The file is read only.
> +What:		/sys/bus/pci/zpci/uid_checking
> +Date:		December 2020
> +Contact:	Niklas Schnelle <schnelle@linux.ibm.com>
> +Description:
> +		This attribute exposes the global state of UID Checking on
> +		an s390 Linux system. If UID Checking is on this file
> +		contains '1' otherwise '0'. If UID Checking is on the UID of
> +		a zPCI device, or the UID of function zero for a multi-function
> +		device will be used as its PCI Domain number. If UID Checking
> +		is off PCI Domain numbers are generated automatically and
> +		are not stable across reboots.
> diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
> index 212628932ddc..3cfa6cc701ba 100644
> --- a/arch/s390/include/asm/pci.h
> +++ b/arch/s390/include/asm/pci.h
> @@ -285,6 +285,9 @@ void zpci_debug_exit_device(struct zpci_dev *);
>  /* Error reporting */
>  int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *);
>  
> +/* Sysfs Entries */
> +int zpci_sysfs_init(void);
> +
>  #ifdef CONFIG_NUMA
>  
>  /* Returns the node based on PCI bus */
> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
> index 41df8fcfddde..c16c93e5f9af 100644
> --- a/arch/s390/pci/pci.c
> +++ b/arch/s390/pci/pci.c
> @@ -881,6 +881,10 @@ static int __init pci_base_init(void)
>  	if (rc)
>  		goto out_find;
>  
> +	rc = zpci_sysfs_init();
> +	if (rc)
> +		goto out_find;
> +
>  	s390_pci_initialized = 1;
>  	return 0;
>  
> diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c
> index 5c028bee91b9..d00690f73539 100644
> --- a/arch/s390/pci/pci_sysfs.c
> +++ b/arch/s390/pci/pci_sysfs.c
> @@ -172,3 +172,37 @@ const struct attribute_group *zpci_attr_groups[] = {
>  	&pfip_attr_group,
>  	NULL,
>  };
> +
> +/* Global zPCI attributes */
> +static ssize_t uid_checking_show(struct kobject *kobj,
> +				 struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%i\n", zpci_unique_uid);
> +}
> +
> +static struct kobj_attribute sys_zpci_uid_checking_attr =
> +	__ATTR(uid_checking, 0444, uid_checking_show, NULL);

Use DEVICE_ATTR_RO instead of __ATTR.

> +static struct kset *zpci_global_kset;
> +
> +static struct attribute *zpci_attrs_global[] = {
> +	&sys_zpci_uid_checking_attr.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group zpci_attr_group_global = {
> +	.attrs = zpci_attrs_global,
> +};
> +
> +int __init zpci_sysfs_init(void)
> +{
> +	struct kset *pci_bus_kset;
> +
> +	pci_bus_kset = bus_get_kset(&pci_bus_type);
> +
> +	zpci_global_kset = kset_create_and_add("zpci", NULL, &pci_bus_kset->kobj);
> +	if (!zpci_global_kset)
> +		return -ENOMEM;
> +
> +	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
> +}
> -- 
> 2.17.1
>
Niklas Schnelle Jan. 13, 2021, 7:47 a.m. UTC | #2
On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> On Mon, Jan 11, 2021 at 10:38:57AM +0100, Niklas Schnelle wrote:
>> We use the UID of a zPCI adapter, or the UID of the function zero if
>> there are multiple functions in an adapter, as PCI domain if and only if
>> UID Checking is turned on.
>> Otherwise we automatically generate domains as devices appear.
>>
>> The state of UID Checking is thus essential to know if the PCI domain
>> will be stable, yet currently there is no way to access this information
>> from userspace.
>> So let's solve this by showing the state of UID checking as a sysfs
>> attribute in /sys/bus/pci/uid_checking
> 
> Cosmetic: can't tell if the above is two paragraphs separated by blank
> lines or four separated by either blank lines or short last lines.
> Please separate or reflow to avoid the ambiguity.

Thanks, you're right I split it in 3 proper paragraphs now.
Also the commit message was out of sync with the documentation,
cover letter and code. This version actually uses
/sys/bus/pci/zpci/uid_checking sorry about that.

> 
> I don't have any input on the s390 issues, and I assume this will go
> via the s390 tree.
> 
>> Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
>> ---
>>  Documentation/ABI/testing/sysfs-bus-pci | 11 ++++++++
>>  arch/s390/include/asm/pci.h             |  3 +++
>>  arch/s390/pci/pci.c                     |  4 +++
>>  arch/s390/pci/pci_sysfs.c               | 34 +++++++++++++++++++++++++
>>  4 files changed, 52 insertions(+)
>>
>> diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
>> index 25c9c39770c6..a174aac0ebb0 100644
>> --- a/Documentation/ABI/testing/sysfs-bus-pci
>> +++ b/Documentation/ABI/testing/sysfs-bus-pci
>> @@ -375,3 +375,14 @@ Description:
>>  		The value comes from the PCI kernel device state and can be one
>>  		of: "unknown", "error", "D0", D1", "D2", "D3hot", "D3cold".
>>  		The file is read only.
>> +What:		/sys/bus/pci/zpci/uid_checking
>> +Date:		December 2020
>> +Contact:	Niklas Schnelle <schnelle@linux.ibm.com>
>> +Description:
>> +		This attribute exposes the global state of UID Checking on
>> +		an s390 Linux system. If UID Checking is on this file
>> +		contains '1' otherwise '0'. If UID Checking is on the UID of
>> +		a zPCI device, or the UID of function zero for a multi-function
>> +		device will be used as its PCI Domain number. If UID Checking
>> +		is off PCI Domain numbers are generated automatically and
>> +		are not stable across reboots.
>> diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
>> index 212628932ddc..3cfa6cc701ba 100644
>> --- a/arch/s390/include/asm/pci.h
>> +++ b/arch/s390/include/asm/pci.h
>> @@ -285,6 +285,9 @@ void zpci_debug_exit_device(struct zpci_dev *);
>>  /* Error reporting */
>>  int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *);
... snip ...
>> +
>> +/* Global zPCI attributes */
>> +static ssize_t uid_checking_show(struct kobject *kobj,
>> +				 struct kobj_attribute *attr, char *buf)
>> +{
>> +	return sprintf(buf, "%i\n", zpci_unique_uid);
>> +}
>> +
>> +static struct kobj_attribute sys_zpci_uid_checking_attr =
>> +	__ATTR(uid_checking, 0444, uid_checking_show, NULL);
> 
> Use DEVICE_ATTR_RO instead of __ATTR.

It's my understanding that DEVICE_ATTR_* is only for
per device attributes. This one is global for the entire
Z PCI. I just tried with BUS_ATTR_RO instead
and that works but only if I put the attribute at
/sys/bus/pci/uid_checking instead of with a zpci
subfolder. This path would work for us too, we
currently don't have any other global attributes
that we are planning to expose but those could of
course come up in the future.

> 
... snip ...
>>
Bjorn Helgaas Jan. 13, 2021, 6:55 p.m. UTC | #3
On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> > On Mon, Jan 11, 2021 at 10:38:57AM +0100, Niklas Schnelle wrote:
> >> We use the UID of a zPCI adapter, or the UID of the function zero if
> >> there are multiple functions in an adapter, as PCI domain if and only if
> >> UID Checking is turned on.
> >> Otherwise we automatically generate domains as devices appear.
> >>
> >> The state of UID Checking is thus essential to know if the PCI domain
> >> will be stable, yet currently there is no way to access this information
> >> from userspace.
> >> So let's solve this by showing the state of UID checking as a sysfs
> >> attribute in /sys/bus/pci/uid_checking

> >> +/* Global zPCI attributes */
> >> +static ssize_t uid_checking_show(struct kobject *kobj,
> >> +				 struct kobj_attribute *attr, char *buf)
> >> +{
> >> +	return sprintf(buf, "%i\n", zpci_unique_uid);
> >> +}
> >> +
> >> +static struct kobj_attribute sys_zpci_uid_checking_attr =
> >> +	__ATTR(uid_checking, 0444, uid_checking_show, NULL);
> > 
> > Use DEVICE_ATTR_RO instead of __ATTR.
> 
> It's my understanding that DEVICE_ATTR_* is only for
> per device attributes. This one is global for the entire
> Z PCI. I just tried with BUS_ATTR_RO instead
> and that works but only if I put the attribute at
> /sys/bus/pci/uid_checking instead of with a zpci
> subfolder. This path would work for us too, we
> currently don't have any other global attributes
> that we are planning to expose but those could of
> course come up in the future.

Ah, I missed the fact that this is a kobj_attribute, not a
device_attribute.  Maybe KERNEL_ATTR_RO()?  Very few uses so far, but
seems like it might fit?

Bjorn
Niklas Schnelle Jan. 14, 2021, 1:20 p.m. UTC | #4
On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
>>> On Mon, Jan 11, 2021 at 10:38:57AM +0100, Niklas Schnelle wrote:
>>>> We use the UID of a zPCI adapter, or the UID of the function zero if
>>>> there are multiple functions in an adapter, as PCI domain if and only if
>>>> UID Checking is turned on.
>>>> Otherwise we automatically generate domains as devices appear.
>>>>
>>>> The state of UID Checking is thus essential to know if the PCI domain
>>>> will be stable, yet currently there is no way to access this information
>>>> from userspace.
>>>> So let's solve this by showing the state of UID checking as a sysfs
>>>> attribute in /sys/bus/pci/uid_checking
> 
>>>> +/* Global zPCI attributes */
>>>> +static ssize_t uid_checking_show(struct kobject *kobj,
>>>> +				 struct kobj_attribute *attr, char *buf)
>>>> +{
>>>> +	return sprintf(buf, "%i\n", zpci_unique_uid);
>>>> +}
>>>> +
>>>> +static struct kobj_attribute sys_zpci_uid_checking_attr =
>>>> +	__ATTR(uid_checking, 0444, uid_checking_show, NULL);
>>>
>>> Use DEVICE_ATTR_RO instead of __ATTR.
>>
>> It's my understanding that DEVICE_ATTR_* is only for
>> per device attributes. This one is global for the entire
>> Z PCI. I just tried with BUS_ATTR_RO instead
>> and that works but only if I put the attribute at
>> /sys/bus/pci/uid_checking instead of with a zpci
>> subfolder. This path would work for us too, we
>> currently don't have any other global attributes
>> that we are planning to expose but those could of
>> course come up in the future.
> 
> Ah, I missed the fact that this is a kobj_attribute, not a
> device_attribute.  Maybe KERNEL_ATTR_RO()?  Very few uses so far, but
> seems like it might fit?
> 
> Bjorn
> 

KERNEL_ATTR_* is currently not exported in any header. After
adding it to include/linuc/sysfs.h it indeed works perfectly.
Adding Christian Brauner as suggested by get_maintainers for
their opinion. I'm of course willing to provide a patch
for that move should it be desired.

@Bjorn apart from the correct macro do you have a preference
for either suggested path /sys/bus/pci/zpci/uid_checking vs
/sys/bus/pci/uid_checking?

For completeness some further internal discussion lets
us tend to rather name it to "unique_uids" but I guess that
doesn't make a difference to non s390 people ;-)
Christian Brauner Jan. 14, 2021, 1:44 p.m. UTC | #5
On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
> 
> 
> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
> > On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
> >> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> >>> On Mon, Jan 11, 2021 at 10:38:57AM +0100, Niklas Schnelle wrote:
> >>>> We use the UID of a zPCI adapter, or the UID of the function zero if
> >>>> there are multiple functions in an adapter, as PCI domain if and only if
> >>>> UID Checking is turned on.
> >>>> Otherwise we automatically generate domains as devices appear.
> >>>>
> >>>> The state of UID Checking is thus essential to know if the PCI domain
> >>>> will be stable, yet currently there is no way to access this information
> >>>> from userspace.
> >>>> So let's solve this by showing the state of UID checking as a sysfs
> >>>> attribute in /sys/bus/pci/uid_checking
> > 
> >>>> +/* Global zPCI attributes */
> >>>> +static ssize_t uid_checking_show(struct kobject *kobj,
> >>>> +				 struct kobj_attribute *attr, char *buf)
> >>>> +{
> >>>> +	return sprintf(buf, "%i\n", zpci_unique_uid);
> >>>> +}
> >>>> +
> >>>> +static struct kobj_attribute sys_zpci_uid_checking_attr =
> >>>> +	__ATTR(uid_checking, 0444, uid_checking_show, NULL);
> >>>
> >>> Use DEVICE_ATTR_RO instead of __ATTR.
> >>
> >> It's my understanding that DEVICE_ATTR_* is only for
> >> per device attributes. This one is global for the entire
> >> Z PCI. I just tried with BUS_ATTR_RO instead
> >> and that works but only if I put the attribute at
> >> /sys/bus/pci/uid_checking instead of with a zpci
> >> subfolder. This path would work for us too, we
> >> currently don't have any other global attributes
> >> that we are planning to expose but those could of
> >> course come up in the future.
> > 
> > Ah, I missed the fact that this is a kobj_attribute, not a
> > device_attribute.  Maybe KERNEL_ATTR_RO()?  Very few uses so far, but
> > seems like it might fit?
> > 
> > Bjorn
> > 
> 
> KERNEL_ATTR_* is currently not exported in any header. After
> adding it to include/linuc/sysfs.h it indeed works perfectly.
> Adding Christian Brauner as suggested by get_maintainers for
> their opinion. I'm of course willing to provide a patch

Hey Niklas et al. :)

I think this will need input from Greg. He should be best versed in
sysfs attributes. The problem with KERNEL_ATTR_* to me seems that it's
supposed to be kernel internal. Now, that might just be a matter of
renaming the macro but let's see whether Greg has any better idea or
more questions. :)

Christian
Greg Kroah-Hartman Jan. 14, 2021, 1:58 p.m. UTC | #6
On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
> > 
> > 
> > On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
> > > On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
> > >> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> > >>> On Mon, Jan 11, 2021 at 10:38:57AM +0100, Niklas Schnelle wrote:
> > >>>> We use the UID of a zPCI adapter, or the UID of the function zero if
> > >>>> there are multiple functions in an adapter, as PCI domain if and only if
> > >>>> UID Checking is turned on.
> > >>>> Otherwise we automatically generate domains as devices appear.
> > >>>>
> > >>>> The state of UID Checking is thus essential to know if the PCI domain
> > >>>> will be stable, yet currently there is no way to access this information
> > >>>> from userspace.
> > >>>> So let's solve this by showing the state of UID checking as a sysfs
> > >>>> attribute in /sys/bus/pci/uid_checking
> > > 
> > >>>> +/* Global zPCI attributes */
> > >>>> +static ssize_t uid_checking_show(struct kobject *kobj,
> > >>>> +				 struct kobj_attribute *attr, char *buf)
> > >>>> +{
> > >>>> +	return sprintf(buf, "%i\n", zpci_unique_uid);
> > >>>> +}
> > >>>> +
> > >>>> +static struct kobj_attribute sys_zpci_uid_checking_attr =
> > >>>> +	__ATTR(uid_checking, 0444, uid_checking_show, NULL);
> > >>>
> > >>> Use DEVICE_ATTR_RO instead of __ATTR.
> > >>
> > >> It's my understanding that DEVICE_ATTR_* is only for
> > >> per device attributes. This one is global for the entire
> > >> Z PCI. I just tried with BUS_ATTR_RO instead
> > >> and that works but only if I put the attribute at
> > >> /sys/bus/pci/uid_checking instead of with a zpci
> > >> subfolder. This path would work for us too, we
> > >> currently don't have any other global attributes
> > >> that we are planning to expose but those could of
> > >> course come up in the future.
> > > 
> > > Ah, I missed the fact that this is a kobj_attribute, not a
> > > device_attribute.  Maybe KERNEL_ATTR_RO()?  Very few uses so far, but
> > > seems like it might fit?
> > > 
> > > Bjorn
> > > 
> > 
> > KERNEL_ATTR_* is currently not exported in any header. After
> > adding it to include/linuc/sysfs.h it indeed works perfectly.
> > Adding Christian Brauner as suggested by get_maintainers for
> > their opinion. I'm of course willing to provide a patch
> 
> Hey Niklas et al. :)
> 
> I think this will need input from Greg. He should be best versed in
> sysfs attributes. The problem with KERNEL_ATTR_* to me seems that it's
> supposed to be kernel internal. Now, that might just be a matter of
> renaming the macro but let's see whether Greg has any better idea or
> more questions. :)

The big question is, why are you needing this?

No driver or driver subsystem should EVER be messing with a "raw"
kobject like this.  Just use the existing DEVICE_* macros instead
please.

If you are using a raw kobject, please ask me how to do this properly,
as that is something that should NEVER show up in the /sys/devices/*
tree.  Otherwise userspace tools will break.

thanks,

greg k-h
Niklas Schnelle Jan. 14, 2021, 3:06 p.m. UTC | #7
On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
>>>
>>>
>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
>>>>>> On Mon, Jan 11, 2021 at 10:38:57AM +0100, Niklas Schnelle wrote:
>>>>>>> We use the UID of a zPCI adapter, or the UID of the function zero if
>>>>>>> there are multiple functions in an adapter, as PCI domain if and only if
>>>>>>> UID Checking is turned on.
>>>>>>> Otherwise we automatically generate domains as devices appear.
>>>>>>>
>>>>>>> The state of UID Checking is thus essential to know if the PCI domain
>>>>>>> will be stable, yet currently there is no way to access this information
>>>>>>> from userspace.
>>>>>>> So let's solve this by showing the state of UID checking as a sysfs
>>>>>>> attribute in /sys/bus/pci/uid_checking
>>>>
>>>>>>> +/* Global zPCI attributes */
>>>>>>> +static ssize_t uid_checking_show(struct kobject *kobj,
>>>>>>> +				 struct kobj_attribute *attr, char *buf)
>>>>>>> +{
>>>>>>> +	return sprintf(buf, "%i\n", zpci_unique_uid);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static struct kobj_attribute sys_zpci_uid_checking_attr =
>>>>>>> +	__ATTR(uid_checking, 0444, uid_checking_show, NULL);
>>>>>>
>>>>>> Use DEVICE_ATTR_RO instead of __ATTR.
>>>>>
>>>>> It's my understanding that DEVICE_ATTR_* is only for
>>>>> per device attributes. This one is global for the entire
>>>>> Z PCI. I just tried with BUS_ATTR_RO instead
>>>>> and that works but only if I put the attribute at
>>>>> /sys/bus/pci/uid_checking instead of with a zpci
>>>>> subfolder. This path would work for us too, we
>>>>> currently don't have any other global attributes
>>>>> that we are planning to expose but those could of
>>>>> course come up in the future.
>>>>
>>>> Ah, I missed the fact that this is a kobj_attribute, not a
>>>> device_attribute.  Maybe KERNEL_ATTR_RO()?  Very few uses so far, but
>>>> seems like it might fit?
>>>>
>>>> Bjorn
>>>>
>>>
>>> KERNEL_ATTR_* is currently not exported in any header. After
>>> adding it to include/linuc/sysfs.h it indeed works perfectly.
>>> Adding Christian Brauner as suggested by get_maintainers for
>>> their opinion. I'm of course willing to provide a patch
>>
>> Hey Niklas et al. :)
>>
>> I think this will need input from Greg. He should be best versed in
>> sysfs attributes. The problem with KERNEL_ATTR_* to me seems that it's
>> supposed to be kernel internal. Now, that might just be a matter of
>> renaming the macro but let's see whether Greg has any better idea or
>> more questions. :)
> 
> The big question is, why are you needing this?
> 
> No driver or driver subsystem should EVER be messing with a "raw"
> kobject like this.  Just use the existing DEVICE_* macros instead
> please.
> 
> If you are using a raw kobject, please ask me how to do this properly,
> as that is something that should NEVER show up in the /sys/devices/*
> tree.  Otherwise userspace tools will break.
> 
> thanks,
> 
> greg k-h

Hi Greg,

this is for an architecture specific but global i.e. not device bound PCI
attribute. That's why DEVICE_ATTR_* does not work. BUS_ATTR_* would work
but only if we place the attribute directly under /sys/bus/pci/new_attr.

I'm aware that this is quite unusual in fact I couldn't find anything
similar. That's why this is an RFC, with a lengthy cover letter
explaining our use case, that I sent to Bjorn to figure out where to
even place the attribute.

So I guess this is indeed me asking you how to do this properly.
That said it does not show up under /sys/devices/* only /sys/bus/pci/*.

Best regards,
Niklas
Greg Kroah-Hartman Jan. 14, 2021, 3:17 p.m. UTC | #8
On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
> 
> 
> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
> > On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
> >> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
> >>>
> >>>
> >>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
> >>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
> >>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> >>>>>> On Mon, Jan 11, 2021 at 10:38:57AM +0100, Niklas Schnelle wrote:
> >>>>>>> We use the UID of a zPCI adapter, or the UID of the function zero if
> >>>>>>> there are multiple functions in an adapter, as PCI domain if and only if
> >>>>>>> UID Checking is turned on.
> >>>>>>> Otherwise we automatically generate domains as devices appear.
> >>>>>>>
> >>>>>>> The state of UID Checking is thus essential to know if the PCI domain
> >>>>>>> will be stable, yet currently there is no way to access this information
> >>>>>>> from userspace.
> >>>>>>> So let's solve this by showing the state of UID checking as a sysfs
> >>>>>>> attribute in /sys/bus/pci/uid_checking
> >>>>
> >>>>>>> +/* Global zPCI attributes */
> >>>>>>> +static ssize_t uid_checking_show(struct kobject *kobj,
> >>>>>>> +				 struct kobj_attribute *attr, char *buf)
> >>>>>>> +{
> >>>>>>> +	return sprintf(buf, "%i\n", zpci_unique_uid);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static struct kobj_attribute sys_zpci_uid_checking_attr =
> >>>>>>> +	__ATTR(uid_checking, 0444, uid_checking_show, NULL);
> >>>>>>
> >>>>>> Use DEVICE_ATTR_RO instead of __ATTR.
> >>>>>
> >>>>> It's my understanding that DEVICE_ATTR_* is only for
> >>>>> per device attributes. This one is global for the entire
> >>>>> Z PCI. I just tried with BUS_ATTR_RO instead
> >>>>> and that works but only if I put the attribute at
> >>>>> /sys/bus/pci/uid_checking instead of with a zpci
> >>>>> subfolder. This path would work for us too, we
> >>>>> currently don't have any other global attributes
> >>>>> that we are planning to expose but those could of
> >>>>> course come up in the future.
> >>>>
> >>>> Ah, I missed the fact that this is a kobj_attribute, not a
> >>>> device_attribute.  Maybe KERNEL_ATTR_RO()?  Very few uses so far, but
> >>>> seems like it might fit?
> >>>>
> >>>> Bjorn
> >>>>
> >>>
> >>> KERNEL_ATTR_* is currently not exported in any header. After
> >>> adding it to include/linuc/sysfs.h it indeed works perfectly.
> >>> Adding Christian Brauner as suggested by get_maintainers for
> >>> their opinion. I'm of course willing to provide a patch
> >>
> >> Hey Niklas et al. :)
> >>
> >> I think this will need input from Greg. He should be best versed in
> >> sysfs attributes. The problem with KERNEL_ATTR_* to me seems that it's
> >> supposed to be kernel internal. Now, that might just be a matter of
> >> renaming the macro but let's see whether Greg has any better idea or
> >> more questions. :)
> > 
> > The big question is, why are you needing this?
> > 
> > No driver or driver subsystem should EVER be messing with a "raw"
> > kobject like this.  Just use the existing DEVICE_* macros instead
> > please.
> > 
> > If you are using a raw kobject, please ask me how to do this properly,
> > as that is something that should NEVER show up in the /sys/devices/*
> > tree.  Otherwise userspace tools will break.
> > 
> > thanks,
> > 
> > greg k-h
> 
> Hi Greg,
> 
> this is for an architecture specific but global i.e. not device bound PCI
> attribute. That's why DEVICE_ATTR_* does not work. BUS_ATTR_* would work
> but only if we place the attribute directly under /sys/bus/pci/new_attr.

Then you are doing something wrong :)

Where _exactly_ are you wanting to put this attribute?

> I'm aware that this is quite unusual in fact I couldn't find anything
> similar. That's why this is an RFC, with a lengthy cover letter
> explaining our use case, that I sent to Bjorn to figure out where to
> even place the attribute.
> 
> So I guess this is indeed me asking you how to do this properly.
> That said it does not show up under /sys/devices/* only /sys/bus/pci/*.

Do NOT put random kobjects under a bus subsystem.  If you need that,
then use BUS_ATTR_* as that is what it is there for.

Again, if you are in a driver subsystem, do not use a raw kobject.
Either something is already there for you, or what you want to do is not
correct :)

thanks,

greg k-h
Niklas Schnelle Jan. 14, 2021, 3:51 p.m. UTC | #9
On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
>>
>>
>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
>>>>>
>>>>>
>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
>>>>>>>> On Mon, Jan 11, 2021 at 10:38:57AM +0100, Niklas Schnelle wrote:
>>>>>>>>> We use the UID of a zPCI adapter, or the UID of the function zero if
>>>>>>>>> there are multiple functions in an adapter, as PCI domain if and only if
>>>>>>>>> UID Checking is turned on.
>>>>>>>>> Otherwise we automatically generate domains as devices appear.
>>>>>>>>>
>>>>>>>>> The state of UID Checking is thus essential to know if the PCI domain
>>>>>>>>> will be stable, yet currently there is no way to access this information
>>>>>>>>> from userspace.
>>>>>>>>> So let's solve this by showing the state of UID checking as a sysfs
>>>>>>>>> attribute in /sys/bus/pci/uid_checking
>>>>>>
>>>>>>>>> +/* Global zPCI attributes */
>>>>>>>>> +static ssize_t uid_checking_show(struct kobject *kobj,
>>>>>>>>> +				 struct kobj_attribute *attr, char *buf)
>>>>>>>>> +{
>>>>>>>>> +	return sprintf(buf, "%i\n", zpci_unique_uid);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static struct kobj_attribute sys_zpci_uid_checking_attr =
>>>>>>>>> +	__ATTR(uid_checking, 0444, uid_checking_show, NULL);
>>>>>>>>
>>>>>>>> Use DEVICE_ATTR_RO instead of __ATTR.
>>>>>>>
>>>>>>> It's my understanding that DEVICE_ATTR_* is only for
>>>>>>> per device attributes. This one is global for the entire
>>>>>>> Z PCI. I just tried with BUS_ATTR_RO instead
>>>>>>> and that works but only if I put the attribute at
>>>>>>> /sys/bus/pci/uid_checking instead of with a zpci
>>>>>>> subfolder. This path would work for us too, we
>>>>>>> currently don't have any other global attributes
>>>>>>> that we are planning to expose but those could of
>>>>>>> course come up in the future.
>>>>>>
>>>>>> Ah, I missed the fact that this is a kobj_attribute, not a
>>>>>> device_attribute.  Maybe KERNEL_ATTR_RO()?  Very few uses so far, but
>>>>>> seems like it might fit?
>>>>>>
>>>>>> Bjorn
>>>>>>
>>>>>
>>>>> KERNEL_ATTR_* is currently not exported in any header. After
>>>>> adding it to include/linuc/sysfs.h it indeed works perfectly.
>>>>> Adding Christian Brauner as suggested by get_maintainers for
>>>>> their opinion. I'm of course willing to provide a patch
>>>>
>>>> Hey Niklas et al. :)
>>>>
>>>> I think this will need input from Greg. He should be best versed in
>>>> sysfs attributes. The problem with KERNEL_ATTR_* to me seems that it's
>>>> supposed to be kernel internal. Now, that might just be a matter of
>>>> renaming the macro but let's see whether Greg has any better idea or
>>>> more questions. :)
>>>
>>> The big question is, why are you needing this?
>>>
>>> No driver or driver subsystem should EVER be messing with a "raw"
>>> kobject like this.  Just use the existing DEVICE_* macros instead
>>> please.
>>>
>>> If you are using a raw kobject, please ask me how to do this properly,
>>> as that is something that should NEVER show up in the /sys/devices/*
>>> tree.  Otherwise userspace tools will break.
>>>
>>> thanks,
>>>
>>> greg k-h
>>
>> Hi Greg,
>>
>> this is for an architecture specific but global i.e. not device bound PCI
>> attribute. That's why DEVICE_ATTR_* does not work. BUS_ATTR_* would work
>> but only if we place the attribute directly under /sys/bus/pci/new_attr.
> 
> Then you are doing something wrong :)

That is very possible.

> 
> Where _exactly_ are you wanting to put this attribute?

I'm trying for /sys/bus/pci/zpci/uid_checking, I'm using
the below code and the attribute even shows up but reading
it gives me two 0 bytes only.
The relevant code is only a slight alteration of the original patch
as follows:

static ssize_t uid_checking_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "%i\n", zpci_unique_uid);
}
static BUS_ATTR_RO(uid_checking);

static struct kset *zpci_global_kset;

static struct attribute *zpci_attrs_global[] = {
	&bus_attr_uid_checking.attr,
	NULL,
};

static struct attribute_group zpci_attr_group_global = {
	.attrs = zpci_attrs_global,
};

int __init zpci_sysfs_init(void)
{
	struct kset *pci_bus_kset;

	pci_bus_kset = bus_get_kset(&pci_bus_type);

	zpci_global_kset = kset_create_and_add("zpci", NULL, &pci_bus_kset->kobj);
	if (!zpci_global_kset)
		return -ENOMEM;

	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
}


> 
>> I'm aware that this is quite unusual in fact I couldn't find anything
>> similar. That's why this is an RFC, with a lengthy cover letter
>> explaining our use case, that I sent to Bjorn to figure out where to
>> even place the attribute.
>>
>> So I guess this is indeed me asking you how to do this properly.
>> That said it does not show up under /sys/devices/* only /sys/bus/pci/*.
> 
> Do NOT put random kobjects under a bus subsystem.  If you need that,
> then use BUS_ATTR_* as that is what it is there for.
> 
> Again, if you are in a driver subsystem, do not use a raw kobject.
> Either something is already there for you, or what you want to do is not
> correct :)

Understood and thanks for the clear advice!

> 
> thanks,
> 
> greg k-h
>
Greg Kroah-Hartman Jan. 14, 2021, 4:14 p.m. UTC | #10
On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
> 
> 
> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
> > On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
> >>
> >>
> >> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
> >>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
> >>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
> >>>>>
> >>>>>
> >>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
> >>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
> >>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> >>>>>>>> On Mon, Jan 11, 2021 at 10:38:57AM +0100, Niklas Schnelle wrote:
> >>>>>>>>> We use the UID of a zPCI adapter, or the UID of the function zero if
> >>>>>>>>> there are multiple functions in an adapter, as PCI domain if and only if
> >>>>>>>>> UID Checking is turned on.
> >>>>>>>>> Otherwise we automatically generate domains as devices appear.
> >>>>>>>>>
> >>>>>>>>> The state of UID Checking is thus essential to know if the PCI domain
> >>>>>>>>> will be stable, yet currently there is no way to access this information
> >>>>>>>>> from userspace.
> >>>>>>>>> So let's solve this by showing the state of UID checking as a sysfs
> >>>>>>>>> attribute in /sys/bus/pci/uid_checking
> >>>>>>
> >>>>>>>>> +/* Global zPCI attributes */
> >>>>>>>>> +static ssize_t uid_checking_show(struct kobject *kobj,
> >>>>>>>>> +				 struct kobj_attribute *attr, char *buf)
> >>>>>>>>> +{
> >>>>>>>>> +	return sprintf(buf, "%i\n", zpci_unique_uid);
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +static struct kobj_attribute sys_zpci_uid_checking_attr =
> >>>>>>>>> +	__ATTR(uid_checking, 0444, uid_checking_show, NULL);
> >>>>>>>>
> >>>>>>>> Use DEVICE_ATTR_RO instead of __ATTR.
> >>>>>>>
> >>>>>>> It's my understanding that DEVICE_ATTR_* is only for
> >>>>>>> per device attributes. This one is global for the entire
> >>>>>>> Z PCI. I just tried with BUS_ATTR_RO instead
> >>>>>>> and that works but only if I put the attribute at
> >>>>>>> /sys/bus/pci/uid_checking instead of with a zpci
> >>>>>>> subfolder. This path would work for us too, we
> >>>>>>> currently don't have any other global attributes
> >>>>>>> that we are planning to expose but those could of
> >>>>>>> course come up in the future.
> >>>>>>
> >>>>>> Ah, I missed the fact that this is a kobj_attribute, not a
> >>>>>> device_attribute.  Maybe KERNEL_ATTR_RO()?  Very few uses so far, but
> >>>>>> seems like it might fit?
> >>>>>>
> >>>>>> Bjorn
> >>>>>>
> >>>>>
> >>>>> KERNEL_ATTR_* is currently not exported in any header. After
> >>>>> adding it to include/linuc/sysfs.h it indeed works perfectly.
> >>>>> Adding Christian Brauner as suggested by get_maintainers for
> >>>>> their opinion. I'm of course willing to provide a patch
> >>>>
> >>>> Hey Niklas et al. :)
> >>>>
> >>>> I think this will need input from Greg. He should be best versed in
> >>>> sysfs attributes. The problem with KERNEL_ATTR_* to me seems that it's
> >>>> supposed to be kernel internal. Now, that might just be a matter of
> >>>> renaming the macro but let's see whether Greg has any better idea or
> >>>> more questions. :)
> >>>
> >>> The big question is, why are you needing this?
> >>>
> >>> No driver or driver subsystem should EVER be messing with a "raw"
> >>> kobject like this.  Just use the existing DEVICE_* macros instead
> >>> please.
> >>>
> >>> If you are using a raw kobject, please ask me how to do this properly,
> >>> as that is something that should NEVER show up in the /sys/devices/*
> >>> tree.  Otherwise userspace tools will break.
> >>>
> >>> thanks,
> >>>
> >>> greg k-h
> >>
> >> Hi Greg,
> >>
> >> this is for an architecture specific but global i.e. not device bound PCI
> >> attribute. That's why DEVICE_ATTR_* does not work. BUS_ATTR_* would work
> >> but only if we place the attribute directly under /sys/bus/pci/new_attr.
> > 
> > Then you are doing something wrong :)
> 
> That is very possible.
> 
> > 
> > Where _exactly_ are you wanting to put this attribute?
> 
> I'm trying for /sys/bus/pci/zpci/uid_checking, I'm using
> the below code and the attribute even shows up but reading
> it gives me two 0 bytes only.
> The relevant code is only a slight alteration of the original patch
> as follows:
> 
> static ssize_t uid_checking_show(struct bus_type *bus, char *buf)
> {
> 	return sprintf(buf, "%i\n", zpci_unique_uid);
> }
> static BUS_ATTR_RO(uid_checking);
> 
> static struct kset *zpci_global_kset;
> 
> static struct attribute *zpci_attrs_global[] = {
> 	&bus_attr_uid_checking.attr,
> 	NULL,
> };
> 
> static struct attribute_group zpci_attr_group_global = {
> 	.attrs = zpci_attrs_global,
> };

Name your attribute group, and then you do not have to mess with a
"raw" kobject like you are below:

> 
> int __init zpci_sysfs_init(void)
> {
> 	struct kset *pci_bus_kset;
> 
> 	pci_bus_kset = bus_get_kset(&pci_bus_type);
> 
> 	zpci_global_kset = kset_create_and_add("zpci", NULL, &pci_bus_kset->kobj);

No, do not mess with at kset, just set the default attribute group for
the bus to the above, and you should be fine.

> 	if (!zpci_global_kset)
> 		return -ENOMEM;
> 
> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);

Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
that's usually a huge clue that you are doing something wrong.

Try the above again, with a simple attribute group, and name for it, and
it should "just work".

thanks,

greg k-h
Niklas Schnelle Jan. 15, 2021, 11:20 a.m. UTC | #11
On 1/14/21 5:14 PM, Greg Kroah-Hartman wrote:
> On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
>>
>>
>> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
>>> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
>>>>
>>>>
>>>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
>>>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
>>>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
>>>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
>>>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
... snip ...
>>>>>>
>>>>>> Hey Niklas et al. :)
>>>>>>
>>>>>> I think this will need input from Greg. He should be best versed in
>>>>>> sysfs attributes. The problem with KERNEL_ATTR_* to me seems that it's
>>>>>> supposed to be kernel internal. Now, that might just be a matter of
>>>>>> renaming the macro but let's see whether Greg has any better idea or
>>>>>> more questions. :)
>>>>>
>>>>> The big question is, why are you needing this?
>>>>>
>>>>> No driver or driver subsystem should EVER be messing with a "raw"
>>>>> kobject like this.  Just use the existing DEVICE_* macros instead
>>>>> please.
>>>>>
>>>>> If you are using a raw kobject, please ask me how to do this properly,
>>>>> as that is something that should NEVER show up in the /sys/devices/*
>>>>> tree.  Otherwise userspace tools will break.
>>>>>
>>>>> thanks,
>>>>>
>>>>> greg k-h
>>>>
>>>> Hi Greg,
>>>>
>>>> this is for an architecture specific but global i.e. not device bound PCI
>>>> attribute. That's why DEVICE_ATTR_* does not work. BUS_ATTR_* would work
>>>> but only if we place the attribute directly under /sys/bus/pci/new_attr.
>>>
>>> Then you are doing something wrong :)
>>
>> That is very possible.
>>
>>>
>>> Where _exactly_ are you wanting to put this attribute?
>>
>> I'm trying for /sys/bus/pci/zpci/uid_checking, I'm using
>> the below code and the attribute even shows up but reading
>> it gives me two 0 bytes only.
>> The relevant code is only a slight alteration of the original patch
>> as follows:
>>
>> static ssize_t uid_checking_show(struct bus_type *bus, char *buf)
>> {
>> 	return sprintf(buf, "%i\n", zpci_unique_uid);
>> }
>> static BUS_ATTR_RO(uid_checking);
>>
>> static struct kset *zpci_global_kset;
>>
>> static struct attribute *zpci_attrs_global[] = {
>> 	&bus_attr_uid_checking.attr,
>> 	NULL,
>> };
>>
>> static struct attribute_group zpci_attr_group_global = {
>> 	.attrs = zpci_attrs_global,
>> };
> 
> Name your attribute group, and then you do not have to mess with a
> "raw" kobject like you are below:

Thanks for this tip and sorry for bothering you again.

> 
>>
>> int __init zpci_sysfs_init(void)
>> {
>> 	struct kset *pci_bus_kset;
>>
>> 	pci_bus_kset = bus_get_kset(&pci_bus_type);
>>
>> 	zpci_global_kset = kset_create_and_add("zpci", NULL, &pci_bus_kset->kobj);
> 
> No, do not mess with at kset, just set the default attribute group for
> the bus to the above, and you should be fine.

Oh ok, I got this idea from the code adding /sys/bus/pci/slots/ in
drivers/pci/slot.c:pci_slot_init(). See below maybe we can clean that up too.

> 
>> 	if (!zpci_global_kset)
>> 		return -ENOMEM;
>>
>> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
> 
> Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
> that's usually a huge clue that you are doing something wrong.
> 
> Try the above again, with a simple attribute group, and name for it, and
> it should "just work".

I'm probably missing something but I don't get how this could work in
this case. If I'm seeing this right the default attribute group here
is pci_bus_type.bus_groups and that is already set in drivers/pci/pci-driver.c
so I don't think I should set that.

I did however find bus_create_file() which does work when using the path
/sys/bus/pci/uid_checking instead. This would work for us if Bjorn is okay with
that path and the code is really clean and simple too.

That said, I think we could also add something like bus_create_group().
Then we could use that to also clean up drivers/pci/slot.c:pci_slot_init()
and get the original path /sys/bus/pci/zpci/uid_checking.

I think this would also allow us to get rid of pci_bus_get_kset() which is
only used in that function and seems to me like it encourages use of raw ksets.
Or I'm completely off the mark and just missing something important.

thanks,
Niklas
Greg Kroah-Hartman Jan. 15, 2021, 12:02 p.m. UTC | #12
On Fri, Jan 15, 2021 at 12:20:59PM +0100, Niklas Schnelle wrote:
> 
> 
> On 1/14/21 5:14 PM, Greg Kroah-Hartman wrote:
> > On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
> >>
> >>
> >> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
> >>> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
> >>>>
> >>>>
> >>>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
> >>>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
> >>>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
> >>>>>>>
> >>>>>>>
> >>>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
> >>>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
> >>>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> ... snip ...
> >>>>>>
> >>>>>> Hey Niklas et al. :)
> >>>>>>
> >>>>>> I think this will need input from Greg. He should be best versed in
> >>>>>> sysfs attributes. The problem with KERNEL_ATTR_* to me seems that it's
> >>>>>> supposed to be kernel internal. Now, that might just be a matter of
> >>>>>> renaming the macro but let's see whether Greg has any better idea or
> >>>>>> more questions. :)
> >>>>>
> >>>>> The big question is, why are you needing this?
> >>>>>
> >>>>> No driver or driver subsystem should EVER be messing with a "raw"
> >>>>> kobject like this.  Just use the existing DEVICE_* macros instead
> >>>>> please.
> >>>>>
> >>>>> If you are using a raw kobject, please ask me how to do this properly,
> >>>>> as that is something that should NEVER show up in the /sys/devices/*
> >>>>> tree.  Otherwise userspace tools will break.
> >>>>>
> >>>>> thanks,
> >>>>>
> >>>>> greg k-h
> >>>>
> >>>> Hi Greg,
> >>>>
> >>>> this is for an architecture specific but global i.e. not device bound PCI
> >>>> attribute. That's why DEVICE_ATTR_* does not work. BUS_ATTR_* would work
> >>>> but only if we place the attribute directly under /sys/bus/pci/new_attr.
> >>>
> >>> Then you are doing something wrong :)
> >>
> >> That is very possible.
> >>
> >>>
> >>> Where _exactly_ are you wanting to put this attribute?
> >>
> >> I'm trying for /sys/bus/pci/zpci/uid_checking, I'm using
> >> the below code and the attribute even shows up but reading
> >> it gives me two 0 bytes only.
> >> The relevant code is only a slight alteration of the original patch
> >> as follows:
> >>
> >> static ssize_t uid_checking_show(struct bus_type *bus, char *buf)
> >> {
> >> 	return sprintf(buf, "%i\n", zpci_unique_uid);
> >> }
> >> static BUS_ATTR_RO(uid_checking);
> >>
> >> static struct kset *zpci_global_kset;
> >>
> >> static struct attribute *zpci_attrs_global[] = {
> >> 	&bus_attr_uid_checking.attr,
> >> 	NULL,
> >> };
> >>
> >> static struct attribute_group zpci_attr_group_global = {
> >> 	.attrs = zpci_attrs_global,
> >> };
> > 
> > Name your attribute group, and then you do not have to mess with a
> > "raw" kobject like you are below:
> 
> Thanks for this tip and sorry for bothering you again.
> 
> > 
> >>
> >> int __init zpci_sysfs_init(void)
> >> {
> >> 	struct kset *pci_bus_kset;
> >>
> >> 	pci_bus_kset = bus_get_kset(&pci_bus_type);
> >>
> >> 	zpci_global_kset = kset_create_and_add("zpci", NULL, &pci_bus_kset->kobj);
> > 
> > No, do not mess with at kset, just set the default attribute group for
> > the bus to the above, and you should be fine.
> 
> Oh ok, I got this idea from the code adding /sys/bus/pci/slots/ in
> drivers/pci/slot.c:pci_slot_init(). See below maybe we can clean that up too.

Slots are "interesting" and that code is really old, we know how to do
things better now :)

But I doubt we should change anything there, as it does work, and
userspace is used to how they come/go.

> >> 	if (!zpci_global_kset)
> >> 		return -ENOMEM;
> >>
> >> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
> > 
> > Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
> > that's usually a huge clue that you are doing something wrong.
> > 
> > Try the above again, with a simple attribute group, and name for it, and
> > it should "just work".
> 
> I'm probably missing something but I don't get how this could work in
> this case. If I'm seeing this right the default attribute group here
> is pci_bus_type.bus_groups and that is already set in drivers/pci/pci-driver.c
> so I don't think I should set that.

Yes, add your group to that list of groups and all should be good.

> I did however find bus_create_file() which does work when using the path
> /sys/bus/pci/uid_checking instead. This would work for us if Bjorn is okay with
> that path and the code is really clean and simple too.

No, use the above group you already have please.

> That said, I think we could also add something like bus_create_group().

No, use the group list as show above please.

> Then we could use that to also clean up drivers/pci/slot.c:pci_slot_init()
> and get the original path /sys/bus/pci/zpci/uid_checking.

What needs to be cleaned up there?

> I think this would also allow us to get rid of pci_bus_get_kset() which is
> only used in that function and seems to me like it encourages use of raw ksets.
> Or I'm completely off the mark and just missing something important.

Cleaning up slots is great, but they are "odd", so be careful.

thanks,

greg k-h
Bjorn Helgaas Jan. 15, 2021, 3:29 p.m. UTC | #13
On Fri, Jan 15, 2021 at 12:20:59PM +0100, Niklas Schnelle wrote:
> On 1/14/21 5:14 PM, Greg Kroah-Hartman wrote:
> > On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
> >> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
> >>> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
> >>>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
> >>>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
> >>>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
> >>>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
> >>>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
> >>>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> ... snip ...
> >>>>>>
> >>>>>> Hey Niklas et al. :)
> >>>>>>
> >>>>>> I think this will need input from Greg. He should be best versed in
> >>>>>> sysfs attributes. The problem with KERNEL_ATTR_* to me seems that it's
> >>>>>> supposed to be kernel internal. Now, that might just be a matter of
> >>>>>> renaming the macro but let's see whether Greg has any better idea or
> >>>>>> more questions. :)
> >>>>>
> >>>>> The big question is, why are you needing this?
> >>>>>
> >>>>> No driver or driver subsystem should EVER be messing with a "raw"
> >>>>> kobject like this.  Just use the existing DEVICE_* macros instead
> >>>>> please.
> >>>>>
> >>>>> If you are using a raw kobject, please ask me how to do this properly,
> >>>>> as that is something that should NEVER show up in the /sys/devices/*
> >>>>> tree.  Otherwise userspace tools will break.
> >>>>>
> >>>>> thanks,
> >>>>>
> >>>>> greg k-h
> >>>>
> >>>> Hi Greg,
> >>>>
> >>>> this is for an architecture specific but global i.e. not device bound PCI
> >>>> attribute. That's why DEVICE_ATTR_* does not work. BUS_ATTR_* would work
> >>>> but only if we place the attribute directly under /sys/bus/pci/new_attr.
> >>>
> >>> Then you are doing something wrong :)
> >>
> >> That is very possible.
> >>
> >>>
> >>> Where _exactly_ are you wanting to put this attribute?
> >>
> >> I'm trying for /sys/bus/pci/zpci/uid_checking, I'm using
> >> the below code and the attribute even shows up but reading
> >> it gives me two 0 bytes only.
> >> The relevant code is only a slight alteration of the original patch
> >> as follows:
> >>
> >> static ssize_t uid_checking_show(struct bus_type *bus, char *buf)
> >> {
> >> 	return sprintf(buf, "%i\n", zpci_unique_uid);
> >> }
> >> static BUS_ATTR_RO(uid_checking);
> >>
> >> static struct kset *zpci_global_kset;
> >>
> >> static struct attribute *zpci_attrs_global[] = {
> >> 	&bus_attr_uid_checking.attr,
> >> 	NULL,
> >> };
> >>
> >> static struct attribute_group zpci_attr_group_global = {
> >> 	.attrs = zpci_attrs_global,
> >> };
> > 
> > Name your attribute group, and then you do not have to mess with a
> > "raw" kobject like you are below:
> 
> Thanks for this tip and sorry for bothering you again.
> 
> > 
> >>
> >> int __init zpci_sysfs_init(void)
> >> {
> >> 	struct kset *pci_bus_kset;
> >>
> >> 	pci_bus_kset = bus_get_kset(&pci_bus_type);
> >>
> >> 	zpci_global_kset = kset_create_and_add("zpci", NULL, &pci_bus_kset->kobj);
> > 
> > No, do not mess with at kset, just set the default attribute group for
> > the bus to the above, and you should be fine.
> 
> Oh ok, I got this idea from the code adding /sys/bus/pci/slots/ in
> drivers/pci/slot.c:pci_slot_init(). See below maybe we can clean that up too.
> 
> > 
> >> 	if (!zpci_global_kset)
> >> 		return -ENOMEM;
> >>
> >> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
> > 
> > Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
> > that's usually a huge clue that you are doing something wrong.
> > 
> > Try the above again, with a simple attribute group, and name for it, and
> > it should "just work".
> 
> I'm probably missing something but I don't get how this could work
> in this case. If I'm seeing this right the default attribute group
> here is pci_bus_type.bus_groups and that is already set in
> drivers/pci/pci-driver.c so I don't think I should set that.
> 
> I did however find bus_create_file() which does work when using the
> path /sys/bus/pci/uid_checking instead. This would work for us if
> Bjorn is okay with that path and the code is really clean and simple
> too.
> 
> That said, I think we could also add something like
> bus_create_group().  Then we could use that to also clean up
> drivers/pci/slot.c:pci_slot_init() and get the original path
> /sys/bus/pci/zpci/uid_checking.

I don't think "uid_checking" is quite the right name.  It says
something about the *implementation*, but it doesn't convey what that
*means* to userspace.  IIUC this file tells userspace something about
whether a given PCI device always has the same PCI domain/bus/dev/fn
address (or maybe just the same domain?)

It sounds like this feature could be useful beyond just s390, and
other arches might implement it differently, without the UID concept.
If so, I'm OK with something at the /sys/bus/pci/xxx level as long as
the name is not s390-specific (and "uid" sounds s390-specific).

I assume it would also help with the udev/systemd end if you could
make this less s390 dependent.

Bjorn
Niklas Schnelle Jan. 15, 2021, 4:15 p.m. UTC | #14
On 1/15/21 4:29 PM, Bjorn Helgaas wrote:
> On Fri, Jan 15, 2021 at 12:20:59PM +0100, Niklas Schnelle wrote:
>> On 1/14/21 5:14 PM, Greg Kroah-Hartman wrote:
>>> On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
>>>> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
>>>>> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
>>>>>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
>>>>>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
>>>>>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
>>>>>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
>>>>>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
>>>>>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
>> ... snip ...
>>>>>>>>
>>>>>>>> Hey Niklas et al. :)
>>>>>>>>
>>>>>>>> I think this will need input from Greg. He should be best versed in
>>>>>>>> sysfs attributes. The problem with KERNEL_ATTR_* to me seems that it's
>>>>>>>> supposed to be kernel internal. Now, that might just be a matter of
>>>>>>>> renaming the macro but let's see whether Greg has any better idea or
>>>>>>>> more questions. :)
>>>>>>>
>>>>>>> The big question is, why are you needing this?
>>>>>>>
>>>>>>> No driver or driver subsystem should EVER be messing with a "raw"
>>>>>>> kobject like this.  Just use the existing DEVICE_* macros instead
>>>>>>> please.
>>>>>>>
>>>>>>> If you are using a raw kobject, please ask me how to do this properly,
>>>>>>> as that is something that should NEVER show up in the /sys/devices/*
>>>>>>> tree.  Otherwise userspace tools will break.
>>>>>>>
>>>>>>> thanks,
>>>>>>>
>>>>>>> greg k-h
>>>>>>
>>>>>> Hi Greg,
>>>>>>
>>>>>> this is for an architecture specific but global i.e. not device bound PCI
>>>>>> attribute. That's why DEVICE_ATTR_* does not work. BUS_ATTR_* would work
>>>>>> but only if we place the attribute directly under /sys/bus/pci/new_attr.
>>>>>
>>>>> Then you are doing something wrong :)
>>>>
>>>> That is very possible.
>>>>
>>>>>
>>>>> Where _exactly_ are you wanting to put this attribute?
>>>>
>>>> I'm trying for /sys/bus/pci/zpci/uid_checking, I'm using
>>>> the below code and the attribute even shows up but reading
>>>> it gives me two 0 bytes only.
>>>> The relevant code is only a slight alteration of the original patch
>>>> as follows:
>>>>
>>>> static ssize_t uid_checking_show(struct bus_type *bus, char *buf)
>>>> {
>>>> 	return sprintf(buf, "%i\n", zpci_unique_uid);
>>>> }
>>>> static BUS_ATTR_RO(uid_checking);
>>>>
>>>> static struct kset *zpci_global_kset;
>>>>
>>>> static struct attribute *zpci_attrs_global[] = {
>>>> 	&bus_attr_uid_checking.attr,
>>>> 	NULL,
>>>> };
>>>>
>>>> static struct attribute_group zpci_attr_group_global = {
>>>> 	.attrs = zpci_attrs_global,
>>>> };
>>>
>>> Name your attribute group, and then you do not have to mess with a
>>> "raw" kobject like you are below:
>>
>> Thanks for this tip and sorry for bothering you again.
>>
>>>
>>>>
>>>> int __init zpci_sysfs_init(void)
>>>> {
>>>> 	struct kset *pci_bus_kset;
>>>>
>>>> 	pci_bus_kset = bus_get_kset(&pci_bus_type);
>>>>
>>>> 	zpci_global_kset = kset_create_and_add("zpci", NULL, &pci_bus_kset->kobj);
>>>
>>> No, do not mess with at kset, just set the default attribute group for
>>> the bus to the above, and you should be fine.
>>
>> Oh ok, I got this idea from the code adding /sys/bus/pci/slots/ in
>> drivers/pci/slot.c:pci_slot_init(). See below maybe we can clean that up too.
>>
>>>
>>>> 	if (!zpci_global_kset)
>>>> 		return -ENOMEM;
>>>>
>>>> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
>>>
>>> Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
>>> that's usually a huge clue that you are doing something wrong.
>>>
>>> Try the above again, with a simple attribute group, and name for it, and
>>> it should "just work".
>>
>> I'm probably missing something but I don't get how this could work
>> in this case. If I'm seeing this right the default attribute group
>> here is pci_bus_type.bus_groups and that is already set in
>> drivers/pci/pci-driver.c so I don't think I should set that.
>>
>> I did however find bus_create_file() which does work when using the
>> path /sys/bus/pci/uid_checking instead. This would work for us if
>> Bjorn is okay with that path and the code is really clean and simple
>> too.
>>
>> That said, I think we could also add something like
>> bus_create_group().  Then we could use that to also clean up
>> drivers/pci/slot.c:pci_slot_init() and get the original path
>> /sys/bus/pci/zpci/uid_checking.
> 
> I don't think "uid_checking" is quite the right name.  It says
> something about the *implementation*, but it doesn't convey what that
> *means* to userspace.  IIUC this file tells userspace something about
> whether a given PCI device always has the same PCI domain/bus/dev/fn
> address (or maybe just the same domain?)

Yes you're right, in fact we internally also started to think about
something more meaning oriented like "unique_uids". This indeed results
in PCI addresses which can be relied upon as stable identifiers. For us
it's enough that the domain matches as the bus is always 0 and
dev/fn are determined by the device and SR-IOV stride etc.
so will remain the same for equivalent configurations.

> 
> It sounds like this feature could be useful beyond just s390, and
> other arches might implement it differently, without the UID concept.
> If so, I'm OK with something at the /sys/bus/pci/xxx level as long as
> the name is not s390-specific (and "uid" sounds s390-specific).
> 
> I assume it would also help with the udev/systemd end if you could
> make this less s390 dependent.
> 
> Bjorn

That's a very good point! I'm absolutely open to making this a
common concept. I think the gist could be that the addressing/ids on
the bus are reproducible, there might be some user configuration
required but then they can be relied upon for stable names like
network interfaces. So maybe a name like "reproducible_addressing"?
I guess one non-s390 exclusive case of this could be virtualized
PCI under QEMU/KVM etc. versus real hardware or even UEFI/Firmware
guarantees, right?
Niklas Schnelle Jan. 21, 2021, 3:31 p.m. UTC | #15
On 1/15/21 4:29 PM, Bjorn Helgaas wrote:
> On Fri, Jan 15, 2021 at 12:20:59PM +0100, Niklas Schnelle wrote:
>> On 1/14/21 5:14 PM, Greg Kroah-Hartman wrote:
>>> On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
>>>> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
>>>>> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
>>>>>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
>>>>>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
>>>>>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
>>>>>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
>>>>>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
>>>>>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
>> ... snip ...
>>
>>>
>>>> 	if (!zpci_global_kset)
>>>> 		return -ENOMEM;
>>>>
>>>> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
>>>
>>> Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
>>> that's usually a huge clue that you are doing something wrong.
>>>
>>> Try the above again, with a simple attribute group, and name for it, and
>>> it should "just work".
>>
>> I'm probably missing something but I don't get how this could work
>> in this case. If I'm seeing this right the default attribute group
>> here is pci_bus_type.bus_groups and that is already set in
>> drivers/pci/pci-driver.c so I don't think I should set that.
>>
>> I did however find bus_create_file() which does work when using the
>> path /sys/bus/pci/uid_checking instead. This would work for us if
>> Bjorn is okay with that path and the code is really clean and simple
>> too.
>>
>> That said, I think we could also add something like
>> bus_create_group().  Then we could use that to also clean up
>> drivers/pci/slot.c:pci_slot_init() and get the original path
>> /sys/bus/pci/zpci/uid_checking.
> 
> I don't think "uid_checking" is quite the right name.  It says
> something about the *implementation*, but it doesn't convey what that
> *means* to userspace.  IIUC this file tells userspace something about
> whether a given PCI device always has the same PCI domain/bus/dev/fn
> address (or maybe just the same domain?)
> 
> It sounds like this feature could be useful beyond just s390, and
> other arches might implement it differently, without the UID concept.
> If so, I'm OK with something at the /sys/bus/pci/xxx level as long as
> the name is not s390-specific (and "uid" sounds s390-specific).
> 
> I assume it would also help with the udev/systemd end if you could
> make this less s390 dependent.
> 
> Bjorn
> 

Hi Bjorn,

I've thought about this more and even implemented a proof of concept
patch for a global attribute using a pcibios_has_reproducible_addressing()
hook. 

However after implementing it I think as a more general and
future proof concept it makes more sense to do this as a per device
attribute, maybe as another flag in "stuct pci_dev" named something
like "reliable_address". My reasoning behind this can be best be seen
with a QEMU example. While I expect that QEMU can easily guarantee
that one can always use "0000:01:00.0" for a virtio-pci NIC and
thus enp1s0 interface name, the same might be harder to guarantee
for a SR-IOV VF passed through with vfio-pci in that same VM and
even less so if a thunderbolt controller is passed through and
enumeration may depend on daisy chaining. The QEMU example
also applies to s390 and maybe others will in the future.
I'll send an RFC for a per device attribute but didn't want to
let this discussion stand as if we had abandoned the idea and if
you would prefer a global attribute I can also send an RFC for that.

Besst regards,
Niklas
Bjorn Helgaas Jan. 21, 2021, 3:54 p.m. UTC | #16
[Greg may be able to help compare/contrast this s390 UID with udev
persistent names]

On Thu, Jan 21, 2021 at 04:31:55PM +0100, Niklas Schnelle wrote:
> On 1/15/21 4:29 PM, Bjorn Helgaas wrote:
> > On Fri, Jan 15, 2021 at 12:20:59PM +0100, Niklas Schnelle wrote:
> >> On 1/14/21 5:14 PM, Greg Kroah-Hartman wrote:
> >>> On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
> >>>> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
> >>>>> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
> >>>>>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
> >>>>>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
> >>>>>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
> >>>>>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
> >>>>>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
> >>>>>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> >> ... snip ...
> >>
> >>>
> >>>> 	if (!zpci_global_kset)
> >>>> 		return -ENOMEM;
> >>>>
> >>>> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
> >>>
> >>> Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
> >>> that's usually a huge clue that you are doing something wrong.
> >>>
> >>> Try the above again, with a simple attribute group, and name for it, and
> >>> it should "just work".
> >>
> >> I'm probably missing something but I don't get how this could work
> >> in this case. If I'm seeing this right the default attribute group
> >> here is pci_bus_type.bus_groups and that is already set in
> >> drivers/pci/pci-driver.c so I don't think I should set that.
> >>
> >> I did however find bus_create_file() which does work when using the
> >> path /sys/bus/pci/uid_checking instead. This would work for us if
> >> Bjorn is okay with that path and the code is really clean and simple
> >> too.
> >>
> >> That said, I think we could also add something like
> >> bus_create_group().  Then we could use that to also clean up
> >> drivers/pci/slot.c:pci_slot_init() and get the original path
> >> /sys/bus/pci/zpci/uid_checking.
> > 
> > I don't think "uid_checking" is quite the right name.  It says
> > something about the *implementation*, but it doesn't convey what that
> > *means* to userspace.  IIUC this file tells userspace something about
> > whether a given PCI device always has the same PCI domain/bus/dev/fn
> > address (or maybe just the same domain?)
> > 
> > It sounds like this feature could be useful beyond just s390, and
> > other arches might implement it differently, without the UID concept.
> > If so, I'm OK with something at the /sys/bus/pci/xxx level as long as
> > the name is not s390-specific (and "uid" sounds s390-specific).
> > 
> > I assume it would also help with the udev/systemd end if you could
> > make this less s390 dependent.
> 
> I've thought about this more and even implemented a proof of concept
> patch for a global attribute using a pcibios_has_reproducible_addressing()
> hook. 
> 
> However after implementing it I think as a more general and
> future proof concept it makes more sense to do this as a per device
> attribute, maybe as another flag in "stuct pci_dev" named something
> like "reliable_address". My reasoning behind this can be best be seen
> with a QEMU example. While I expect that QEMU can easily guarantee
> that one can always use "0000:01:00.0" for a virtio-pci NIC and
> thus enp1s0 interface name, the same might be harder to guarantee
> for a SR-IOV VF passed through with vfio-pci in that same VM and
> even less so if a thunderbolt controller is passed through and
> enumeration may depend on daisy chaining. The QEMU example
> also applies to s390 and maybe others will in the future.

I'm a little wary of using the PCI geographical address
("0000:01:00.0") as a stable name.  Even if you can make a way to use
that to identify a specific device instance, regardless of how it is
plugged in or passed through, it sounds like we could end up with
"physical PCI addresses" and "virtual PCI addresses" that look the
same and would cause confusion.

This concept sounds similar to the udev concept of a "persistent
device name".  What advantages does this s390 UID have over the udev
approach?

There are optional PCI device serial numbers that we currently don't
really make use of.  Would that be a generic way to help with this?
Greg Kroah-Hartman Jan. 21, 2021, 4:11 p.m. UTC | #17
On Thu, Jan 21, 2021 at 09:54:45AM -0600, Bjorn Helgaas wrote:
> [Greg may be able to help compare/contrast this s390 UID with udev
> persistent names]
> 
> On Thu, Jan 21, 2021 at 04:31:55PM +0100, Niklas Schnelle wrote:
> > On 1/15/21 4:29 PM, Bjorn Helgaas wrote:
> > > On Fri, Jan 15, 2021 at 12:20:59PM +0100, Niklas Schnelle wrote:
> > >> On 1/14/21 5:14 PM, Greg Kroah-Hartman wrote:
> > >>> On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
> > >>>> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
> > >>>>> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
> > >>>>>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
> > >>>>>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
> > >>>>>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
> > >>>>>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
> > >>>>>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
> > >>>>>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> > >> ... snip ...
> > >>
> > >>>
> > >>>> 	if (!zpci_global_kset)
> > >>>> 		return -ENOMEM;
> > >>>>
> > >>>> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
> > >>>
> > >>> Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
> > >>> that's usually a huge clue that you are doing something wrong.
> > >>>
> > >>> Try the above again, with a simple attribute group, and name for it, and
> > >>> it should "just work".
> > >>
> > >> I'm probably missing something but I don't get how this could work
> > >> in this case. If I'm seeing this right the default attribute group
> > >> here is pci_bus_type.bus_groups and that is already set in
> > >> drivers/pci/pci-driver.c so I don't think I should set that.
> > >>
> > >> I did however find bus_create_file() which does work when using the
> > >> path /sys/bus/pci/uid_checking instead. This would work for us if
> > >> Bjorn is okay with that path and the code is really clean and simple
> > >> too.
> > >>
> > >> That said, I think we could also add something like
> > >> bus_create_group().  Then we could use that to also clean up
> > >> drivers/pci/slot.c:pci_slot_init() and get the original path
> > >> /sys/bus/pci/zpci/uid_checking.
> > > 
> > > I don't think "uid_checking" is quite the right name.  It says
> > > something about the *implementation*, but it doesn't convey what that
> > > *means* to userspace.  IIUC this file tells userspace something about
> > > whether a given PCI device always has the same PCI domain/bus/dev/fn
> > > address (or maybe just the same domain?)
> > > 
> > > It sounds like this feature could be useful beyond just s390, and
> > > other arches might implement it differently, without the UID concept.
> > > If so, I'm OK with something at the /sys/bus/pci/xxx level as long as
> > > the name is not s390-specific (and "uid" sounds s390-specific).
> > > 
> > > I assume it would also help with the udev/systemd end if you could
> > > make this less s390 dependent.
> > 
> > I've thought about this more and even implemented a proof of concept
> > patch for a global attribute using a pcibios_has_reproducible_addressing()
> > hook. 
> > 
> > However after implementing it I think as a more general and
> > future proof concept it makes more sense to do this as a per device
> > attribute, maybe as another flag in "stuct pci_dev" named something
> > like "reliable_address". My reasoning behind this can be best be seen
> > with a QEMU example. While I expect that QEMU can easily guarantee
> > that one can always use "0000:01:00.0" for a virtio-pci NIC and
> > thus enp1s0 interface name, the same might be harder to guarantee
> > for a SR-IOV VF passed through with vfio-pci in that same VM and
> > even less so if a thunderbolt controller is passed through and
> > enumeration may depend on daisy chaining. The QEMU example
> > also applies to s390 and maybe others will in the future.
> 
> I'm a little wary of using the PCI geographical address
> ("0000:01:00.0") as a stable name.  Even if you can make a way to use
> that to identify a specific device instance, regardless of how it is
> plugged in or passed through, it sounds like we could end up with
> "physical PCI addresses" and "virtual PCI addresses" that look the
> same and would cause confusion.

Agreed, as we all know, PCI addresses are never a stable name and can
change every boot on some systems.  Never rely on them, but you can use
them as a "hint" for something that you have to determine is different
from something else that is the same type of device.

> This concept sounds similar to the udev concept of a "persistent
> device name".  What advantages does this s390 UID have over the udev
> approach?
> 
> There are optional PCI device serial numbers that we currently don't
> really make use of.  Would that be a generic way to help with this?

Only if you can require that they be unique.  Is there such a
requirement and who enforces it?

For USB, it was only "required" to have unique serial numbers for one
class of devices (printers), and even then, it really wasn't enforced
that much so you can't rely on it being unique at all, which makes it
pretty useless :(

thanks,

greg k-h
Niklas Schnelle Jan. 21, 2021, 5:04 p.m. UTC | #18
On 1/21/21 4:54 PM, Bjorn Helgaas wrote:
> [Greg may be able to help compare/contrast this s390 UID with udev
> persistent names]
> 
> On Thu, Jan 21, 2021 at 04:31:55PM +0100, Niklas Schnelle wrote:
>> On 1/15/21 4:29 PM, Bjorn Helgaas wrote:
>>> On Fri, Jan 15, 2021 at 12:20:59PM +0100, Niklas Schnelle wrote:
>>>> On 1/14/21 5:14 PM, Greg Kroah-Hartman wrote:
>>>>> On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
>>>>>> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
>>>>>>> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
>>>>>>>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
>>>>>>>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
>>>>>>>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
>>>>>>>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
>>>>>>>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
>>>>>>>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
>>>> ... snip ...
>>>>
>>>>>
>>>>>> 	if (!zpci_global_kset)
>>>>>> 		return -ENOMEM;
>>>>>>
>>>>>> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
>>>>>
>>>>> Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
>>>>> that's usually a huge clue that you are doing something wrong.
>>>>>
>>>>> Try the above again, with a simple attribute group, and name for it, and
>>>>> it should "just work".
>>>>
>>>> I'm probably missing something but I don't get how this could work
>>>> in this case. If I'm seeing this right the default attribute group
>>>> here is pci_bus_type.bus_groups and that is already set in
>>>> drivers/pci/pci-driver.c so I don't think I should set that.
>>>>
>>>> I did however find bus_create_file() which does work when using the
>>>> path /sys/bus/pci/uid_checking instead. This would work for us if
>>>> Bjorn is okay with that path and the code is really clean and simple
>>>> too.
>>>>
>>>> That said, I think we could also add something like
>>>> bus_create_group().  Then we could use that to also clean up
>>>> drivers/pci/slot.c:pci_slot_init() and get the original path
>>>> /sys/bus/pci/zpci/uid_checking.
>>>
>>> I don't think "uid_checking" is quite the right name.  It says
>>> something about the *implementation*, but it doesn't convey what that
>>> *means* to userspace.  IIUC this file tells userspace something about
>>> whether a given PCI device always has the same PCI domain/bus/dev/fn
>>> address (or maybe just the same domain?)
>>>
>>> It sounds like this feature could be useful beyond just s390, and
>>> other arches might implement it differently, without the UID concept.
>>> If so, I'm OK with something at the /sys/bus/pci/xxx level as long as
>>> the name is not s390-specific (and "uid" sounds s390-specific).
>>>
>>> I assume it would also help with the udev/systemd end if you could
>>> make this less s390 dependent.
>>
>> I've thought about this more and even implemented a proof of concept
>> patch for a global attribute using a pcibios_has_reproducible_addressing()
>> hook. 
>>
>> However after implementing it I think as a more general and
>> future proof concept it makes more sense to do this as a per device
>> attribute, maybe as another flag in "stuct pci_dev" named something
>> like "reliable_address". My reasoning behind this can be best be seen
>> with a QEMU example. While I expect that QEMU can easily guarantee
>> that one can always use "0000:01:00.0" for a virtio-pci NIC and
>> thus enp1s0 interface name, the same might be harder to guarantee
>> for a SR-IOV VF passed through with vfio-pci in that same VM and
>> even less so if a thunderbolt controller is passed through and
>> enumeration may depend on daisy chaining. The QEMU example
>> also applies to s390 and maybe others will in the future.
> 
> I'm a little wary of using the PCI geographical address
> ("0000:01:00.0") as a stable name.  Even if you can make a way to use
> that to identify a specific device instance, regardless of how it is
> plugged in or passed through, it sounds like we could end up with
> "physical PCI addresses" and "virtual PCI addresses" that look the
> same and would cause confusion.
> 
> This concept sounds similar to the udev concept of a "persistent
> device name".  What advantages does this s390 UID have over the udev
> approach?
> 
> There are optional PCI device serial numbers that we currently don't
> really make use of.  Would that be a generic way to help with this?
> 

As far as I understand systemd/udev uses the PCI geographical address
by default ("enP<domain>p<bus>s<hotplug_slot_idx>...") for PCI attached
network interfaces in many cases and a lot of users have already built
their firewall/routing rules on these.

At the same time as you said this isn't ideal. On s390 things are a
bit special since it's either really unstable, if UID Checking
is not enforced (the value I wanted to expose is 0) the domain
part and thus the interface name may change between reboots.
On the other hand when it is enforced, the Domain can be defined
in the machine configuration (IOCDS, think Domain XML but formatted
for punch cards) and the bus is always 00.
The PCI geographical address and thus network interface names are then
stable in the same way as with our CCW attached network interfaces where
the CCW addresses have been stable for a long time and are regularly
used for configuration. The interface would however still conform to
the existing systemd/udev standard theme so the only change the user
sees is that we would prefer the interface naming scheme which
does not include the hotplug slot index (ehotplug slot indices are unique
in the machine hypervisor and thus can't be the same in to guests
on the same machine). We would thus not add a new name at all and
thanks to the altname mechanism all existing rules including
persistent naming via manual udev rules would keep working.

Now taking this beyond s390 my idea is that under some circumstances
just as with UID Uniqueness for us, the platform can tell if a PCI
geographical address is a reliable identifier thus sytemd/udev
has more information about the quality of existing naming schemes
incorporating information from the geographical address.

Looking at my personal KVM guests (Ubuntu, Arch Linux, Ubuntu ARM64)
as well as my workstation (Arch Linux) all of them use a scheme
with parts of the geographical address.

So in essence my idea is all about either choosing the best existing
default name or making sure we at least know if it may not be reliable.

I hope that makes sense and I have added Lennart Poettering
as he's the one everyone likes to blame for these names or at
least can probably tell me what I missed.

Best,
Niklas
Greg Kroah-Hartman Jan. 21, 2021, 5:28 p.m. UTC | #19
On Thu, Jan 21, 2021 at 06:04:52PM +0100, Niklas Schnelle wrote:
> 
> 
> On 1/21/21 4:54 PM, Bjorn Helgaas wrote:
> > [Greg may be able to help compare/contrast this s390 UID with udev
> > persistent names]
> > 
> > On Thu, Jan 21, 2021 at 04:31:55PM +0100, Niklas Schnelle wrote:
> >> On 1/15/21 4:29 PM, Bjorn Helgaas wrote:
> >>> On Fri, Jan 15, 2021 at 12:20:59PM +0100, Niklas Schnelle wrote:
> >>>> On 1/14/21 5:14 PM, Greg Kroah-Hartman wrote:
> >>>>> On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
> >>>>>> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
> >>>>>>> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
> >>>>>>>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
> >>>>>>>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
> >>>>>>>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
> >>>>>>>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
> >>>>>>>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
> >>>>>>>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
> >>>> ... snip ...
> >>>>
> >>>>>
> >>>>>> 	if (!zpci_global_kset)
> >>>>>> 		return -ENOMEM;
> >>>>>>
> >>>>>> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
> >>>>>
> >>>>> Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
> >>>>> that's usually a huge clue that you are doing something wrong.
> >>>>>
> >>>>> Try the above again, with a simple attribute group, and name for it, and
> >>>>> it should "just work".
> >>>>
> >>>> I'm probably missing something but I don't get how this could work
> >>>> in this case. If I'm seeing this right the default attribute group
> >>>> here is pci_bus_type.bus_groups and that is already set in
> >>>> drivers/pci/pci-driver.c so I don't think I should set that.
> >>>>
> >>>> I did however find bus_create_file() which does work when using the
> >>>> path /sys/bus/pci/uid_checking instead. This would work for us if
> >>>> Bjorn is okay with that path and the code is really clean and simple
> >>>> too.
> >>>>
> >>>> That said, I think we could also add something like
> >>>> bus_create_group().  Then we could use that to also clean up
> >>>> drivers/pci/slot.c:pci_slot_init() and get the original path
> >>>> /sys/bus/pci/zpci/uid_checking.
> >>>
> >>> I don't think "uid_checking" is quite the right name.  It says
> >>> something about the *implementation*, but it doesn't convey what that
> >>> *means* to userspace.  IIUC this file tells userspace something about
> >>> whether a given PCI device always has the same PCI domain/bus/dev/fn
> >>> address (or maybe just the same domain?)
> >>>
> >>> It sounds like this feature could be useful beyond just s390, and
> >>> other arches might implement it differently, without the UID concept.
> >>> If so, I'm OK with something at the /sys/bus/pci/xxx level as long as
> >>> the name is not s390-specific (and "uid" sounds s390-specific).
> >>>
> >>> I assume it would also help with the udev/systemd end if you could
> >>> make this less s390 dependent.
> >>
> >> I've thought about this more and even implemented a proof of concept
> >> patch for a global attribute using a pcibios_has_reproducible_addressing()
> >> hook. 
> >>
> >> However after implementing it I think as a more general and
> >> future proof concept it makes more sense to do this as a per device
> >> attribute, maybe as another flag in "stuct pci_dev" named something
> >> like "reliable_address". My reasoning behind this can be best be seen
> >> with a QEMU example. While I expect that QEMU can easily guarantee
> >> that one can always use "0000:01:00.0" for a virtio-pci NIC and
> >> thus enp1s0 interface name, the same might be harder to guarantee
> >> for a SR-IOV VF passed through with vfio-pci in that same VM and
> >> even less so if a thunderbolt controller is passed through and
> >> enumeration may depend on daisy chaining. The QEMU example
> >> also applies to s390 and maybe others will in the future.
> > 
> > I'm a little wary of using the PCI geographical address
> > ("0000:01:00.0") as a stable name.  Even if you can make a way to use
> > that to identify a specific device instance, regardless of how it is
> > plugged in or passed through, it sounds like we could end up with
> > "physical PCI addresses" and "virtual PCI addresses" that look the
> > same and would cause confusion.
> > 
> > This concept sounds similar to the udev concept of a "persistent
> > device name".  What advantages does this s390 UID have over the udev
> > approach?
> > 
> > There are optional PCI device serial numbers that we currently don't
> > really make use of.  Would that be a generic way to help with this?
> > 
> 
> As far as I understand systemd/udev uses the PCI geographical address
> by default ("enP<domain>p<bus>s<hotplug_slot_idx>...") for PCI attached
> network interfaces in many cases and a lot of users have already built
> their firewall/routing rules on these.

Which is fine as "normally" that does not change.  But on some machines,
it is quite volatile so users pick a different naming scheme.

And this is all done in userspace, I really don't understand what you
want to do in the kernel here.  If you want to expose another unique
thing that the hardware knows about, wonderful, userspace can then use
that if it wants to in how it names specific devices.  But don't put
that naming in the kernel, that's not where it belongs.

> Now taking this beyond s390 my idea is that under some circumstances
> just as with UID Uniqueness for us, the platform can tell if a PCI
> geographical address is a reliable identifier thus sytemd/udev
> has more information about the quality of existing naming schemes
> incorporating information from the geographical address.

The platform does not "know" if this is reliable or not, sorry.  That's
not how PCI or UEFI works.

> Looking at my personal KVM guests (Ubuntu, Arch Linux, Ubuntu ARM64)
> as well as my workstation (Arch Linux) all of them use a scheme
> with parts of the geographical address.

Because for the most part, yes, this works.  Until you plug another
device into the system.  Or remove one.  Or plug a hotplug device in and
then cold boot with it plugged in (or removed).  Or, my favorite system,
just decide to renumber the PCI bus every other boot "just because".

None of that variability can be known by the kernel, that's only known by
the user of that system, so again, they can make the best decision as to
how to name their devices.  If you want to use the systemd default,
wonderful, but know that it does not work for everyone, so systemd
allows you to do whatever you want.

> So in essence my idea is all about either choosing the best existing
> default name or making sure we at least know if it may not be reliable.

There is no reliability "score" here, sorry.  Hardware is fun :)

good luck!

greg k-h
Niklas Schnelle Jan. 21, 2021, 5:43 p.m. UTC | #20
On 1/21/21 6:28 PM, Greg Kroah-Hartman wrote:
> On Thu, Jan 21, 2021 at 06:04:52PM +0100, Niklas Schnelle wrote:
>>
>>
>> On 1/21/21 4:54 PM, Bjorn Helgaas wrote:
>>> [Greg may be able to help compare/contrast this s390 UID with udev
>>> persistent names]
>>>
>>> On Thu, Jan 21, 2021 at 04:31:55PM +0100, Niklas Schnelle wrote:
>>>> On 1/15/21 4:29 PM, Bjorn Helgaas wrote:
>>>>> On Fri, Jan 15, 2021 at 12:20:59PM +0100, Niklas Schnelle wrote:
>>>>>> On 1/14/21 5:14 PM, Greg Kroah-Hartman wrote:
>>>>>>> On Thu, Jan 14, 2021 at 04:51:17PM +0100, Niklas Schnelle wrote:
>>>>>>>> On 1/14/21 4:17 PM, Greg Kroah-Hartman wrote:
>>>>>>>>> On Thu, Jan 14, 2021 at 04:06:11PM +0100, Niklas Schnelle wrote:
>>>>>>>>>> On 1/14/21 2:58 PM, Greg Kroah-Hartman wrote:
>>>>>>>>>>> On Thu, Jan 14, 2021 at 02:44:53PM +0100, Christian Brauner wrote:
>>>>>>>>>>>> On Thu, Jan 14, 2021 at 02:20:10PM +0100, Niklas Schnelle wrote:
>>>>>>>>>>>>> On 1/13/21 7:55 PM, Bjorn Helgaas wrote:
>>>>>>>>>>>>>> On Wed, Jan 13, 2021 at 08:47:58AM +0100, Niklas Schnelle wrote:
>>>>>>>>>>>>>>> On 1/12/21 10:50 PM, Bjorn Helgaas wrote:
>>>>>> ... snip ...
>>>>>>
>>>>>>>
>>>>>>>> 	if (!zpci_global_kset)
>>>>>>>> 		return -ENOMEM;
>>>>>>>>
>>>>>>>> 	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
>>>>>>>
>>>>>>> Huge hint, if in a driver, or bus subsystem, and you call sysfs_*,
>>>>>>> that's usually a huge clue that you are doing something wrong.
>>>>>>>
>>>>>>> Try the above again, with a simple attribute group, and name for it, and
>>>>>>> it should "just work".
>>>>>>
>>>>>> I'm probably missing something but I don't get how this could work
>>>>>> in this case. If I'm seeing this right the default attribute group
>>>>>> here is pci_bus_type.bus_groups and that is already set in
>>>>>> drivers/pci/pci-driver.c so I don't think I should set that.
>>>>>>
>>>>>> I did however find bus_create_file() which does work when using the
>>>>>> path /sys/bus/pci/uid_checking instead. This would work for us if
>>>>>> Bjorn is okay with that path and the code is really clean and simple
>>>>>> too.
>>>>>>
>>>>>> That said, I think we could also add something like
>>>>>> bus_create_group().  Then we could use that to also clean up
>>>>>> drivers/pci/slot.c:pci_slot_init() and get the original path
>>>>>> /sys/bus/pci/zpci/uid_checking.
>>>>>
>>>>> I don't think "uid_checking" is quite the right name.  It says
>>>>> something about the *implementation*, but it doesn't convey what that
>>>>> *means* to userspace.  IIUC this file tells userspace something about
>>>>> whether a given PCI device always has the same PCI domain/bus/dev/fn
>>>>> address (or maybe just the same domain?)
>>>>>
>>>>> It sounds like this feature could be useful beyond just s390, and
>>>>> other arches might implement it differently, without the UID concept.
>>>>> If so, I'm OK with something at the /sys/bus/pci/xxx level as long as
>>>>> the name is not s390-specific (and "uid" sounds s390-specific).
>>>>>
>>>>> I assume it would also help with the udev/systemd end if you could
>>>>> make this less s390 dependent.
>>>>
>>>> I've thought about this more and even implemented a proof of concept
>>>> patch for a global attribute using a pcibios_has_reproducible_addressing()
>>>> hook. 
>>>>
>>>> However after implementing it I think as a more general and
>>>> future proof concept it makes more sense to do this as a per device
>>>> attribute, maybe as another flag in "stuct pci_dev" named something
>>>> like "reliable_address". My reasoning behind this can be best be seen
>>>> with a QEMU example. While I expect that QEMU can easily guarantee
>>>> that one can always use "0000:01:00.0" for a virtio-pci NIC and
>>>> thus enp1s0 interface name, the same might be harder to guarantee
>>>> for a SR-IOV VF passed through with vfio-pci in that same VM and
>>>> even less so if a thunderbolt controller is passed through and
>>>> enumeration may depend on daisy chaining. The QEMU example
>>>> also applies to s390 and maybe others will in the future.
>>>
>>> I'm a little wary of using the PCI geographical address
>>> ("0000:01:00.0") as a stable name.  Even if you can make a way to use
>>> that to identify a specific device instance, regardless of how it is
>>> plugged in or passed through, it sounds like we could end up with
>>> "physical PCI addresses" and "virtual PCI addresses" that look the
>>> same and would cause confusion.
>>>
>>> This concept sounds similar to the udev concept of a "persistent
>>> device name".  What advantages does this s390 UID have over the udev
>>> approach?
>>>
>>> There are optional PCI device serial numbers that we currently don't
>>> really make use of.  Would that be a generic way to help with this?
>>>
>>
>> As far as I understand systemd/udev uses the PCI geographical address
>> by default ("enP<domain>p<bus>s<hotplug_slot_idx>...") for PCI attached
>> network interfaces in many cases and a lot of users have already built
>> their firewall/routing rules on these.
> 
> Which is fine as "normally" that does not change.  But on some machines,
> it is quite volatile so users pick a different naming scheme.
> 
> And this is all done in userspace, I really don't understand what you
> want to do in the kernel here.  If you want to expose another unique
> thing that the hardware knows about, wonderful, userspace can then use
> that if it wants to in how it names specific devices.  But don't put
> that naming in the kernel, that's not where it belongs.

Oh no I definitely don't want to put any naming in the kernel.
Rather this is very much "a thing that the hardware knows".
The thing being:

  We're a virtual platform and this PCI devices' address is generated
  from user configuration and we guarantee it's stable.

> 
>> Now taking this beyond s390 my idea is that under some circumstances
>> just as with UID Uniqueness for us, the platform can tell if a PCI
>> geographical address is a reliable identifier thus sytemd/udev
>> has more information about the quality of existing naming schemes
>> incorporating information from the geographical address.
> 
> The platform does not "know" if this is reliable or not, sorry.  That's
> not how PCI or UEFI works.

Yes I assumed as much which is why my examples were all about virtualized
platforms/hypervisors. I would say QEMU very much knows if the PCI address
is coming from its fluffy virtual sandbox or real wild hardware (pass-through).

> 
>> Looking at my personal KVM guests (Ubuntu, Arch Linux, Ubuntu ARM64)
>> as well as my workstation (Arch Linux) all of them use a scheme
>> with parts of the geographical address.
> 
> Because for the most part, yes, this works.  Until you plug another
> device into the system.  Or remove one.  Or plug a hotplug device in and
> then cold boot with it plugged in (or removed).  Or, my favorite system,
> just decide to renumber the PCI bus every other boot "just because".
> 
> None of that variability can be known by the kernel, that's only known by
> the user of that system, so again, they can make the best decision as to
> how to name their devices.  If you want to use the systemd default,
> wonderful, but know that it does not work for everyone, so systemd
> allows you to do whatever you want.
> 
>> So in essence my idea is all about either choosing the best existing
>> default name or making sure we at least know if it may not be reliable.
> 
> There is no reliability "score" here, sorry.  Hardware is fun :)

Well as said above, at the very least there is real hardware, emulated
hardware and our always virtualizing machines that keep things cozy
and reliable enough that you can keep running OSs from the 70s largely
unchanged on >5 GHz CPUs. So I think at least our platform has a pretty
good track record in this regard.

> 
> good luck!
> 
> greg k-h
>
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 25c9c39770c6..a174aac0ebb0 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -375,3 +375,14 @@  Description:
 		The value comes from the PCI kernel device state and can be one
 		of: "unknown", "error", "D0", D1", "D2", "D3hot", "D3cold".
 		The file is read only.
+What:		/sys/bus/pci/zpci/uid_checking
+Date:		December 2020
+Contact:	Niklas Schnelle <schnelle@linux.ibm.com>
+Description:
+		This attribute exposes the global state of UID Checking on
+		an s390 Linux system. If UID Checking is on this file
+		contains '1' otherwise '0'. If UID Checking is on the UID of
+		a zPCI device, or the UID of function zero for a multi-function
+		device will be used as its PCI Domain number. If UID Checking
+		is off PCI Domain numbers are generated automatically and
+		are not stable across reboots.
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 212628932ddc..3cfa6cc701ba 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -285,6 +285,9 @@  void zpci_debug_exit_device(struct zpci_dev *);
 /* Error reporting */
 int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *);
 
+/* Sysfs Entries */
+int zpci_sysfs_init(void);
+
 #ifdef CONFIG_NUMA
 
 /* Returns the node based on PCI bus */
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 41df8fcfddde..c16c93e5f9af 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -881,6 +881,10 @@  static int __init pci_base_init(void)
 	if (rc)
 		goto out_find;
 
+	rc = zpci_sysfs_init();
+	if (rc)
+		goto out_find;
+
 	s390_pci_initialized = 1;
 	return 0;
 
diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c
index 5c028bee91b9..d00690f73539 100644
--- a/arch/s390/pci/pci_sysfs.c
+++ b/arch/s390/pci/pci_sysfs.c
@@ -172,3 +172,37 @@  const struct attribute_group *zpci_attr_groups[] = {
 	&pfip_attr_group,
 	NULL,
 };
+
+/* Global zPCI attributes */
+static ssize_t uid_checking_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%i\n", zpci_unique_uid);
+}
+
+static struct kobj_attribute sys_zpci_uid_checking_attr =
+	__ATTR(uid_checking, 0444, uid_checking_show, NULL);
+
+static struct kset *zpci_global_kset;
+
+static struct attribute *zpci_attrs_global[] = {
+	&sys_zpci_uid_checking_attr.attr,
+	NULL,
+};
+
+static struct attribute_group zpci_attr_group_global = {
+	.attrs = zpci_attrs_global,
+};
+
+int __init zpci_sysfs_init(void)
+{
+	struct kset *pci_bus_kset;
+
+	pci_bus_kset = bus_get_kset(&pci_bus_type);
+
+	zpci_global_kset = kset_create_and_add("zpci", NULL, &pci_bus_kset->kobj);
+	if (!zpci_global_kset)
+		return -ENOMEM;
+
+	return sysfs_create_group(&zpci_global_kset->kobj, &zpci_attr_group_global);
+}