Patchwork [RFC] PCI: export MSI mode using attributes, not kobjects

login
register
mail settings
Submitter Greg KH
Date Oct. 29, 2013, 9:46 p.m.
Message ID <20131029214631.GA19354@kroah.com>
Download mbox | patch
Permalink /patch/287054/
State Not Applicable
Headers show

Comments

Greg KH - Oct. 29, 2013, 9:46 p.m.
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

The PCI MSI sysfs code is a mess with kobjects for things that don't
really need to be kobjects.  This patch creates attributes dynamically
for the MSI interrupts instead of using kobjects.

Note, this does not delete the existing sysfs MSI code, but puts the
attributes under a "msi_irqs_2" directory for testing / example.

Also note, this removes a directory from the current MSI interrupt sysfs
code:

old MSI kobjects:
pci_device
   └── msi_irqs
       └── 40
           └── mode

new MSI attributes:
pci_device
   └── msi_irqs_2
       └── 40

As there was only one file "mode" with the kobject model, the interrupt
number is now a file that returns the "mode" of the interrupt (msi vs.
msix).

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---

Bjorn, I can make up a patch that rips out the existing kobject code
here, but I figured this patch would make things easier to follow
instead of having to dig through the removed logic at the same time.

I'll clean up the error handling path for the create attribute logic as
well, this was just a proof of concept that this could be done.

Do you think that anyone cares about the current mode files in sysfs to
move things in this manner?

 drivers/pci/msi.c   |   85 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h |    1 
 2 files changed, 86 insertions(+)

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas - Nov. 1, 2013, 11:40 p.m.
On Tue, Oct 29, 2013 at 3:46 PM, Greg Kroah-Hartman
<gregkh@linuxfoundation.org> wrote:
> From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>
> The PCI MSI sysfs code is a mess with kobjects for things that don't
> really need to be kobjects.  This patch creates attributes dynamically
> for the MSI interrupts instead of using kobjects.
>
> Note, this does not delete the existing sysfs MSI code, but puts the
> attributes under a "msi_irqs_2" directory for testing / example.
>
> Also note, this removes a directory from the current MSI interrupt sysfs
> code:
>
> old MSI kobjects:
> pci_device
>    └── msi_irqs
>        └── 40
>            └── mode
>
> new MSI attributes:
> pci_device
>    └── msi_irqs_2
>        └── 40
>
> As there was only one file "mode" with the kobject model, the interrupt
> number is now a file that returns the "mode" of the interrupt (msi vs.
> msix).
>
> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> ---
>
> Bjorn, I can make up a patch that rips out the existing kobject code
> here, but I figured this patch would make things easier to follow
> instead of having to dig through the removed logic at the same time.
>
> I'll clean up the error handling path for the create attribute logic as
> well, this was just a proof of concept that this could be done.
>
> Do you think that anyone cares about the current mode files in sysfs to
> move things in this manner?

I like this a lot better than trying to fix all the holes in the
current kobject code.

I have no idea who, if anybody, cares about the "mode" files.  I
assume there's a way to create the "mode" files with attributes, too?
If so, we could replicate the existing structure with one patch, and
simplify it with a second patch, so it would be easier to revert the
directory change while keeping the fix.

Bjorn

>  drivers/pci/msi.c   |   85 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/pci.h |    1
>  2 files changed, 86 insertions(+)
>
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index d5f90d63..53848ab9 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -353,6 +353,9 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
>  static void free_msi_irqs(struct pci_dev *dev)
>  {
>         struct msi_desc *entry, *tmp;
> +       struct attribute **msi_attrs;
> +       struct device_attribute *dev_attr;
> +       int count = 0;
>
>         list_for_each_entry(entry, &dev->msi_list, list) {
>                 int i, nvec;
> @@ -388,6 +391,22 @@ static void free_msi_irqs(struct pci_dev *dev)
>                 list_del(&entry->list);
>                 kfree(entry);
>         }
> +
> +       if (dev->msi_irq_groups) {
> +               sysfs_remove_groups(&dev->dev.kobj, dev->msi_irq_groups);
> +               msi_attrs = dev->msi_irq_groups[0]->attrs;
> +               list_for_each_entry(entry, &dev->msi_list, list) {
> +                       dev_attr = container_of(msi_attrs[count],
> +                                               struct device_attribute, attr);
> +                       kfree(dev_attr->attr.name);
> +                       kfree(dev_attr);
> +                       ++count;
> +               }
> +               kfree(msi_attrs);
> +               kfree(dev->msi_irq_groups[0]);
> +               kfree(dev->msi_irq_groups);
> +               dev->msi_irq_groups = NULL;
> +       }
>  }
>
>  static struct msi_desc *alloc_msi_entry(struct pci_dev *dev)
> @@ -517,13 +536,79 @@ static struct kobj_type msi_irq_ktype = {
>         .default_attrs = msi_irq_default_attrs,
>  };
>
> +static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
> +                            char *buf)
> +{
> +       struct pci_dev *pdev = to_pci_dev(dev);
> +       struct msi_desc *entry;
> +       unsigned long irq;
> +       int retval;
> +
> +       retval = kstrtoul(attr->attr.name, 10, &irq);
> +       if (retval)
> +               return retval;
> +
> +       list_for_each_entry(entry, &pdev->msi_list, list) {
> +               if (entry->irq == irq) {
> +                       return sprintf(buf, "%s\n",
> +                                      entry->msi_attrib.is_msix ? "msix" : "msi");
> +               }
> +       }
> +       return -ENODEV;
> +}
> +
>  static int populate_msi_sysfs(struct pci_dev *pdev)
>  {
> +       struct attribute **msi_attrs;
> +       struct device_attribute *msi_dev_attr;
> +       struct attribute_group *msi_irq_group;
> +       const struct attribute_group **msi_irq_groups;
>         struct msi_desc *entry;
>         struct kobject *kobj;
>         int ret;
> +       int num_msi = 0;
>         int count = 0;
>
> +       /* Determine how many msi entries we have */
> +       list_for_each_entry(entry, &pdev->msi_list, list) {
> +               ++num_msi;
> +       }
> +       if (!num_msi)
> +               return 0;
> +
> +       /* Dynamically create the MSI attributes for the PCI device */
> +       msi_attrs = kzalloc(sizeof(void *) * (num_msi + 1), GFP_KERNEL);
> +       if (!msi_attrs)
> +               return -ENOMEM;
> +       list_for_each_entry(entry, &pdev->msi_list, list) {
> +               char *name = kmalloc(20, GFP_KERNEL);
> +               msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
> +               if (!msi_dev_attr)
> +                       return -ENOMEM;
> +               sprintf(name, "%d", entry->irq);
> +               msi_dev_attr->attr.name = name;
> +               msi_dev_attr->attr.mode = S_IRUGO;
> +               msi_dev_attr->show = msi_mode_show;
> +               msi_attrs[count] = &msi_dev_attr->attr;
> +               ++count;
> +       }
> +
> +       msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
> +       if (!msi_irq_group)
> +               return -ENOMEM;
> +       msi_irq_group->name = "msi_irqs_2";
> +       msi_irq_group->attrs = msi_attrs;
> +
> +       msi_irq_groups = kzalloc(sizeof(void *) * 2, GFP_KERNEL);
> +       if (!msi_irq_groups)
> +               return -ENOMEM;
> +       msi_irq_groups[0] = msi_irq_group;
> +
> +       ret = sysfs_create_groups(&pdev->dev.kobj, msi_irq_groups);
> +       if (ret)
> +               return ret;
> +       pdev->msi_irq_groups = msi_irq_groups;
> +
>         pdev->msi_kset = kset_create_and_add("msi_irqs", NULL, &pdev->dev.kobj);
>         if (!pdev->msi_kset)
>                 return -ENOMEM;
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index da172f95..9540110f 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -354,6 +354,7 @@ struct pci_dev {
>  #ifdef CONFIG_PCI_MSI
>         struct list_head msi_list;
>         struct kset *msi_kset;
> +       const struct attribute_group **msi_irq_groups;
>  #endif
>         struct pci_vpd *vpd;
>  #ifdef CONFIG_PCI_ATS
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman - Nov. 2, 2013, 2:09 a.m.
On Fri, Nov 01, 2013 at 05:40:02PM -0600, Bjorn Helgaas wrote:
> On Tue, Oct 29, 2013 at 3:46 PM, Greg Kroah-Hartman
> <gregkh@linuxfoundation.org> wrote:
> > From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> >
> > The PCI MSI sysfs code is a mess with kobjects for things that don't
> > really need to be kobjects.  This patch creates attributes dynamically
> > for the MSI interrupts instead of using kobjects.
> >
> > Note, this does not delete the existing sysfs MSI code, but puts the
> > attributes under a "msi_irqs_2" directory for testing / example.
> >
> > Also note, this removes a directory from the current MSI interrupt sysfs
> > code:
> >
> > old MSI kobjects:
> > pci_device
> >    └── msi_irqs
> >        └── 40
> >            └── mode
> >
> > new MSI attributes:
> > pci_device
> >    └── msi_irqs_2
> >        └── 40
> >
> > As there was only one file "mode" with the kobject model, the interrupt
> > number is now a file that returns the "mode" of the interrupt (msi vs.
> > msix).
> >
> > Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > ---
> >
> > Bjorn, I can make up a patch that rips out the existing kobject code
> > here, but I figured this patch would make things easier to follow
> > instead of having to dig through the removed logic at the same time.
> >
> > I'll clean up the error handling path for the create attribute logic as
> > well, this was just a proof of concept that this could be done.
> >
> > Do you think that anyone cares about the current mode files in sysfs to
> > move things in this manner?
> 
> I like this a lot better than trying to fix all the holes in the
> current kobject code.
> 
> I have no idea who, if anybody, cares about the "mode" files.  I
> assume there's a way to create the "mode" files with attributes, too?
> If so, we could replicate the existing structure with one patch, and
> simplify it with a second patch, so it would be easier to revert the
> directory change while keeping the fix.
> 
> Bjorn
> 
FWIW, I created the mode files because I wanted to be able to add other
attributes to an irq in the future, in case we needed them.  I think time has
show that additional attributes seem unnecessecary, so it makes sense to me to
just include the mode attribute in the irq number file
Neil

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg KH - Nov. 2, 2013, 3:50 p.m.
On Fri, Nov 01, 2013 at 05:40:02PM -0600, Bjorn Helgaas wrote:
> On Tue, Oct 29, 2013 at 3:46 PM, Greg Kroah-Hartman
> <gregkh@linuxfoundation.org> wrote:
> > From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> >
> > The PCI MSI sysfs code is a mess with kobjects for things that don't
> > really need to be kobjects.  This patch creates attributes dynamically
> > for the MSI interrupts instead of using kobjects.
> >
> > Note, this does not delete the existing sysfs MSI code, but puts the
> > attributes under a "msi_irqs_2" directory for testing / example.
> >
> > Also note, this removes a directory from the current MSI interrupt sysfs
> > code:
> >
> > old MSI kobjects:
> > pci_device
> >    └── msi_irqs
> >        └── 40
> >            └── mode
> >
> > new MSI attributes:
> > pci_device
> >    └── msi_irqs_2
> >        └── 40
> >
> > As there was only one file "mode" with the kobject model, the interrupt
> > number is now a file that returns the "mode" of the interrupt (msi vs.
> > msix).
> >
> > Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > ---
> >
> > Bjorn, I can make up a patch that rips out the existing kobject code
> > here, but I figured this patch would make things easier to follow
> > instead of having to dig through the removed logic at the same time.
> >
> > I'll clean up the error handling path for the create attribute logic as
> > well, this was just a proof of concept that this could be done.
> >
> > Do you think that anyone cares about the current mode files in sysfs to
> > move things in this manner?
> 
> I like this a lot better than trying to fix all the holes in the
> current kobject code.

Great.

> I have no idea who, if anybody, cares about the "mode" files.  I
> assume there's a way to create the "mode" files with attributes, too?
> If so, we could replicate the existing structure with one patch, and
> simplify it with a second patch, so it would be easier to revert the
> directory change while keeping the fix.

No, we can't create a 2-level deep attribute at the moment, only one
level, like the patch does.

Based on Neil's comments, I think we should be fine with this as-is as
no one is messing with these files directly (which implies that we could
possibly just remove them entirely to save us the overall pain...)

Want me to redo this in a way that is acceptable (i.e. remove the
existing code at the same time?)

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas - Nov. 5, 2013, 6:55 p.m.
On Sat, Nov 2, 2013 at 9:50 AM, Greg Kroah-Hartman
<gregkh@linuxfoundation.org> wrote:
> On Fri, Nov 01, 2013 at 05:40:02PM -0600, Bjorn Helgaas wrote:
>> On Tue, Oct 29, 2013 at 3:46 PM, Greg Kroah-Hartman
>> <gregkh@linuxfoundation.org> wrote:
>> > From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>> >
>> > The PCI MSI sysfs code is a mess with kobjects for things that don't
>> > really need to be kobjects.  This patch creates attributes dynamically
>> > for the MSI interrupts instead of using kobjects.
>> >
>> > Note, this does not delete the existing sysfs MSI code, but puts the
>> > attributes under a "msi_irqs_2" directory for testing / example.
>> >
>> > Also note, this removes a directory from the current MSI interrupt sysfs
>> > code:
>> >
>> > old MSI kobjects:
>> > pci_device
>> >    └── msi_irqs
>> >        └── 40
>> >            └── mode
>> >
>> > new MSI attributes:
>> > pci_device
>> >    └── msi_irqs_2
>> >        └── 40
>> >
>> > As there was only one file "mode" with the kobject model, the interrupt
>> > number is now a file that returns the "mode" of the interrupt (msi vs.
>> > msix).
>> >
>> > Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>> > ---
>> >
>> > Bjorn, I can make up a patch that rips out the existing kobject code
>> > here, but I figured this patch would make things easier to follow
>> > instead of having to dig through the removed logic at the same time.
>> >
>> > I'll clean up the error handling path for the create attribute logic as
>> > well, this was just a proof of concept that this could be done.
>> >
>> > Do you think that anyone cares about the current mode files in sysfs to
>> > move things in this manner?
>>
>> I like this a lot better than trying to fix all the holes in the
>> current kobject code.
>
> Great.
>
>> I have no idea who, if anybody, cares about the "mode" files.  I
>> assume there's a way to create the "mode" files with attributes, too?
>> If so, we could replicate the existing structure with one patch, and
>> simplify it with a second patch, so it would be easier to revert the
>> directory change while keeping the fix.
>
> No, we can't create a 2-level deep attribute at the moment, only one
> level, like the patch does.
>
> Based on Neil's comments, I think we should be fine with this as-is as
> no one is messing with these files directly (which implies that we could
> possibly just remove them entirely to save us the overall pain...)

Hmmm.  https://bugzilla.redhat.com/show_bug.cgi?id=744012 suggests
that irqbalance might be reading these files.

> Want me to redo this in a way that is acceptable (i.e. remove the
> existing code at the same time?)
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas - Nov. 14, 2013, 7:33 p.m.
On Tue, Nov 5, 2013 at 11:55 AM, Bjorn Helgaas <bhelgaas@google.com> wrote:
> On Sat, Nov 2, 2013 at 9:50 AM, Greg Kroah-Hartman
> <gregkh@linuxfoundation.org> wrote:
>> On Fri, Nov 01, 2013 at 05:40:02PM -0600, Bjorn Helgaas wrote:
>>> On Tue, Oct 29, 2013 at 3:46 PM, Greg Kroah-Hartman
>>> <gregkh@linuxfoundation.org> wrote:
>>> > From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>>> >
>>> > The PCI MSI sysfs code is a mess with kobjects for things that don't
>>> > really need to be kobjects.  This patch creates attributes dynamically
>>> > for the MSI interrupts instead of using kobjects.
>>> >
>>> > Note, this does not delete the existing sysfs MSI code, but puts the
>>> > attributes under a "msi_irqs_2" directory for testing / example.
>>> >
>>> > Also note, this removes a directory from the current MSI interrupt sysfs
>>> > code:
>>> >
>>> > old MSI kobjects:
>>> > pci_device
>>> >    └── msi_irqs
>>> >        └── 40
>>> >            └── mode
>>> >
>>> > new MSI attributes:
>>> > pci_device
>>> >    └── msi_irqs_2
>>> >        └── 40
>>> >
>>> > As there was only one file "mode" with the kobject model, the interrupt
>>> > number is now a file that returns the "mode" of the interrupt (msi vs.
>>> > msix).
>>> >
>>> > Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>>> > ---
>>> >
>>> > Bjorn, I can make up a patch that rips out the existing kobject code
>>> > here, but I figured this patch would make things easier to follow
>>> > instead of having to dig through the removed logic at the same time.
>>> >
>>> > I'll clean up the error handling path for the create attribute logic as
>>> > well, this was just a proof of concept that this could be done.
>>> >
>>> > Do you think that anyone cares about the current mode files in sysfs to
>>> > move things in this manner?
>>>
>>> I like this a lot better than trying to fix all the holes in the
>>> current kobject code.
>>
>> Great.
>>
>>> I have no idea who, if anybody, cares about the "mode" files.  I
>>> assume there's a way to create the "mode" files with attributes, too?
>>> If so, we could replicate the existing structure with one patch, and
>>> simplify it with a second patch, so it would be easier to revert the
>>> directory change while keeping the fix.
>>
>> No, we can't create a 2-level deep attribute at the moment, only one
>> level, like the patch does.
>>
>> Based on Neil's comments, I think we should be fine with this as-is as
>> no one is messing with these files directly (which implies that we could
>> possibly just remove them entirely to save us the overall pain...)
>
> Hmmm.  https://bugzilla.redhat.com/show_bug.cgi?id=744012 suggests
> that irqbalance might be reading these files.

I looked at the current irqbalance on github [1], and I *think* it
never reads the "mode" files.  It reads the entries in the "msi_irqs"
directory, which you're proposing to change from directories to files,
but I think it only uses the names.

It looks like it should be safe at least for irqbalance to make this a
one-level attribute.  It's possible we'll break somebody's scripts,
but I'm willing to try making this change because it really makes the
refcounting much simpler.

Bjorn

[1] https://github.com/Irqbalance/irqbalance/blob/master/classify.c#L357
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman - Nov. 14, 2013, 8:17 p.m.
On Thu, Nov 14, 2013 at 12:33:11PM -0700, Bjorn Helgaas wrote:
> On Tue, Nov 5, 2013 at 11:55 AM, Bjorn Helgaas <bhelgaas@google.com> wrote:
> > On Sat, Nov 2, 2013 at 9:50 AM, Greg Kroah-Hartman
> > <gregkh@linuxfoundation.org> wrote:
> >> On Fri, Nov 01, 2013 at 05:40:02PM -0600, Bjorn Helgaas wrote:
> >>> On Tue, Oct 29, 2013 at 3:46 PM, Greg Kroah-Hartman
> >>> <gregkh@linuxfoundation.org> wrote:
> >>> > From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> >>> >
> >>> > The PCI MSI sysfs code is a mess with kobjects for things that don't
> >>> > really need to be kobjects.  This patch creates attributes dynamically
> >>> > for the MSI interrupts instead of using kobjects.
> >>> >
> >>> > Note, this does not delete the existing sysfs MSI code, but puts the
> >>> > attributes under a "msi_irqs_2" directory for testing / example.
> >>> >
> >>> > Also note, this removes a directory from the current MSI interrupt sysfs
> >>> > code:
> >>> >
> >>> > old MSI kobjects:
> >>> > pci_device
> >>> >    └── msi_irqs
> >>> >        └── 40
> >>> >            └── mode
> >>> >
> >>> > new MSI attributes:
> >>> > pci_device
> >>> >    └── msi_irqs_2
> >>> >        └── 40
> >>> >
> >>> > As there was only one file "mode" with the kobject model, the interrupt
> >>> > number is now a file that returns the "mode" of the interrupt (msi vs.
> >>> > msix).
> >>> >
> >>> > Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> >>> > ---
> >>> >
> >>> > Bjorn, I can make up a patch that rips out the existing kobject code
> >>> > here, but I figured this patch would make things easier to follow
> >>> > instead of having to dig through the removed logic at the same time.
> >>> >
> >>> > I'll clean up the error handling path for the create attribute logic as
> >>> > well, this was just a proof of concept that this could be done.
> >>> >
> >>> > Do you think that anyone cares about the current mode files in sysfs to
> >>> > move things in this manner?
> >>>
> >>> I like this a lot better than trying to fix all the holes in the
> >>> current kobject code.
> >>
> >> Great.
> >>
> >>> I have no idea who, if anybody, cares about the "mode" files.  I
> >>> assume there's a way to create the "mode" files with attributes, too?
> >>> If so, we could replicate the existing structure with one patch, and
> >>> simplify it with a second patch, so it would be easier to revert the
> >>> directory change while keeping the fix.
> >>
> >> No, we can't create a 2-level deep attribute at the moment, only one
> >> level, like the patch does.
> >>
> >> Based on Neil's comments, I think we should be fine with this as-is as
> >> no one is messing with these files directly (which implies that we could
> >> possibly just remove them entirely to save us the overall pain...)
> >
> > Hmmm.  https://bugzilla.redhat.com/show_bug.cgi?id=744012 suggests
> > that irqbalance might be reading these files.
> 
> I looked at the current irqbalance on github [1], and I *think* it
> never reads the "mode" files.  It reads the entries in the "msi_irqs"
> directory, which you're proposing to change from directories to files,
> but I think it only uses the names.
> 
It doesn't read the mode file (I had intended for it to, but all the information
irqbalance needs currently is implied by the fact that it appears in the sysfs
tree under msi_irqs in the first place).

> It looks like it should be safe at least for irqbalance to make this a
> one-level attribute.  It's possible we'll break somebody's scripts,
> but I'm willing to try making this change because it really makes the
> refcounting much simpler.
> 
ACK, if you cc me on the patch that will change the sysfs directory structure,
I'll make the corresponding changes needed to irqblanace in parallel.

Thanks!
Neil

> Bjorn
> 
> [1] https://github.com/Irqbalance/irqbalance/blob/master/classify.c#L357
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas - Nov. 14, 2013, 9:16 p.m.
On Thu, Nov 14, 2013 at 1:17 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
> On Thu, Nov 14, 2013 at 12:33:11PM -0700, Bjorn Helgaas wrote:
>> On Tue, Nov 5, 2013 at 11:55 AM, Bjorn Helgaas <bhelgaas@google.com> wrote:
>> > On Sat, Nov 2, 2013 at 9:50 AM, Greg Kroah-Hartman
>> > <gregkh@linuxfoundation.org> wrote:
>> >> On Fri, Nov 01, 2013 at 05:40:02PM -0600, Bjorn Helgaas wrote:
>> >>> On Tue, Oct 29, 2013 at 3:46 PM, Greg Kroah-Hartman
>> >>> <gregkh@linuxfoundation.org> wrote:
>> >>> > From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>> >>> >
>> >>> > The PCI MSI sysfs code is a mess with kobjects for things that don't
>> >>> > really need to be kobjects.  This patch creates attributes dynamically
>> >>> > for the MSI interrupts instead of using kobjects.
>> >>> >
>> >>> > Note, this does not delete the existing sysfs MSI code, but puts the
>> >>> > attributes under a "msi_irqs_2" directory for testing / example.
>> >>> >
>> >>> > Also note, this removes a directory from the current MSI interrupt sysfs
>> >>> > code:
>> >>> >
>> >>> > old MSI kobjects:
>> >>> > pci_device
>> >>> >    └── msi_irqs
>> >>> >        └── 40
>> >>> >            └── mode
>> >>> >
>> >>> > new MSI attributes:
>> >>> > pci_device
>> >>> >    └── msi_irqs_2
>> >>> >        └── 40
>> >>> >
>> >>> > As there was only one file "mode" with the kobject model, the interrupt
>> >>> > number is now a file that returns the "mode" of the interrupt (msi vs.
>> >>> > msix).
>> >>> >
>> >>> > Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>> >>> > ---
>> >>> >
>> >>> > Bjorn, I can make up a patch that rips out the existing kobject code
>> >>> > here, but I figured this patch would make things easier to follow
>> >>> > instead of having to dig through the removed logic at the same time.
>> >>> >
>> >>> > I'll clean up the error handling path for the create attribute logic as
>> >>> > well, this was just a proof of concept that this could be done.
>> >>> >
>> >>> > Do you think that anyone cares about the current mode files in sysfs to
>> >>> > move things in this manner?
>> >>>
>> >>> I like this a lot better than trying to fix all the holes in the
>> >>> current kobject code.
>> >>
>> >> Great.
>> >>
>> >>> I have no idea who, if anybody, cares about the "mode" files.  I
>> >>> assume there's a way to create the "mode" files with attributes, too?
>> >>> If so, we could replicate the existing structure with one patch, and
>> >>> simplify it with a second patch, so it would be easier to revert the
>> >>> directory change while keeping the fix.
>> >>
>> >> No, we can't create a 2-level deep attribute at the moment, only one
>> >> level, like the patch does.
>> >>
>> >> Based on Neil's comments, I think we should be fine with this as-is as
>> >> no one is messing with these files directly (which implies that we could
>> >> possibly just remove them entirely to save us the overall pain...)
>> >
>> > Hmmm.  https://bugzilla.redhat.com/show_bug.cgi?id=744012 suggests
>> > that irqbalance might be reading these files.
>>
>> I looked at the current irqbalance on github [1], and I *think* it
>> never reads the "mode" files.  It reads the entries in the "msi_irqs"
>> directory, which you're proposing to change from directories to files,
>> but I think it only uses the names.
>>
> It doesn't read the mode file (I had intended for it to, but all the information
> irqbalance needs currently is implied by the fact that it appears in the sysfs
> tree under msi_irqs in the first place).
>
>> It looks like it should be safe at least for irqbalance to make this a
>> one-level attribute.  It's possible we'll break somebody's scripts,
>> but I'm willing to try making this change because it really makes the
>> refcounting much simpler.
>>
> ACK, if you cc me on the patch that will change the sysfs directory structure,
> I'll make the corresponding changes needed to irqblanace in parallel.

I'm hoping *no* changes will be required to irqbalance.  All it seems
to care about are the names of the entries inside "msi_irqs".  The
names will stay the same; they'll just change from being directories
to being files.

If an irqbalance change *is* required, then I'm much more hesitant.  I
don't want to force people to install a new irqbalance along with a
new kernel.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index d5f90d63..53848ab9 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -353,6 +353,9 @@  void write_msi_msg(unsigned int irq, struct msi_msg *msg)
 static void free_msi_irqs(struct pci_dev *dev)
 {
 	struct msi_desc *entry, *tmp;
+	struct attribute **msi_attrs;
+	struct device_attribute *dev_attr;
+	int count = 0;
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
 		int i, nvec;
@@ -388,6 +391,22 @@  static void free_msi_irqs(struct pci_dev *dev)
 		list_del(&entry->list);
 		kfree(entry);
 	}
+
+	if (dev->msi_irq_groups) {
+		sysfs_remove_groups(&dev->dev.kobj, dev->msi_irq_groups);
+		msi_attrs = dev->msi_irq_groups[0]->attrs;
+		list_for_each_entry(entry, &dev->msi_list, list) {
+			dev_attr = container_of(msi_attrs[count],
+						struct device_attribute, attr);
+			kfree(dev_attr->attr.name);
+			kfree(dev_attr);
+			++count;
+		}
+		kfree(msi_attrs);
+		kfree(dev->msi_irq_groups[0]);
+		kfree(dev->msi_irq_groups);
+		dev->msi_irq_groups = NULL;
+	}
 }
 
 static struct msi_desc *alloc_msi_entry(struct pci_dev *dev)
@@ -517,13 +536,79 @@  static struct kobj_type msi_irq_ktype = {
 	.default_attrs = msi_irq_default_attrs,
 };
 
+static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct msi_desc *entry;
+	unsigned long irq;
+	int retval;
+
+	retval = kstrtoul(attr->attr.name, 10, &irq);
+	if (retval)
+		return retval;
+
+	list_for_each_entry(entry, &pdev->msi_list, list) {
+		if (entry->irq == irq) {
+			return sprintf(buf, "%s\n",
+				       entry->msi_attrib.is_msix ? "msix" : "msi");
+		}
+	}
+	return -ENODEV;
+}
+
 static int populate_msi_sysfs(struct pci_dev *pdev)
 {
+	struct attribute **msi_attrs;
+	struct device_attribute *msi_dev_attr;
+	struct attribute_group *msi_irq_group;
+	const struct attribute_group **msi_irq_groups;
 	struct msi_desc *entry;
 	struct kobject *kobj;
 	int ret;
+	int num_msi = 0;
 	int count = 0;
 
+	/* Determine how many msi entries we have */
+	list_for_each_entry(entry, &pdev->msi_list, list) {
+		++num_msi;
+	}
+	if (!num_msi)
+		return 0;
+
+	/* Dynamically create the MSI attributes for the PCI device */
+	msi_attrs = kzalloc(sizeof(void *) * (num_msi + 1), GFP_KERNEL);
+	if (!msi_attrs)
+		return -ENOMEM;
+	list_for_each_entry(entry, &pdev->msi_list, list) {
+		char *name = kmalloc(20, GFP_KERNEL);
+		msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
+		if (!msi_dev_attr)
+			return -ENOMEM;
+		sprintf(name, "%d", entry->irq);
+		msi_dev_attr->attr.name = name;
+		msi_dev_attr->attr.mode = S_IRUGO;
+		msi_dev_attr->show = msi_mode_show;
+		msi_attrs[count] = &msi_dev_attr->attr;
+		++count;
+	}
+
+	msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
+	if (!msi_irq_group)
+		return -ENOMEM;
+	msi_irq_group->name = "msi_irqs_2";
+	msi_irq_group->attrs = msi_attrs;
+
+	msi_irq_groups = kzalloc(sizeof(void *) * 2, GFP_KERNEL);
+	if (!msi_irq_groups)
+		return -ENOMEM;
+	msi_irq_groups[0] = msi_irq_group;
+
+	ret = sysfs_create_groups(&pdev->dev.kobj, msi_irq_groups);
+	if (ret)
+		return ret;
+	pdev->msi_irq_groups = msi_irq_groups;
+
 	pdev->msi_kset = kset_create_and_add("msi_irqs", NULL, &pdev->dev.kobj);
 	if (!pdev->msi_kset)
 		return -ENOMEM;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index da172f95..9540110f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -354,6 +354,7 @@  struct pci_dev {
 #ifdef CONFIG_PCI_MSI
 	struct list_head msi_list;
 	struct kset *msi_kset;
+	const struct attribute_group **msi_irq_groups;
 #endif
 	struct pci_vpd *vpd;
 #ifdef CONFIG_PCI_ATS