diff mbox series

[V7,3/5] platform/x86: Intel PMT class driver

Message ID 20201001014250.26987-4-david.e.box@linux.intel.com
State New
Headers show
Series Intel Platform Monitoring Technology | expand

Commit Message

David E. Box Oct. 1, 2020, 1:42 a.m. UTC
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>

Intel Platform Monitoring Technology is meant to provide a common way to
access telemetry and system metrics.

Register mappings are not provided by the driver. Instead, a GUID is read
from a header for each endpoint. The GUID identifies the device and is to
be used with an XML, provided by the vendor, to discover the available set
of metrics and their register mapping.  This allows firmware updates to
modify the register space without needing to update the driver every time
with new mappings. Firmware writes a new GUID in this case to specify the
new mapping.  Software tools with access to the associated XML file can
then interpret the changes.

The module manages access to all Intel PMT endpoints on a system,
independent of the device exporting them. It creates an intel_pmt class to
manage the devices. For each telemetry endpoint, sysfs files provide GUID
and size information as well as a pointer to the parent device the
telemetry came from. Software may discover the association between
endpoints and devices by iterating through the list in sysfs, or by looking
for the existence of the class folder under the device of interest.  A
binary sysfs attribute of the same name allows software to then read or map
the telemetry space for direct access.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
---
 .../ABI/testing/sysfs-class-intel_pmt         |  54 ++++
 MAINTAINERS                                   |   1 +
 drivers/platform/x86/Kconfig                  |   9 +
 drivers/platform/x86/Makefile                 |   1 +
 drivers/platform/x86/intel_pmt_class.c        | 286 ++++++++++++++++++
 drivers/platform/x86/intel_pmt_class.h        |  57 ++++
 6 files changed, 408 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-class-intel_pmt
 create mode 100644 drivers/platform/x86/intel_pmt_class.c
 create mode 100644 drivers/platform/x86/intel_pmt_class.h

Comments

Andy Shevchenko Oct. 1, 2020, 4:23 p.m. UTC | #1
On Thu, Oct 1, 2020 at 4:43 AM David E. Box <david.e.box@linux.intel.com> wrote:
>
> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>
> Intel Platform Monitoring Technology is meant to provide a common way to
> access telemetry and system metrics.
>
> Register mappings are not provided by the driver. Instead, a GUID is read
> from a header for each endpoint. The GUID identifies the device and is to
> be used with an XML, provided by the vendor, to discover the available set
> of metrics and their register mapping.  This allows firmware updates to
> modify the register space without needing to update the driver every time
> with new mappings. Firmware writes a new GUID in this case to specify the
> new mapping.  Software tools with access to the associated XML file can
> then interpret the changes.

Where one may find a database of these reserved GUIDs / XMLs?
How do you prevent a chaos which happens with other registries?

> The module manages access to all Intel PMT endpoints on a system,
> independent of the device exporting them. It creates an intel_pmt class to
> manage the devices. For each telemetry endpoint, sysfs files provide GUID
> and size information as well as a pointer to the parent device the
> telemetry came from. Software may discover the association between
> endpoints and devices by iterating through the list in sysfs, or by looking
> for the existence of the class folder under the device of interest.  A
> binary sysfs attribute of the same name allows software to then read or map
> the telemetry space for direct access.

What are the security implications by direct access?

...

> +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
> +       { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
> +       { }
> +};
> +bool intel_pmt_is_early_client_hw(struct device *dev)
> +{
> +       struct pci_dev *parent = to_pci_dev(dev->parent);
> +
> +       return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
> +}
> +EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw);

What is this and why is it in the class driver?

> +static ssize_t
> +intel_pmt_read(struct file *filp, struct kobject *kobj,
> +              struct bin_attribute *attr, char *buf, loff_t off,
> +              size_t count)
> +{
> +       struct intel_pmt_entry *entry = container_of(attr,
> +                                                    struct intel_pmt_entry,
> +                                                    pmt_bin_attr);

> +       if (off < 0)
> +               return -EINVAL;

Is this real or theoretical?

> +       if (count)

Useless.

> +               memcpy_fromio(buf, entry->base + off, count);
> +
> +       return count;
> +}

...

> +       psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE;

PFN_PHYS(PFN_UP(...)) ?

...

> +static struct attribute *intel_pmt_attrs[] = {
> +       &dev_attr_guid.attr,
> +       &dev_attr_size.attr,
> +       &dev_attr_offset.attr,
> +       NULL
> +};

> +

Unneeded blank line.

> +ATTRIBUTE_GROUPS(intel_pmt);

...

> +       /* if size is 0 assume no data buffer, so no file needed */
> +       if (!entry->size)
> +               return 0;

Hmm... But presence of the file is also an information that might be
useful for user, no?

...

> +       entry->base = devm_ioremap_resource(dev, &res);

(1)

> +       if (IS_ERR(entry->base)) {

> +               dev_err(dev, "Failed to ioremap device region\n");

Duplicates core message.

> +               ret = -EIO;

Why shadowing real error code?

> +               goto fail_ioremap;
> +       }

> +       iounmap(entry->base);

This is interesting. How do you avoid double unmap with (1)?

> +#include <linux/platform_device.h>
> +#include <linux/xarray.h>
> +
> +/* PMT access types */
> +#define ACCESS_BARID           2
> +#define ACCESS_LOCAL           3
> +
> +/* PMT discovery base address/offset register layout */
> +#define GET_BIR(v)             ((v) & GENMASK(2, 0))
> +#define GET_ADDRESS(v)         ((v) & GENMASK(31, 3))

bits.h

> +struct intel_pmt_entry {
> +       struct bin_attribute    pmt_bin_attr;
> +       struct kobject          *kobj;
> +       void __iomem            *disc_table;
> +       void __iomem            *base;
> +       unsigned long           base_addr;
> +       size_t                  size;

> +       u32                     guid;

types.h

> +       int                     devid;
> +};

> +static inline int
> +intel_pmt_ioremap_discovery_table(struct intel_pmt_entry *entry,
> +                                 struct platform_device *pdev,  int i)
> +{

> +       entry->disc_table = devm_platform_ioremap_resource(pdev, i);

io.h ?

> +
> +       return PTR_ERR_OR_ZERO(entry->disc_table);

err.h

> +}

The rule of thumb is to include all headers that you have direct users of.
Then you may optimize by removing those which are guaranteed to be
included by others, like
bits.h always included by bitops.h.
Alexander H Duyck Oct. 1, 2020, 5:44 p.m. UTC | #2
On Thu, Oct 1, 2020 at 9:26 AM Andy Shevchenko
<andy.shevchenko@gmail.com> wrote:
>
> On Thu, Oct 1, 2020 at 4:43 AM David E. Box <david.e.box@linux.intel.com> wrote:
> >
> > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> >
> > Intel Platform Monitoring Technology is meant to provide a common way to
> > access telemetry and system metrics.
> >
> > Register mappings are not provided by the driver. Instead, a GUID is read
> > from a header for each endpoint. The GUID identifies the device and is to
> > be used with an XML, provided by the vendor, to discover the available set
> > of metrics and their register mapping.  This allows firmware updates to
> > modify the register space without needing to update the driver every time
> > with new mappings. Firmware writes a new GUID in this case to specify the
> > new mapping.  Software tools with access to the associated XML file can
> > then interpret the changes.
>
> Where one may find a database of these reserved GUIDs / XMLs?
> How do you prevent a chaos which happens with other registries?

The database will be posted on intel.com eventually. Although I don't
believe the URL is public yet.

> > The module manages access to all Intel PMT endpoints on a system,
> > independent of the device exporting them. It creates an intel_pmt class to
> > manage the devices. For each telemetry endpoint, sysfs files provide GUID
> > and size information as well as a pointer to the parent device the
> > telemetry came from. Software may discover the association between
> > endpoints and devices by iterating through the list in sysfs, or by looking
> > for the existence of the class folder under the device of interest.  A
> > binary sysfs attribute of the same name allows software to then read or map
> > the telemetry space for direct access.
>
> What are the security implications by direct access?

In this case minimal as it would really be no different than the read.
The registers in the memory regions themselves are read-only with no
read side effects.

> ...
>
> > +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
> > +       { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
> > +       { }
> > +};
> > +bool intel_pmt_is_early_client_hw(struct device *dev)
> > +{
> > +       struct pci_dev *parent = to_pci_dev(dev->parent);
> > +
> > +       return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
> > +}
> > +EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw);
>
> What is this and why is it in the class driver?

I chose to use the class driver as a central place to store code
common to all of the instances of the class. In this case we have
quirks that are specific to Tiger Lake and so I chose to store the
function to test for the device here.

> > +static ssize_t
> > +intel_pmt_read(struct file *filp, struct kobject *kobj,
> > +              struct bin_attribute *attr, char *buf, loff_t off,
> > +              size_t count)
> > +{
> > +       struct intel_pmt_entry *entry = container_of(attr,
> > +                                                    struct intel_pmt_entry,
> > +                                                    pmt_bin_attr);
>
> > +       if (off < 0)
> > +               return -EINVAL;
> Is this real or theoretical?

Not sure. I am not that familiar with the interface. It was something
I copied from read_bmof which is what I based this code on based on an
earlier suggestion.

> > +       if (count)
>
> Useless.

I'm assuming that is because memcpy_fromio is assumed to handle this case?

> > +               memcpy_fromio(buf, entry->base + off, count);
> > +
> > +       return count;
> > +}
>
> ...
>
> > +       psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE;
>
> PFN_PHYS(PFN_UP(...)) ?

I'm not sure how that would work. Basically what we are doing here is
determining the size of the mapping based on the number of pages that
will be needed. So we wake the pfn of the start of the region,
subtract that from the pfn for the end of the region and multiply by
the size of a page.

> > +static struct attribute *intel_pmt_attrs[] = {
> > +       &dev_attr_guid.attr,
> > +       &dev_attr_size.attr,
> > +       &dev_attr_offset.attr,
> > +       NULL
> > +};
>
> > +
>
> Unneeded blank line.
>
> > +ATTRIBUTE_GROUPS(intel_pmt);
>
> ...
>
> > +       /* if size is 0 assume no data buffer, so no file needed */
> > +       if (!entry->size)
> > +               return 0;
>
> Hmm... But presence of the file is also an information that might be
> useful for user, no?

I'm not sure what you mean? If the size of the region is zero it means
there is no data there. There are cases in future devices where we may
have controls for a telemetry function that is actually streaming the
data elsewhere. That will be the use case for the entry size of 0 and
in that case it doesn't make any sense to have a file as there is no
data to present in it.

> ...
>
> > +       entry->base = devm_ioremap_resource(dev, &res);
>
> (1)
>
> > +       if (IS_ERR(entry->base)) {
>
> > +               dev_err(dev, "Failed to ioremap device region\n");
>
> Duplicates core message.

I'll drop it since it is a redundant.

> > +               ret = -EIO;
>
> Why shadowing real error code?

I will just convert it instead. I think there might have been a bug
here in an earlier version where it was testing for NULL instead.

> > +               goto fail_ioremap;
> > +       }
>
> > +       iounmap(entry->base);
>
> This is interesting. How do you avoid double unmap with (1)?

I think I get what you are trying to say. This is redundant since we
used the devm_ioremap_resource it will already be freed when the
driver is detached, correct?

> > +#include <linux/platform_device.h>
> > +#include <linux/xarray.h>
> > +
> > +/* PMT access types */
> > +#define ACCESS_BARID           2
> > +#define ACCESS_LOCAL           3
> > +
> > +/* PMT discovery base address/offset register layout */
> > +#define GET_BIR(v)             ((v) & GENMASK(2, 0))
> > +#define GET_ADDRESS(v)         ((v) & GENMASK(31, 3))
>
> bits.h

That is already included from a few different sources.

> > +struct intel_pmt_entry {
> > +       struct bin_attribute    pmt_bin_attr;
> > +       struct kobject          *kobj;
> > +       void __iomem            *disc_tabl
> > +       void __iomem            *base;
> > +       unsigned long           base_addr;
> > +       size_t                  size;
>
> > +       u32                     guid;
>
> types.h

Is included through xarray.h.

> > +       int                     devid;
> > +};
>
> > +static inline int
> > +intel_pmt_ioremap_discovery_table(struct intel_pmt_entry *entry,
> > +                                 struct platform_device *pdev,  int i)
> > +{
>
> > +       entry->disc_table = devm_platform_ioremap_resource(pdev, i);
>
> io.h ?

That one I will move from the class.c file.

> > +
> > +       return PTR_ERR_OR_ZERO(entry->disc_table);
>
> err.h

That is included as a part of io.h.

> > +}
>
> The rule of thumb is to include all headers that you have direct users of.
> Then you may optimize by removing those which are guaranteed to be
> included by others, like bits.h always included by bitops.h.

Yeah, from what I can tell the only one I didn't have was io.h and
part of that is because this was something I had moved to the header
file in order to commonize it since it was being used in the other
drivers.
Andy Shevchenko Oct. 1, 2020, 6:06 p.m. UTC | #3
On Thu, Oct 1, 2020 at 8:44 PM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> On Thu, Oct 1, 2020 at 9:26 AM Andy Shevchenko
> <andy.shevchenko@gmail.com> wrote:
> > On Thu, Oct 1, 2020 at 4:43 AM David E. Box <david.e.box@linux.intel.com> wrote:

...

> > > Intel Platform Monitoring Technology is meant to provide a common way to
> > > access telemetry and system metrics.
> > >
> > > Register mappings are not provided by the driver. Instead, a GUID is read
> > > from a header for each endpoint. The GUID identifies the device and is to
> > > be used with an XML, provided by the vendor, to discover the available set
> > > of metrics and their register mapping.  This allows firmware updates to
> > > modify the register space without needing to update the driver every time
> > > with new mappings. Firmware writes a new GUID in this case to specify the
> > > new mapping.  Software tools with access to the associated XML file can
> > > then interpret the changes.
> >
> > Where one may find a database of these reserved GUIDs / XMLs?
> > How do you prevent a chaos which happens with other registries?
>
> The database will be posted on intel.com eventually. Although I don't
> believe the URL is public yet.

How can we be sure that this won't be forgotten? How can we be sure it
will be public at the end? Please, elaborate this in the commit
message.

...

> > > +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
> > > +       { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
> > > +       { }
> > > +};
> > > +bool intel_pmt_is_early_client_hw(struct device *dev)
> > > +{
> > > +       struct pci_dev *parent = to_pci_dev(dev->parent);
> > > +
> > > +       return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
> > > +}
> > > +EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw);
> >
> > What is this and why is it in the class driver?
>
> I chose to use the class driver as a central place to store code
> common to all of the instances of the class. In this case we have
> quirks that are specific to Tiger Lake and so I chose to store the
> function to test for the device here.

Can it be done in another file module at least (let's say intel_pmt_quirks.c)?

...

> > > +       if (off < 0)
> > > +               return -EINVAL;
> > Is this real or theoretical?
>
> Not sure. I am not that familiar with the interface. It was something
> I copied from read_bmof which is what I based this code on based on an
> earlier suggestion.

I'm not a fan of cargo cult when there is no understanding why certain
code appears in the driver.

...

> > > +       if (count)
> >
> > Useless.
>
> I'm assuming that is because memcpy_fromio is assumed to handle this case?

Right.

> > > +               memcpy_fromio(buf, entry->base + off, count);

...

> > > +       psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE;
> >
> > PFN_PHYS(PFN_UP(...)) ?
>
> I'm not sure how that would work. Basically what we are doing here is
> determining the size of the mapping based on the number of pages that
> will be needed. So we wake the pfn of the start of the region,
> subtract that from the pfn for the end of the region and multiply by
> the size of a page.

PFN_PHYS() is a replacement for multiplication. You may check its
implementation.

...

> > > +       /* if size is 0 assume no data buffer, so no file needed */
> > > +       if (!entry->size)
> > > +               return 0;
> >
> > Hmm... But presence of the file is also an information that might be
> > useful for user, no?
>
> I'm not sure what you mean? If the size of the region is zero it means
> there is no data there. There are cases in future devices where we may
> have controls for a telemetry function that is actually streaming the
> data elsewhere. That will be the use case for the entry size of 0 and
> in that case it doesn't make any sense to have a file as there is no
> data to present in it.

This is not understandable from 'no data buffer' above. Can you
elaborate in the comment?

...

> > > +       entry->base = devm_ioremap_resource(dev, &res);
> > > +       if (IS_ERR(entry->base)) {
> > > +               dev_err(dev, "Failed to ioremap device region\n");
> > > +               goto fail_ioremap;
> > > +       }
> >
> > > +       iounmap(entry->base);
> >
> > This is interesting. How do you avoid double unmap with (1)?
>
> I think I get what you are trying to say. This is redundant since we
> used the devm_ioremap_resource it will already be freed when the
> driver is detached, correct?

Right. Above is the leftover that shows the poor testing of v7.

> > > +#include <linux/platform_device.h>
> > > +#include <linux/xarray.h>
> > > +
> > > +/* PMT access types */
> > > +#define ACCESS_BARID           2
> > > +#define ACCESS_LOCAL           3
> > > +
> > > +/* PMT discovery base address/offset register layout */
> > > +#define GET_BIR(v)             ((v) & GENMASK(2, 0))
> > > +#define GET_ADDRESS(v)         ((v) & GENMASK(31, 3))
> >
> > bits.h
>
> That is already included from a few different sources.

Please, read again what I wrote below. Ditto for other inclusions.

> > > +struct intel_pmt_entry {

> > > +       struct bin_attribute    pmt_bin_attr;
> > > +       struct kobject          *kobj;
> > > +       void __iomem            *disc_tabl
> > > +       void __iomem            *base;
> > > +       unsigned long           base_addr;
> > > +       size_t                  size;
> >
> > > +       u32                     guid;
> >
> > types.h
>
> Is included through xarray.h.
>
> > > +       int                     devid;
> > > +};
> >
> > > +static inline int
> > > +intel_pmt_ioremap_discovery_table(struct intel_pmt_entry *entry,
> > > +                                 struct platform_device *pdev,  int i)
> > > +{
> >
> > > +       entry->disc_table = devm_platform_ioremap_resource(pdev, i);
> >
> > io.h ?
>
> That one I will move from the class.c file.
>
> > > +
> > > +       return PTR_ERR_OR_ZERO(entry->disc_table);
> >
> > err.h
>
> That is included as a part of io.h.
>
> > > +}
> >
> > The rule of thumb is to include all headers that you have direct users of.
> > Then you may optimize by removing those which are guaranteed to be
> > included by others, like bits.h always included by bitops.h.
>
> Yeah, from what I can tell the only one I didn't have was io.h and
> part of that is because this was something I had moved to the header
> file in order to commonize it since it was being used in the other
> drivers.

types.h not necessarily be included by above and so on...
Alexander H Duyck Oct. 1, 2020, 7:02 p.m. UTC | #4
On Thu, Oct 1, 2020 at 11:06 AM Andy Shevchenko
<andy.shevchenko@gmail.com> wrote:
>
> On Thu, Oct 1, 2020 at 8:44 PM Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
> > On Thu, Oct 1, 2020 at 9:26 AM Andy Shevchenko
> > <andy.shevchenko@gmail.com> wrote:
> > > On Thu, Oct 1, 2020 at 4:43 AM David E. Box <david.e.box@linux.intel.com> wrote:
>
> ...
>
> > > > Intel Platform Monitoring Technology is meant to provide a common way to
> > > > access telemetry and system metrics.
> > > >
> > > > Register mappings are not provided by the driver. Instead, a GUID is read
> > > > from a header for each endpoint. The GUID identifies the device and is to
> > > > be used with an XML, provided by the vendor, to discover the available set
> > > > of metrics and their register mapping.  This allows firmware updates to
> > > > modify the register space without needing to update the driver every time
> > > > with new mappings. Firmware writes a new GUID in this case to specify the
> > > > new mapping.  Software tools with access to the associated XML file can
> > > > then interpret the changes.
> > >
> > > Where one may find a database of these reserved GUIDs / XMLs?
> > > How do you prevent a chaos which happens with other registries?
> >
> > The database will be posted on intel.com eventually. Although I don't
> > believe the URL is public yet.
>
> How can we be sure that this won't be forgotten? How can we be sure it
> will be public at the end? Please, elaborate this in the commit
> message.

Okay, I will work with David on that.

> ...
>
> > > > +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
> > > > +       { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
> > > > +       { }
> > > > +};
> > > > +bool intel_pmt_is_early_client_hw(struct device *dev)
> > > > +{
> > > > +       struct pci_dev *parent = to_pci_dev(dev->parent);
> > > > +
> > > > +       return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw);
> > >
> > > What is this and why is it in the class driver?
> >
> > I chose to use the class driver as a central place to store code
> > common to all of the instances of the class. In this case we have
> > quirks that are specific to Tiger Lake and so I chose to store the
> > function to test for the device here.
>
> Can it be done in another file module at least (let's say intel_pmt_quirks.c)?

I suppose, but then it is adding a file for essentially 13 lines of
code. Maybe I will just move this back to intel_pmt_telemetry.c and we
can revisit where this should go if/when we add the watcher driver.

> ...
>
> > > > +       if (off < 0)
> > > > +               return -EINVAL;
> > > Is this real or theoretical?
> >
> > Not sure. I am not that familiar with the interface. It was something
> > I copied from read_bmof which is what I based this code on based on an
> > earlier suggestion.
>
> I'm not a fan of cargo cult when there is no understanding why certain
> code appears in the driver.

Well with something like this I usually question if it provides any
value. The problem is I don't know enough about binary sysfs
attributes to say one way or another. If you know that the offset
provided cannot be negative I can drop it. However I haven't seen
anything that seems to say one way or another.

> ...
>
> > > > +       if (count)
> > >
> > > Useless.
> >
> > I'm assuming that is because memcpy_fromio is assumed to handle this case?
>
> Right.
>
> > > > +               memcpy_fromio(buf, entry->base + off, count);
>
> ...
>
> > > > +       psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE;
> > >
> > > PFN_PHYS(PFN_UP(...)) ?
> >
> > I'm not sure how that would work. Basically what we are doing here is
> > determining the size of the mapping based on the number of pages that
> > will be needed. So we wake the pfn of the start of the region,
> > subtract that from the pfn for the end of the region and multiply by
> > the size of a page.
>
> PFN_PHYS() is a replacement for multiplication. You may check its
> implementation.

Ah, okay so you meant PFN_PHYS(PFN_UP(...)-pfn)

> ...
>
> > > > +       /* if size is 0 assume no data buffer, so no file needed */
> > > > +       if (!entry->size)
> > > > +               return 0;
> > >
> > > Hmm... But presence of the file is also an information that might be
> > > useful for user, no?
> >
> > I'm not sure what you mean? If the size of the region is zero it means
> > there is no data there. There are cases in future devices where we may
> > have controls for a telemetry function that is actually streaming the
> > data elsewhere. That will be the use case for the entry size of 0 and
> > in that case it doesn't make any sense to have a file as there is no
> > data to present in it.
>
> This is not understandable from 'no data buffer' above. Can you
> elaborate in the comment?

Having a file here implies there is something to read. There are
devices supported by PMT which are essentially just control only,
specifically some of the watchers. So the file would have nothing to
read/mmap. In such a case it wouldn't make sense to provide a binary
sysfs attribute as there is no data to associate with it.

> ...
>
> > > > +       entry->base = devm_ioremap_resource(dev, &res);
> > > > +       if (IS_ERR(entry->base)) {
> > > > +               dev_err(dev, "Failed to ioremap device region\n");
> > > > +               goto fail_ioremap;
> > > > +       }
> > >
> > > > +       iounmap(entry->base);
> > >
> > > This is interesting. How do you avoid double unmap with (1)?
> >
> > I think I get what you are trying to say. This is redundant since we
> > used the devm_ioremap_resource it will already be freed when the
> > driver is detached, correct?
>
> Right. Above is the leftover that shows the poor testing of v7.

I will admit I don't think we tested the case where
sysfs_create_bin_file failed.

> > > > +#include <linux/platform_device.h>
> > > > +#include <linux/xarray.h>
> > > > +
> > > > +/* PMT access types */
> > > > +#define ACCESS_BARID           2
> > > > +#define ACCESS_LOCAL           3
> > > > +
> > > > +/* PMT discovery base address/offset register layout */
> > > > +#define GET_BIR(v)             ((v) & GENMASK(2, 0))
> > > > +#define GET_ADDRESS(v)         ((v) & GENMASK(31, 3))
> > >
> > > bits.h
> >
> > That is already included from a few different sources.
>
> Please, read again what I wrote below. Ditto for other inclusions.
>
> > > > +struct intel_pmt_entry {
>
> > > > +       struct bin_attribute    pmt_bin_attr;
> > > > +       struct kobject          *kobj;
> > > > +       void __iomem            *disc_tabl
> > > > +       void __iomem            *base;
> > > > +       unsigned long           base_addr;
> > > > +       size_t                  size;
> > >
> > > > +       u32                     guid;
> > >
> > > types.h
> >
> > Is included through xarray.h.
> >
> > > > +       int                     devid;
> > > > +};
> > >
> > > > +static inline int
> > > > +intel_pmt_ioremap_discovery_table(struct intel_pmt_entry *entry,
> > > > +                                 struct platform_device *pdev,  int i)
> > > > +{
> > >
> > > > +       entry->disc_table = devm_platform_ioremap_resource(pdev, i);
> > >
> > > io.h ?
> >
> > That one I will move from the class.c file.
> >
> > > > +
> > > > +       return PTR_ERR_OR_ZERO(entry->disc_table);
> > >
> > > err.h
> >
> > That is included as a part of io.h.
> >
> > > > +}
> > >
> > > The rule of thumb is to include all headers that you have direct users of.
> > > Then you may optimize by removing those which are guaranteed to be
> > > included by others, like bits.h always included by bitops.h.
> >
> > Yeah, from what I can tell the only one I didn't have was io.h and
> > part of that is because this was something I had moved to the header
> > file in order to commonize it since it was being used in the other
> > drivers.
>
> types.h not necessarily be included by above and so on...

Okay, I will just explicitly add those headers then.
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt
new file mode 100644
index 000000000000..926b5cf95fd1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-intel_pmt
@@ -0,0 +1,54 @@ 
+What:		/sys/class/intel_pmt/
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		The intel_pmt/ class directory contains information for
+		devices that expose hardware telemetry using Intel Platform
+		Monitoring Technology (PMT)
+
+What:		/sys/class/intel_pmt/telem<x>
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		The telem<x> directory contains files describing an instance of
+		a PMT telemetry device that exposes hardware telemetry. Each
+		telem<x> directory has an associated telem file. This file
+		may be opened and mapped or read to access the telemetry space
+		of the device. The register layout of the telemetry space is
+		determined from an XML file that matches the PCI device id and
+		GUID for the device.
+
+What:		/sys/class/intel_pmt/telem<x>/telem
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The telemetry data for this telemetry device. This file
+		may be mapped or read to obtain the data.
+
+What:		/sys/class/intel_pmt/telem<x>/guid
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The GUID for this telemetry device. The GUID identifies
+		the version of the XML file for the parent device that is to
+		be used to get the register layout.
+
+What:		/sys/class/intel_pmt/telem<x>/size
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The size of telemetry region in bytes that corresponds to
+		the mapping size for the telem file.
+
+What:		/sys/class/intel_pmt/telem<x>/offset
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The offset of telemetry region in bytes that corresponds to
+		the mapping for the telem file.
diff --git a/MAINTAINERS b/MAINTAINERS
index 0f2663b1d376..47fdb8a6e151 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8950,6 +8950,7 @@  INTEL PMT DRIVER
 M:	"David E. Box" <david.e.box@linux.intel.com>
 S:	Maintained
 F:	drivers/mfd/intel_pmt.c
+F:	drivers/platform/x86/intel_pmt_*
 
 INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT
 M:	Stanislav Yakovlev <stas.yakovlev@gmail.com>
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 40219bba6801..82465d0e8fd3 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1360,6 +1360,15 @@  config INTEL_PMC_CORE
 		- LTR Ignore
 		- MPHY/PLL gating status (Sunrisepoint PCH only)
 
+config INTEL_PMT_CLASS
+	tristate "Intel Platform Monitoring Technology (PMT) Class driver"
+	help
+	  The Intel Platform Monitoring Technology (PMT) class driver provides
+	  the basic sysfs interface and file hierarchy uses by PMT devices.
+
+	  For more information, see:
+	  <file:Documentation/ABI/testing/sysfs-class-intel_pmt>
+
 config INTEL_PUNIT_IPC
 	tristate "Intel P-Unit IPC Driver"
 	help
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 5f823f7eff45..f4b1f87f2401 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -140,6 +140,7 @@  obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
 obj-$(CONFIG_INTEL_MID_POWER_BUTTON)	+= intel_mid_powerbtn.o
 obj-$(CONFIG_INTEL_MRFLD_PWRBTN)	+= intel_mrfld_pwrbtn.o
 obj-$(CONFIG_INTEL_PMC_CORE)		+= intel_pmc_core.o intel_pmc_core_pltdrv.o
+obj-$(CONFIG_INTEL_PMT_CLASS)		+= intel_pmt_class.o
 obj-$(CONFIG_INTEL_PUNIT_IPC)		+= intel_punit_ipc.o
 obj-$(CONFIG_INTEL_SCU_IPC)		+= intel_scu_ipc.o
 obj-$(CONFIG_INTEL_SCU_PCI)		+= intel_scu_pcidrv.o
diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c
new file mode 100644
index 000000000000..d7726042d6dc
--- /dev/null
+++ b/drivers/platform/x86/intel_pmt_class.c
@@ -0,0 +1,286 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Platform Monitory Technology Telemetry driver
+ *
+ * Copyright (c) 2020, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Author: "Alexander Duyck" <alexander.h.duyck@linux.intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+
+#include "intel_pmt_class.h"
+
+#define PMT_XA_START		0
+#define PMT_XA_MAX		INT_MAX
+#define PMT_XA_LIMIT		XA_LIMIT(PMT_XA_START, PMT_XA_MAX)
+
+static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
+	{ PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
+	{ }
+};
+
+bool intel_pmt_is_early_client_hw(struct device *dev)
+{
+	struct pci_dev *parent = to_pci_dev(dev->parent);
+
+	return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
+}
+EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw);
+
+/*
+ * sysfs
+ */
+static ssize_t
+intel_pmt_read(struct file *filp, struct kobject *kobj,
+	       struct bin_attribute *attr, char *buf, loff_t off,
+	       size_t count)
+{
+	struct intel_pmt_entry *entry = container_of(attr,
+						     struct intel_pmt_entry,
+						     pmt_bin_attr);
+
+	if (off < 0)
+		return -EINVAL;
+
+	if (off >= entry->size)
+		return 0;
+
+	if (count > entry->size - off)
+		count = entry->size - off;
+
+	if (count)
+		memcpy_fromio(buf, entry->base + off, count);
+
+	return count;
+}
+
+static int
+intel_pmt_mmap(struct file *filp, struct kobject *kobj,
+		struct bin_attribute *attr, struct vm_area_struct *vma)
+{
+	struct intel_pmt_entry *entry = container_of(attr,
+						     struct intel_pmt_entry,
+						     pmt_bin_attr);
+	unsigned long vsize = vma->vm_end - vma->vm_start;
+	struct device *dev = kobj_to_dev(kobj);
+	unsigned long phys = entry->base_addr;
+	unsigned long pfn = PFN_DOWN(phys);
+	unsigned long psize;
+
+	if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE))
+		return -EROFS;
+
+	psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE;
+	if (vsize > psize) {
+		dev_err(dev, "Requested mmap size is too large\n");
+		return -EINVAL;
+	}
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	if (io_remap_pfn_range(vma, vma->vm_start, pfn,
+		vsize, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+static ssize_t
+guid_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+
+	return sprintf(buf, "0x%x\n", entry->guid);
+}
+static DEVICE_ATTR_RO(guid);
+
+static ssize_t size_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%zu\n", entry->size);
+}
+static DEVICE_ATTR_RO(size);
+
+static ssize_t
+offset_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%lu\n", offset_in_page(entry->base_addr));
+}
+static DEVICE_ATTR_RO(offset);
+
+static struct attribute *intel_pmt_attrs[] = {
+	&dev_attr_guid.attr,
+	&dev_attr_size.attr,
+	&dev_attr_offset.attr,
+	NULL
+};
+
+ATTRIBUTE_GROUPS(intel_pmt);
+
+static struct class intel_pmt_class = {
+	.name = "intel_pmt",
+	.owner = THIS_MODULE,
+	.dev_groups = intel_pmt_groups,
+};
+
+int intel_pmt_populate_entry(struct intel_pmt_entry *entry,
+			     struct intel_pmt_header *header,
+			     struct device *dev, struct resource *disc_res)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev->parent);
+	u8 bir;
+
+	/*
+	 * The base offset should always be 8 byte aligned.
+	 *
+	 * For non-local access types the lower 3 bits of base offset
+	 * contains the index of the base address register where the
+	 * telemetry can be found.
+	 */
+	bir = GET_BIR(header->base_offset);
+
+	/* Local access and BARID only for now */
+	switch (header->access_type) {
+	case ACCESS_LOCAL:
+		if (bir) {
+			dev_err(dev,
+				"Unsupported BAR index %d for access type %d\n",
+				bir, header->access_type);
+			return -EINVAL;
+		}
+		/*
+		 * For access_type LOCAL, the base address is as follows:
+		 * base address = end of discovery region + base offset
+		 */
+		entry->base_addr = disc_res->end + 1 + header->base_offset;
+		break;
+	case ACCESS_BARID:
+		/*
+		 * If another BAR was specified then the base offset
+		 * represents the offset within that BAR. SO retrieve the
+		 * address from the parent PCI device and add offset.
+		 */
+		entry->base_addr = pci_resource_start(pci_dev, bir) +
+				   GET_ADDRESS(header->base_offset);
+		break;
+	default:
+		dev_err(dev, "Unsupported access type %d\n",
+			header->access_type);
+		return -EINVAL;
+	}
+
+	entry->guid = header->guid;
+	entry->size = header->size;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(intel_pmt_populate_entry);
+
+int intel_pmt_dev_create(struct intel_pmt_entry *entry,
+			 struct intel_pmt_namespace *ns, struct device *parent)
+{
+	struct resource res;
+	struct device *dev;
+	int ret;
+
+	ret = xa_alloc(ns->xa, &entry->devid, entry, PMT_XA_LIMIT, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	dev = device_create(&intel_pmt_class, parent, MKDEV(0, 0), entry,
+			    "%s%d", ns->name, entry->devid);
+
+	if (IS_ERR(dev)) {
+		dev_err(parent, "Could not create %s%d device node\n",
+			ns->name, entry->devid);
+		ret = PTR_ERR(dev);
+		goto fail_dev_create;
+	}
+
+	entry->kobj = &dev->kobj;
+
+	if (ns->attr_grp) {
+		ret = sysfs_create_group(entry->kobj, ns->attr_grp);
+		if (ret)
+			goto fail_sysfs;
+	}
+
+	/* if size is 0 assume no data buffer, so no file needed */
+	if (!entry->size)
+		return 0;
+
+	res.start = entry->base_addr;
+	res.end = res.start + entry->size - 1;
+	res.flags = IORESOURCE_MEM;
+
+	entry->base = devm_ioremap_resource(dev, &res);
+	if (IS_ERR(entry->base)) {
+		dev_err(dev, "Failed to ioremap device region\n");
+		ret = -EIO;
+		goto fail_ioremap;
+	}
+
+	sysfs_bin_attr_init(&entry->pmt_bin_attr);
+	entry->pmt_bin_attr.attr.name = ns->name;
+	entry->pmt_bin_attr.attr.mode = 0440;
+	entry->pmt_bin_attr.mmap = intel_pmt_mmap;
+	entry->pmt_bin_attr.read = intel_pmt_read;
+	entry->pmt_bin_attr.size = entry->size;
+
+	ret = sysfs_create_bin_file(&dev->kobj, &entry->pmt_bin_attr);
+	if (!ret)
+		return 0;
+
+	iounmap(entry->base);
+fail_ioremap:
+	sysfs_remove_group(entry->kobj, ns->attr_grp);
+fail_sysfs:
+	device_unregister(dev);
+fail_dev_create:
+	xa_erase(ns->xa, entry->devid);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(intel_pmt_dev_create);
+
+void intel_pmt_dev_destroy(struct intel_pmt_entry *entry,
+			   struct intel_pmt_namespace *ns)
+{
+	struct device *dev = kobj_to_dev(entry->kobj);
+
+	if (entry->size)
+		sysfs_remove_bin_file(entry->kobj, &entry->pmt_bin_attr);
+
+	if (ns->attr_grp)
+		sysfs_remove_group(entry->kobj, ns->attr_grp);
+
+	device_unregister(dev);
+	xa_erase(ns->xa, entry->devid);
+}
+EXPORT_SYMBOL_GPL(intel_pmt_dev_destroy);
+
+static int __init pmt_class_init(void)
+{
+	return class_register(&intel_pmt_class);
+}
+
+static void __exit pmt_class_exit(void)
+{
+	class_unregister(&intel_pmt_class);
+}
+
+module_init(pmt_class_init);
+module_exit(pmt_class_exit);
+
+MODULE_AUTHOR("Alexander Duyck <alexander.h.duyck@linux.intel.com>");
+MODULE_DESCRIPTION("Intel PMT Class driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h
new file mode 100644
index 000000000000..ccb4e56a1984
--- /dev/null
+++ b/drivers/platform/x86/intel_pmt_class.h
@@ -0,0 +1,57 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _INTEL_PMT_CLASS_H
+#define _INTEL_PMT_CLASS_H
+
+#include <linux/platform_device.h>
+#include <linux/xarray.h>
+
+/* PMT access types */
+#define ACCESS_BARID		2
+#define ACCESS_LOCAL		3
+
+/* PMT discovery base address/offset register layout */
+#define GET_BIR(v)		((v) & GENMASK(2, 0))
+#define GET_ADDRESS(v)		((v) & GENMASK(31, 3))
+
+struct intel_pmt_entry {
+	struct bin_attribute	pmt_bin_attr;
+	struct kobject		*kobj;
+	void __iomem		*disc_table;
+	void __iomem		*base;
+	unsigned long		base_addr;
+	size_t			size;
+	u32			guid;
+	int			devid;
+};
+
+struct intel_pmt_header {
+	u32	base_offset;
+	u32	size;
+	u32	guid;
+	u8	access_type;
+};
+
+struct intel_pmt_namespace {
+	const char *name;
+	struct xarray *xa;
+	const struct attribute_group *attr_grp;
+};
+
+bool intel_pmt_is_early_client_hw(struct device *dev);
+int intel_pmt_populate_entry(struct intel_pmt_entry *entry,
+			     struct intel_pmt_header *header,
+			     struct device *dev, struct resource *disc_res);
+int intel_pmt_dev_create(struct intel_pmt_entry *entry,
+			 struct intel_pmt_namespace *ns, struct device *parent);
+void intel_pmt_dev_destroy(struct intel_pmt_entry *entry,
+			   struct intel_pmt_namespace *ns);
+
+static inline int
+intel_pmt_ioremap_discovery_table(struct intel_pmt_entry *entry,
+				  struct platform_device *pdev,  int i)
+{
+	entry->disc_table = devm_platform_ioremap_resource(pdev, i);
+
+	return PTR_ERR_OR_ZERO(entry->disc_table);
+}
+#endif