diff mbox series

[linux,dev-4.10,6/6] drivers/hwmon: Add a driver for a generic PECI hwmon

Message ID 20180109223126.13093-7-jae.hyun.yoo@linux.intel.com
State Not Applicable, archived
Headers show
Series Add support PECI and PECI hwmon drivers | expand

Commit Message

Jae Hyun Yoo Jan. 9, 2018, 10:31 p.m. UTC
This commit adds driver implementation for a generic PECI hwmon.

Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>
---
 drivers/hwmon/Kconfig      |   6 +
 drivers/hwmon/Makefile     |   1 +
 drivers/hwmon/peci-hwmon.c | 953 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 960 insertions(+)
 create mode 100644 drivers/hwmon/peci-hwmon.c

Comments

Arnd Bergmann Jan. 10, 2018, 12:29 p.m. UTC | #1
On Tue, Jan 9, 2018 at 11:31 PM, Jae Hyun Yoo
<jae.hyun.yoo@linux.intel.com> wrote:
> This commit adds driver implementation for a generic PECI hwmon.
>
> Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>

> +static int xfer_peci_msg(int cmd, void *pmsg)
> +{
> +       int rc;
> +
> +       mutex_lock(&peci_hwmon_lock);
> +       rc = peci_ioctl(NULL, cmd, (unsigned long)pmsg);
> +       mutex_unlock(&peci_hwmon_lock);
> +
> +       return rc;
> +}

I said earlier that peci_ioctl() looked unused, that was obviously
wrong, but what you have here
is not a proper way to abstract a bus.

Maybe this can be done more like an i2c bus: make the peci controller
a bus device
and register all known target/index pairs as devices with the peci bus
type, and have
them probed from DT. The driver can then bind to each of those individually.
Not sure if that is getting to granular at that point, I'd have to
understand better
how it is expected to get used, and what the variances are between
implementations.

       Arnd
Guenter Roeck Jan. 10, 2018, 9:47 p.m. UTC | #2
On Tue, Jan 09, 2018 at 02:31:26PM -0800, Jae Hyun Yoo wrote:
> This commit adds driver implementation for a generic PECI hwmon.
> 
> Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>
> ---
>  drivers/hwmon/Kconfig      |   6 +
>  drivers/hwmon/Makefile     |   1 +
>  drivers/hwmon/peci-hwmon.c | 953 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 960 insertions(+)
>  create mode 100644 drivers/hwmon/peci-hwmon.c
> 
> diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
> index 9256dd0..3a62c60 100644
> --- a/drivers/hwmon/Kconfig
> +++ b/drivers/hwmon/Kconfig
> @@ -1234,6 +1234,12 @@ config SENSORS_NCT7904
>  	  This driver can also be built as a module.  If so, the module
>  	  will be called nct7904.
>  
> +config SENSORS_PECI_HWMON
> +	tristate "PECI hwmon support"
> +	depends on ASPEED_PECI
> +	help
> +	  If you say yes here you get support for the generic PECI hwmon driver.
> +
>  config SENSORS_NSA320
>  	tristate "ZyXEL NSA320 and compatible fan speed and temperature sensors"
>  	depends on GPIOLIB && OF
> diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
> index 98000fc..41d43a5 100644
> --- a/drivers/hwmon/Makefile
> +++ b/drivers/hwmon/Makefile
> @@ -131,6 +131,7 @@ obj-$(CONFIG_SENSORS_NCT7802)	+= nct7802.o
>  obj-$(CONFIG_SENSORS_NCT7904)	+= nct7904.o
>  obj-$(CONFIG_SENSORS_NSA320)	+= nsa320-hwmon.o
>  obj-$(CONFIG_SENSORS_NTC_THERMISTOR)	+= ntc_thermistor.o
> +obj-$(CONFIG_SENSORS_PECI_HWMON)	+= peci-hwmon.o
>  obj-$(CONFIG_SENSORS_PC87360)	+= pc87360.o
>  obj-$(CONFIG_SENSORS_PC87427)	+= pc87427.o
>  obj-$(CONFIG_SENSORS_PCF8591)	+= pcf8591.o
> diff --git a/drivers/hwmon/peci-hwmon.c b/drivers/hwmon/peci-hwmon.c
> new file mode 100644
> index 0000000..2d2a288
> --- /dev/null
> +++ b/drivers/hwmon/peci-hwmon.c
> @@ -0,0 +1,953 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2017 Intel Corporation
> +
> +#include <linux/delay.h>
> +#include <linux/hwmon.h>
> +#include <linux/hwmon-sysfs.h>
> +#include <linux/jiffies.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/of_platform.h>
> +#include <linux/of_device.h>
> +#include <linux/platform_device.h>
> +#include <linux/syscalls.h>
> +#include <misc/peci.h>

misc, not linux ? That seems wrong.

> +
> +#define DEVICE_NAME "peci-hwmon"
> +#define HWMON_NAME "peci_hwmon"
> +
> +#define CPU_ID_MAX           8   /* Max CPU number configured by socket ID */
> +#define DIMM_NUMS_MAX        16  /* Max DIMM numbers (channel ranks x 2) */
> +#define CORE_NUMS_MAX        28  /* Max core numbers (max on SKX Platinum) */

I won't insist, but it would be better if this were dynamic,
otherwise we'll end up having to increase the defines in the future.

> +#define TEMP_TYPE_PECI       6   /* Sensor type 6: Intel PECI */
> +#define CORE_INDEX_OFFSET    100 /* sysfs filename start offset for core temp */
> +#define DIMM_INDEX_OFFSET    200 /* sysfs filename start offset for DIMM temp */

Did you test with the "sensors" command to ensure that this works,
with the large gaps in index values ?

Overall, I am not very happy with the indexing. Since each sensor as
a label, it might be better to just make it dynamic.

> +#define TEMP_NAME_HEADER_LEN 4   /* sysfs temp type header length */
> +#define OF_DIMM_NUMS_DEFAULT 16  /* default dimm-nums setting */
> +
> +#define CORE_TEMP_ATTRS      5
> +#define DIMM_TEMP_ATTRS      2
> +#define ATTR_NAME_LEN        24
> +
> +#define UPDATE_INTERVAL_MIN  HZ
> +
> +enum sign_t {
> +	POS,
> +	NEG
> +};
> +
> +struct cpuinfo_t {
> +	bool valid;
> +	u32  dib;
> +	u8   cpuid;
> +	u8   platform_id;
> +	u32  microcode;
> +	u8   logical_thread_nums;
> +};
> +
> +struct temp_data_t {
> +	bool valid;
> +	s32  value;
> +	unsigned long last_updated;
> +};
> +
> +struct temp_group_t {
> +	struct temp_data_t tjmax;
> +	struct temp_data_t tcontrol;
> +	struct temp_data_t tthrottle;
> +	struct temp_data_t dts_margin;
> +	struct temp_data_t die;
> +	struct temp_data_t core[CORE_NUMS_MAX];
> +	struct temp_data_t dimm[DIMM_NUMS_MAX];
> +};
> +
> +struct core_temp_attr_group_t {
> +	struct sensor_device_attribute sd_attrs[CORE_NUMS_MAX][CORE_TEMP_ATTRS];
> +	char attr_name[CORE_NUMS_MAX][CORE_TEMP_ATTRS][ATTR_NAME_LEN];
> +	struct attribute *attrs[CORE_NUMS_MAX][CORE_TEMP_ATTRS + 1];
> +	struct attribute_group attr_group[CORE_NUMS_MAX];
> +};
> +
> +struct dimm_temp_attr_group_t {
> +	struct sensor_device_attribute sd_attrs[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS];
> +	char attr_name[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS][ATTR_NAME_LEN];
> +	struct attribute *attrs[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS + 1];
> +	struct attribute_group attr_group[DIMM_NUMS_MAX];
> +};
> +
> +struct peci_hwmon {
> +	struct device *dev;
> +	struct device *hwmon_dev;
> +	char name[NAME_MAX];
> +	const struct attribute_group **groups;
> +	struct cpuinfo_t cpuinfo;
> +	struct temp_group_t temp;
> +	u32 cpu_id;
> +	bool show_core;
> +	u32 core_nums;
> +	u32 dimm_nums;
> +	atomic_t core_group_created;
> +	struct core_temp_attr_group_t core;
> +	struct dimm_temp_attr_group_t dimm;
> +};
> +
> +enum label_t {
> +	L_DIE,
> +	L_DTS,
> +	L_TCONTROL,
> +	L_TTHROTTLE,
> +	L_MAX
> +};
> +
> +static const char *peci_label[L_MAX] = {
> +	"Die temperature\n",
> +	"DTS thermal margin to Tcontrol\n",
> +	"Tcontrol temperature\n",
> +	"Tthrottle temperature\n",

"temperature" is redundant for a temperature label.

> +};
> +
> +static DEFINE_MUTEX(peci_hwmon_lock);
> +
> +static int create_core_temp_group(struct peci_hwmon *priv, int core_no);

Please avoid forward declarations.

> +
> +

Please run your patches throuch checkpatch --strict and fix what it reports,
or provide a reason why you don't.

> +static int xfer_peci_msg(int cmd, void *pmsg)
> +{
> +	int rc;
> +
> +	mutex_lock(&peci_hwmon_lock);
> +	rc = peci_ioctl(NULL, cmd, (unsigned long)pmsg);
> +	mutex_unlock(&peci_hwmon_lock);
> +
> +	return rc;
> +}
> +
> +static int get_cpuinfo(struct peci_hwmon *priv)
> +{
> +	struct peci_get_dib_msg dib_msg;
> +	struct peci_rd_pkg_cfg_msg cfg_msg;
> +	int rc, i;
> +
> +	if (!priv->cpuinfo.valid) {
> +		dib_msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +
> +		rc = xfer_peci_msg(PECI_IOC_GET_DIB, (void *)&dib_msg);
> +		if (rc < 0)
> +			return rc;
> +
> +		priv->cpuinfo.dib = dib_msg.dib;
> +
> +		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +		cfg_msg.index = MBX_INDEX_CPU_ID;
> +		cfg_msg.param = 0;
> +		cfg_msg.rx_len = 4;
> +
> +		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
> +		if (rc < 0)
> +			return rc;
> +
> +		priv->cpuinfo.cpuid = cfg_msg.pkg_config[0];
> +
> +		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +		cfg_msg.index = MBX_INDEX_CPU_ID;
> +		cfg_msg.param = 1;
> +		cfg_msg.rx_len = 4;
> +
> +		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
> +		if (rc < 0)
> +			return rc;
> +
> +		priv->cpuinfo.platform_id = cfg_msg.pkg_config[0];
> +
> +		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +		cfg_msg.index = MBX_INDEX_CPU_ID;
> +		cfg_msg.param = 3;
> +		cfg_msg.rx_len = 4;
> +
> +		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
> +		if (rc < 0)
> +			return rc;
> +
> +		priv->cpuinfo.logical_thread_nums = cfg_msg.pkg_config[0] + 1;
> +
> +		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +		cfg_msg.index = MBX_INDEX_CPU_ID;
> +		cfg_msg.param = 4;
> +		cfg_msg.rx_len = 4;
> +
> +		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
> +		if (rc < 0)
> +			return rc;
> +
> +		priv->cpuinfo.microcode = (cfg_msg.pkg_config[3] << 24) |
> +					  (cfg_msg.pkg_config[2] << 16) |
> +					  (cfg_msg.pkg_config[1] << 8) |
> +					  cfg_msg.pkg_config[0];
> +
> +		priv->core_nums = priv->cpuinfo.logical_thread_nums / 2;

This seems to assume a 1:2 relationship between number of threads and
number of CPUs, which is incorrect.

> +
> +		if (priv->show_core &&
> +		    atomic_inc_return(&priv->core_group_created) == 1) {
> +			for (i = 0; i < priv->core_nums; i++) {
> +				rc = create_core_temp_group(priv, i);

This is messy. Sensor groups should be created before or during
hwmon registration, not at some arbitrary later time.

I don't know the logic behind this, but if it is supposed to track CPUs
coming online and going offline it is the wrong approach.

> +				if (rc != 0) {
> +					dev_err(priv->dev,
> +						"Failed to create core temp group\n");
> +					for (--i; i >= 0; i--) {
> +						sysfs_remove_group(
> +						     &priv->hwmon_dev->kobj,
> +						     &priv->core.attr_group[i]);
> +					}
> +					atomic_set(&priv->core_group_created,
> +						   0);
> +					return rc;
> +				}
> +			}
> +		}
> +
> +		priv->cpuinfo.valid = true;
> +	}
> +
> +	return 0;
> +}
> +
> +static int get_tjmax(struct peci_hwmon *priv)
> +{
> +	struct peci_rd_pkg_cfg_msg msg;
> +	int rc;
> +
> +	rc = get_cpuinfo(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	if (!priv->temp.tjmax.valid) {
> +		msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +		msg.index = MBX_INDEX_TEMP_TARGET;
> +		msg.param = 0;
> +		msg.rx_len = 4;
> +
> +		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
> +		if (rc < 0)
> +			return rc;
> +
> +		priv->temp.tjmax.value = (s32)msg.pkg_config[2] * 1000;
> +		priv->temp.tjmax.valid = true;
> +	}
> +
> +	return 0;
> +}
> +
> +static int get_tcontrol(struct peci_hwmon *priv)
> +{
> +	struct peci_rd_pkg_cfg_msg msg;
> +	s32 tcontrol_margin;
> +	int rc;
> +
> +	if (priv->temp.tcontrol.valid &&
> +	    time_before(jiffies, priv->temp.tcontrol.last_updated +
> +				 UPDATE_INTERVAL_MIN))
> +		return 0;
> +

Is the delay necessary ? Otherwise I would suggest to drop it.
It adds a lot of complexity to the driver. Also, if the user polls
values more often, that is presumably on purpose.

> +	rc = get_tjmax(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +	msg.index = MBX_INDEX_TEMP_TARGET;
> +	msg.param = 0;
> +	msg.rx_len = 4;
> +
> +	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
> +	if (rc < 0)
> +		return rc;
> +
> +	tcontrol_margin = msg.pkg_config[1];
> +	tcontrol_margin = ((tcontrol_margin ^ 0x80) - 0x80) * 1000;
> +
> +	priv->temp.tcontrol.value = priv->temp.tjmax.value - tcontrol_margin;
> +
> +	if (!priv->temp.tcontrol.valid) {
> +		priv->temp.tcontrol.last_updated = INITIAL_JIFFIES;
> +		priv->temp.tcontrol.valid = true;
> +	} else {
> +		priv->temp.tcontrol.last_updated = jiffies;
> +	}
> +
> +	return 0;
> +}
> +
> +static int get_tthrottle(struct peci_hwmon *priv)
> +{
> +	struct peci_rd_pkg_cfg_msg msg;
> +	s32 tthrottle_offset;
> +	int rc;
> +
> +	if (priv->temp.tthrottle.valid &&
> +	    time_before(jiffies, priv->temp.tthrottle.last_updated +
> +				 UPDATE_INTERVAL_MIN))
> +		return 0;
> +
> +	rc = get_tjmax(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +	msg.index = MBX_INDEX_TEMP_TARGET;
> +	msg.param = 0;
> +	msg.rx_len = 4;
> +
> +	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
> +	if (rc < 0)
> +		return rc;
> +
> +	tthrottle_offset = (msg.pkg_config[3] & 0x2f) * 1000;
> +	priv->temp.tthrottle.value = priv->temp.tjmax.value - tthrottle_offset;
> +
> +	if (!priv->temp.tthrottle.valid) {
> +		priv->temp.tthrottle.last_updated = INITIAL_JIFFIES;
> +		priv->temp.tthrottle.valid = true;
> +	} else {
> +		priv->temp.tthrottle.last_updated = jiffies;
> +	}
> +
> +	return 0;
> +}
> +
> +static int get_die_temp(struct peci_hwmon *priv)
> +{
> +	struct peci_get_temp_msg msg;
> +	int rc;
> +
> +	if (priv->temp.die.valid &&
> +	    time_before(jiffies, priv->temp.die.last_updated +
> +				 UPDATE_INTERVAL_MIN))
> +		return 0;
> +
> +	rc = get_tjmax(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +
> +	rc = xfer_peci_msg(PECI_IOC_GET_TEMP, (void *)&msg);
> +	if (rc < 0)
> +		return rc;
> +
> +	priv->temp.die.value = priv->temp.tjmax.value +
> +			       ((s32)msg.temp_raw * 1000 / 64);
> +
> +	if (!priv->temp.die.valid) {
> +		priv->temp.die.last_updated = INITIAL_JIFFIES;
> +		priv->temp.die.valid = true;
> +	} else {
> +		priv->temp.die.last_updated = jiffies;
> +	}
> +
> +	return 0;
> +}
> +
> +static int get_dts_margin(struct peci_hwmon *priv)
> +{
> +	struct peci_rd_pkg_cfg_msg msg;
> +	s32 dts_margin;
> +	int rc;
> +
> +	if (priv->temp.dts_margin.valid &&
> +	    time_before(jiffies, priv->temp.dts_margin.last_updated +
> +				 UPDATE_INTERVAL_MIN))
> +		return 0;
> +
Are all those values expected to change dynamically, or are some static ?
Static values do not have to be re-read repeatedly but can be cached
permanently.

> +	rc = get_cpuinfo(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +	msg.index = MBX_INDEX_DTS_MARGIN;
> +	msg.param = 0;
> +	msg.rx_len = 4;
> +
> +	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
> +	if (rc < 0)
> +		return rc;
> +
> +	dts_margin = (msg.pkg_config[1] << 8) | msg.pkg_config[0];
> +
> +	/*
> +	 * Processors return a value of DTS reading in 10.6 format
> +	 * (10 bits signed decimal, 6 bits fractional).
> +	 * Error codes:
> +	 *   0x8000: General sensor error
> +	 *   0x8001: Reserved
> +	 *   0x8002: Underflow on reading value
> +	 *   0x8003-0x81ff: Reserved
> +	 */
> +	if (dts_margin >= 0x8000 && dts_margin <= 0x81ff)
> +		return -1;
> +
> +	dts_margin = ((dts_margin ^ 0x8000) - 0x8000) * 1000 / 64;
> +
The above code is repeated several times. Please consider moving it
into a function to reduce duplication.

> +	priv->temp.dts_margin.value = dts_margin;
> +
> +	if (!priv->temp.dts_margin.valid) {
> +		priv->temp.dts_margin.last_updated = INITIAL_JIFFIES;
> +		priv->temp.dts_margin.valid = true;
> +	} else {
> +		priv->temp.dts_margin.last_updated = jiffies;
> +	}
> +
> +	return 0;
> +}
> +
> +static int get_core_temp(struct peci_hwmon *priv, int core_index)
> +{
> +	struct peci_rd_pkg_cfg_msg msg;
> +	s32 core_dts_margin;
> +	int rc;
> +
> +	if (priv->temp.core[core_index].valid &&
> +	    time_before(jiffies, priv->temp.core[core_index].last_updated +
> +				 UPDATE_INTERVAL_MIN))
> +		return 0;
> +
> +	rc = get_tjmax(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +	msg.index = MBX_INDEX_PER_CORE_DTS_TEMP;
> +	msg.param = core_index;
> +	msg.rx_len = 4;
> +
> +	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
> +	if (rc < 0)
> +		return rc;
> +
> +	core_dts_margin = (msg.pkg_config[1] << 8) | msg.pkg_config[0];
> +
> +	/*
> +	 * Processors return a value of the core DTS reading in 10.6 format
> +	 * (10 bits signed decimal, 6 bits fractional).
> +	 * Error codes:
> +	 *   0x8000: General sensor error
> +	 *   0x8001: Reserved
> +	 *   0x8002: Underflow on reading value
> +	 *   0x8003-0x81ff: Reserved
> +	 */
> +	if (core_dts_margin >= 0x8000 && core_dts_margin <= 0x81ff)
> +		return -1;
> +
> +	core_dts_margin = ((core_dts_margin ^ 0x8000) - 0x8000) * 1000 / 64;
> +
> +	priv->temp.core[core_index].value = priv->temp.tjmax.value +
> +					    core_dts_margin;
> +
> +	if (!priv->temp.core[core_index].valid) {
> +		priv->temp.core[core_index].last_updated = INITIAL_JIFFIES;
> +		priv->temp.core[core_index].valid = true;
> +	} else {
> +		priv->temp.core[core_index].last_updated = jiffies;
> +	}
> +
> +	return 0;
> +}
> +
> +static int get_dimm_temp(struct peci_hwmon *priv, int dimm_index)
> +{
> +	struct peci_rd_pkg_cfg_msg msg;
> +	int channel_rank = dimm_index / 2;
> +	int dimm_order = dimm_index % 2;
> +	int rc;
> +
> +	if (priv->temp.core[dimm_index].valid &&
> +	    time_before(jiffies, priv->temp.core[dimm_index].last_updated +
> +				 UPDATE_INTERVAL_MIN))
> +		return 0;
> +
> +	rc = get_cpuinfo(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
> +	msg.index = MBX_INDEX_DDR_DIMM_TEMP;
> +	msg.param = channel_rank;
> +	msg.rx_len = 4;
> +
> +	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
> +	if (rc < 0)
> +		return rc;
> +
> +	priv->temp.dimm[dimm_index].value = msg.pkg_config[dimm_order] * 1000;
> +
> +	if (!priv->temp.dimm[dimm_index].valid) {
> +		priv->temp.dimm[dimm_index].last_updated = INITIAL_JIFFIES;
> +		priv->temp.dimm[dimm_index].valid = true;
> +	} else {
> +		priv->temp.dimm[dimm_index].last_updated = jiffies;
> +	}
> +
> +	return 0;
> +}
> +
> +static ssize_t show_info(struct device *dev,
> +			 struct device_attribute *attr,
> +			 char *buf)
> +{
> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
> +	int rc;
> +
> +	rc = get_cpuinfo(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	return sprintf(buf, "dib         : 0x%08x\n"
> +			    "cpuid       : 0x%x\n"
> +			    "platform id : %d\n"
> +			    "stepping    : %d\n"
> +			    "microcode   : 0x%08x\n"
> +			    "logical thread nums : %d\n",
> +			    priv->cpuinfo.dib,
> +			    priv->cpuinfo.cpuid,
> +			    priv->cpuinfo.platform_id,
> +			    priv->cpuinfo.cpuid & 0xf,
> +			    priv->cpuinfo.microcode,
> +			    priv->cpuinfo.logical_thread_nums);
> +}

Please no non-standard attributes, much less attributes not following sysfs
attribute rules. If you want to display such information, consider using
debugfs. Besides, this information specifically appears to duplicate
the content of /proc/cpuid, which doesn't really add any value at all.

> +
> +static ssize_t show_tcontrol(struct device *dev,
> +			     struct device_attribute *attr,
> +			     char *buf)
> +{
> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
> +	int rc;
> +
> +	rc = get_tcontrol(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	return sprintf(buf, "%d\n", priv->temp.tcontrol.value);
> +}
> +
> +static ssize_t show_tcontrol_margin(struct device *dev,
> +				    struct device_attribute *attr,
> +				    char *buf)
> +{
> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
> +	int rc;
> +
> +	rc = get_tcontrol(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	return sprintf(buf, "%d\n", sensor_attr->index == POS ?
> +				    priv->temp.tjmax.value -
> +				    priv->temp.tcontrol.value :
> +				    priv->temp.tcontrol.value -
> +				    priv->temp.tjmax.value);
> +}
> +
> +static ssize_t show_tthrottle(struct device *dev,
> +			      struct device_attribute *attr,
> +			      char *buf)
> +{
> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
> +	int rc;
> +
> +	rc = get_tthrottle(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	return sprintf(buf, "%d\n", priv->temp.tthrottle.value);
> +}
> +
> +static ssize_t show_tjmax(struct device *dev,
> +			  struct device_attribute *attr,
> +			  char *buf)
> +{
> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
> +	int rc;
> +
> +	rc = get_tjmax(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	return sprintf(buf, "%d\n", priv->temp.tjmax.value);
> +}
> +
> +static ssize_t show_die_temp(struct device *dev,
> +			     struct device_attribute *attr,
> +			     char *buf)
> +{
> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
> +	int rc;
> +
> +	rc = get_die_temp(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	return sprintf(buf, "%d\n", priv->temp.die.value);
> +}
> +
> +static ssize_t show_dts_therm_margin(struct device *dev,
> +				     struct device_attribute *attr,
> +				     char *buf)
> +{
> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
> +	int rc;
> +
> +	rc = get_dts_margin(priv);
> +	if (rc < 0)
> +		return rc;
> +
> +	return sprintf(buf, "%d\n", priv->temp.dts_margin.value);
> +}
> +
> +static ssize_t show_core_temp(struct device *dev,
> +			      struct device_attribute *attr,
> +			      char *buf)
> +{
> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
> +	int core_index = sensor_attr->index;
> +	int rc;
> +
> +	rc = get_core_temp(priv, core_index);
> +	if (rc < 0)
> +		return rc;
> +
> +	return sprintf(buf, "%d\n", priv->temp.core[core_index].value);
> +}
> +
> +static ssize_t show_dimm_temp(struct device *dev,
> +			      struct device_attribute *attr,
> +			      char *buf)
> +{
> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
> +	int dimm_index = sensor_attr->index;
> +	int rc;
> +
> +	rc = get_dimm_temp(priv, dimm_index);
> +	if (rc < 0)
> +		return rc;
> +
> +	return sprintf(buf, "%d\n", priv->temp.dimm[dimm_index].value);
> +}
> +
> +static ssize_t show_value(struct device *dev,
> +			  struct device_attribute *attr,
> +			  char *buf)
> +{
> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
> +
> +	return sprintf(buf, "%d\n", sensor_attr->index);
> +}
> +
> +static ssize_t show_label(struct device *dev,
> +			  struct device_attribute *attr,
> +			  char *buf)
> +{
> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
> +
> +	return sprintf(buf, peci_label[sensor_attr->index]);
> +}
> +
> +static ssize_t show_core_label(struct device *dev,
> +			       struct device_attribute *attr,
> +			       char *buf)
> +{
> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
> +
> +	return sprintf(buf, "Core #%d temperature\n", sensor_attr->index);
> +}

Your label strings are quite long. How does that look like with the
sensors command ?

Plus, again, "temperature" in a temperature label is redundant.

> +
> +static ssize_t show_dimm_label(struct device *dev,
> +			       struct device_attribute *attr,
> +			       char *buf)
> +{
> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
> +
> +	char channel = 'A' + (sensor_attr->index / 2);
> +	int index = sensor_attr->index % 2;
> +
> +	return sprintf(buf, "Channel Rank %c DDR DIMM #%d temperature\n",
> +		       channel, index);
> +}
> +
> +/* Die temperature */
> +static SENSOR_DEVICE_ATTR(temp1_label, 0444, show_label, NULL, L_DIE);
> +static SENSOR_DEVICE_ATTR(temp1_input, 0444, show_die_temp, NULL, 0);
> +static SENSOR_DEVICE_ATTR(temp1_max, 0444, show_tcontrol, NULL, 0);
> +static SENSOR_DEVICE_ATTR(temp1_crit, 0444, show_tjmax, NULL, 0);
> +static SENSOR_DEVICE_ATTR(temp1_crit_hyst, 0444, show_tcontrol_margin, NULL,
> +			  POS);
> +
> +static struct attribute *die_temp_attrs[] = {
> +	&sensor_dev_attr_temp1_label.dev_attr.attr,
> +	&sensor_dev_attr_temp1_input.dev_attr.attr,
> +	&sensor_dev_attr_temp1_max.dev_attr.attr,
> +	&sensor_dev_attr_temp1_crit.dev_attr.attr,
> +	&sensor_dev_attr_temp1_crit_hyst.dev_attr.attr,
> +	NULL
> +};
> +
> +static const struct attribute_group die_temp_attr_group = {
> +	.attrs = die_temp_attrs,
> +};
> +
> +/* DTS thermal margin temperature */
> +static SENSOR_DEVICE_ATTR(temp2_label, 0444, show_label, NULL, L_DTS);
> +static SENSOR_DEVICE_ATTR(temp2_input, 0444, show_dts_therm_margin, NULL, 0);
> +static SENSOR_DEVICE_ATTR(temp2_min, 0444, show_value, NULL, 0);
> +static SENSOR_DEVICE_ATTR(temp2_lcrit, 0444, show_tcontrol_margin, NULL, NEG);
> +
> +static struct attribute *dts_margin_temp_attrs[] = {
> +	&sensor_dev_attr_temp2_label.dev_attr.attr,
> +	&sensor_dev_attr_temp2_input.dev_attr.attr,
> +	&sensor_dev_attr_temp2_min.dev_attr.attr,
> +	&sensor_dev_attr_temp2_lcrit.dev_attr.attr,
> +	NULL
> +};
> +
> +static const struct attribute_group dts_margin_temp_attr_group = {
> +	.attrs = dts_margin_temp_attrs,
> +};
> +
> +/* Tcontrol temperature */
> +static SENSOR_DEVICE_ATTR(temp3_label, 0444, show_label, NULL, L_TCONTROL);
> +static SENSOR_DEVICE_ATTR(temp3_input, 0444, show_tcontrol, NULL, 0);
> +static SENSOR_DEVICE_ATTR(temp3_crit, 0444, show_tjmax, NULL, 0);
> +
> +static struct attribute *tcontrol_temp_attrs[] = {
> +	&sensor_dev_attr_temp3_label.dev_attr.attr,
> +	&sensor_dev_attr_temp3_input.dev_attr.attr,
> +	&sensor_dev_attr_temp3_crit.dev_attr.attr,
> +	NULL
> +};
> +
> +static const struct attribute_group tcontrol_temp_attr_group = {
> +	.attrs = tcontrol_temp_attrs,
> +};
> +
> +/* Tthrottle temperature */
> +static SENSOR_DEVICE_ATTR(temp4_label, 0444, show_label, NULL, L_TTHROTTLE);
> +static SENSOR_DEVICE_ATTR(temp4_input, 0444, show_tthrottle, NULL, 0);
> +
> +static struct attribute *tthrottle_temp_attrs[] = {
> +	&sensor_dev_attr_temp4_label.dev_attr.attr,
> +	&sensor_dev_attr_temp4_input.dev_attr.attr,
> +	NULL
> +};
> +
> +static const struct attribute_group tthrottle_temp_attr_group = {
> +	.attrs = tthrottle_temp_attrs,
> +};
> +
> +/* CPU info */
> +static SENSOR_DEVICE_ATTR(info, 0444, show_info, NULL, 0);
> +
> +static struct attribute *info_attrs[] = {
> +	&sensor_dev_attr_info.dev_attr.attr,
> +	NULL
> +};
> +
> +static const struct attribute_group info_attr_group = {
> +	.attrs = info_attrs,
> +};
> +
> +const struct attribute_group *peci_hwmon_attr_groups[] = {
> +	&info_attr_group,
> +	&die_temp_attr_group,
> +	&dts_margin_temp_attr_group,
> +	&tcontrol_temp_attr_group,
> +	&tthrottle_temp_attr_group,
> +	NULL
> +};
> +
> +static ssize_t (*const core_show_fn[CORE_TEMP_ATTRS]) (struct device *dev,
> +		struct device_attribute *devattr, char *buf) = {
> +	show_core_label,
> +	show_core_temp,
> +	show_tcontrol,
> +	show_tjmax,
> +	show_tcontrol_margin,
> +};
> +
> +static const char *const core_suffix[CORE_TEMP_ATTRS] = {
> +	"label",
> +	"input",
> +	"max",
> +	"crit",
> +	"crit_hyst",
> +};
> +
> +static int create_core_temp_group(struct peci_hwmon *priv, int core_no)
> +{
> +	int i;
> +
> +	for (i = 0; i < CORE_TEMP_ATTRS; i++) {
> +		snprintf(priv->core.attr_name[core_no][i],
> +			 ATTR_NAME_LEN, "temp%d_%s",
> +			 CORE_INDEX_OFFSET + core_no, core_suffix[i]);
> +		sysfs_attr_init(
> +			    &priv->core.sd_attrs[core_no][i].dev_attr.attr);
> +		priv->core.sd_attrs[core_no][i].dev_attr.attr.name =
> +					       priv->core.attr_name[core_no][i];
> +		priv->core.sd_attrs[core_no][i].dev_attr.attr.mode = 0444;
> +		priv->core.sd_attrs[core_no][i].dev_attr.show = core_show_fn[i];
> +		if (i == 0 || i == 1) /* label or temp */
> +			priv->core.sd_attrs[core_no][i].index = core_no;
> +		priv->core.attrs[core_no][i] =
> +				 &priv->core.sd_attrs[core_no][i].dev_attr.attr;
> +	}
> +
> +	priv->core.attr_group[core_no].attrs = priv->core.attrs[core_no];
> +
> +	return sysfs_create_group(&priv->hwmon_dev->kobj,
> +				  &priv->core.attr_group[core_no]);
> +}
> +
> +static ssize_t (*const dimm_show_fn[DIMM_TEMP_ATTRS]) (struct device *dev,
> +		struct device_attribute *devattr, char *buf) = {
> +	show_dimm_label,
> +	show_dimm_temp,
> +};
> +
> +static const char *const dimm_suffix[DIMM_TEMP_ATTRS] = {
> +	"label",
> +	"input",
> +};
> +
> +static int create_dimm_temp_group(struct peci_hwmon *priv, int dimm_no)
> +{
> +	int i;
> +
> +	for (i = 0; i < DIMM_TEMP_ATTRS; i++) {
> +		snprintf(priv->dimm.attr_name[dimm_no][i],
> +			 ATTR_NAME_LEN, "temp%d_%s",
> +			 DIMM_INDEX_OFFSET + dimm_no, dimm_suffix[i]);
> +		sysfs_attr_init(&priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr);
> +		priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr.name =
> +					       priv->dimm.attr_name[dimm_no][i];
> +		priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr.mode = 0444;
> +		priv->dimm.sd_attrs[dimm_no][i].dev_attr.show = dimm_show_fn[i];
> +		priv->dimm.sd_attrs[dimm_no][i].index = dimm_no;
> +		priv->dimm.attrs[dimm_no][i] =
> +				 &priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr;
> +	}
> +
> +	priv->dimm.attr_group[dimm_no].attrs = priv->dimm.attrs[dimm_no];
> +
> +	return sysfs_create_group(&priv->hwmon_dev->kobj,
> +				  &priv->dimm.attr_group[dimm_no]);
> +}
> +
> +static int peci_hwmon_probe(struct platform_device *pdev)
> +{
> +	struct device *dev = &pdev->dev;
> +	struct device_node *np = dev->of_node;
> +	struct peci_hwmon *priv;
> +	struct device *hwmon;
> +	int rc, i;
> +
> +	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
> +	if (!priv)
> +		return -ENOMEM;
> +
> +	dev_set_drvdata(dev, priv);
> +	priv->dev = dev;
> +
> +	rc = of_property_read_u32(np, "cpu-id", &priv->cpu_id);

What entity determines cpu-id ?

> +	if (rc || priv->cpu_id >= CPU_ID_MAX) {
> +		dev_err(dev, "Invalid cpu-id configuration\n");
> +		return rc;
> +	}
> +
> +	rc = of_property_read_u32(np, "dimm-nums", &priv->dimm_nums);

This is an odd devicetree attribute. Normally the number of DIMMs
is dynamic. Isn't there a means to get all that information dynamically
instead of having to set it through devicetree ? What if someone adds
or removes a DIMM ? Who updates the devicetree ?

> +	if (rc || priv->dimm_nums > DIMM_NUMS_MAX) {
> +		dev_warn(dev, "Invalid dimm-nums : %u. Use default : %u\n",
> +			 priv->dimm_nums, OF_DIMM_NUMS_DEFAULT);
> +		priv->dimm_nums = OF_DIMM_NUMS_DEFAULT;
> +	}
> +
> +	priv->show_core = of_property_read_bool(np, "show-core");

This does not look like an appropriate devicetree attribute.

> +
> +	priv->groups = peci_hwmon_attr_groups;
> +

This assignment (and the ->groups variable) is quite pointless.

> +	snprintf(priv->name, NAME_MAX, HWMON_NAME ".cpu%d", priv->cpu_id);
> +
> +	hwmon = devm_hwmon_device_register_with_groups(dev,
> +						       priv->name,
> +						       priv, priv->groups);

Please rewrite the driver to use devm_hwmon_device_register_with_info(),
and avoid dynamic attributes.

> +
> +	rc = PTR_ERR_OR_ZERO(hwmon);
> +	if (rc != 0) {
> +		dev_err(dev, "Failed to register peci hwmon\n");
> +		return rc;
> +	}
> +
> +	priv->hwmon_dev = hwmon;

Something is logically wrong if you need to store hwmon_dev in the
private data structure. Specifically, creating attributes dynamically
after hwmon registration is wrong.

> +
> +	for (i = 0; i < priv->dimm_nums; i++) {
> +		rc = create_dimm_temp_group(priv, i);

No. See earlier comments. All attribute groups must be created during
registration (or before, but I am not inclined to accept a new driver
doing that).

> +		if (rc != 0) {
> +			dev_err(dev, "Failed to create dimm temp group\n");
> +			for (--i; i >= 0; i--) {
> +				sysfs_remove_group(&priv->hwmon_dev->kobj,
> +						   &priv->dimm.attr_group[i]);
> +			}
> +			return rc;
> +		}
> +	}
> +
> +	/*
> +	 * Try to create core temp group now. It will be created if CPU is
> +	 * curretnly online or it will be created after the first reading of
> +	 * cpuinfo from the online CPU otherwise.

This is not how CPUs are supposed to be detected, and it does not handle CPUs
taken offline. If the driver is instantiated as a CPU comes online, or as it
goes offline, the driver should use the appropriate kernel interfaces to
trigger that instantiation or removal. However, if so, it may be inappropriate
to associate CPU temperatures with other system temperatures in the same
instance of the driver; after all, those are all independent of each other.

Overall, I suspect that there should be a callback or some other mechanism
in the peci core to trigger instantiation and removal of this driver, and
I am not sure if any of the devicetree properties makes sense at all.

For example, if an instance of this driver is associated with a PECI
agent (with assorted CPU/DIMM temperature reporting), the instantiation
could be triggered as soon as the PECI core detects that the agent is
available, and the PECI core could report what exactly that instance
supports.

> +	 */
> +	if (priv->show_core)
> +		(void) get_cpuinfo(priv);
> +
> +	dev_info(dev, "peci hwmon for CPU#%d registered\n", priv->cpu_id);

Is this logging noise necessary ? Besides, some of it is redundant.

> +
> +	return rc;
> +}
> +
> +static int peci_hwmon_remove(struct platform_device *pdev)
> +{
> +	struct peci_hwmon *priv = dev_get_drvdata(&pdev->dev);
> +	int i;
> +
> +	if (atomic_read(&priv->core_group_created))
> +		for (i = 0; i < priv->core_nums; i++) {
> +			sysfs_remove_group(&priv->hwmon_dev->kobj,
> +					   &priv->core.attr_group[i]);
> +		}
> +
> +	for (i = 0; i < priv->dimm_nums; i++) {
> +		sysfs_remove_group(&priv->hwmon_dev->kobj,
> +				   &priv->dimm.attr_group[i]);
> +	}

If you need to call sysfs_remove_group from here,
something is conceptually wrong in your driver.

> +
> +	return 0;
> +}
> +
> +static const struct of_device_id peci_of_table[] = {
> +	{ .compatible = "peci-hwmon", },

This does not look like a reference to some piece of hardware.

> +	{ }
> +};
> +MODULE_DEVICE_TABLE(of, peci_of_table);
> +
> +static struct platform_driver peci_hwmon_driver = {
> +	.probe = peci_hwmon_probe,
> +	.remove = peci_hwmon_remove,
> +	.driver = {
> +		.name           = DEVICE_NAME,
> +		.of_match_table = peci_of_table,
> +	},
> +};
> +
> +module_platform_driver(peci_hwmon_driver);
> +MODULE_AUTHOR("Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>");
> +MODULE_DESCRIPTION("PECI hwmon driver");
> +MODULE_LICENSE("GPL v2");
Jae Hyun Yoo Jan. 10, 2018, 11:45 p.m. UTC | #3
On 1/10/2018 4:29 AM, Arnd Bergmann wrote:
> On Tue, Jan 9, 2018 at 11:31 PM, Jae Hyun Yoo
> <jae.hyun.yoo@linux.intel.com> wrote:
>> This commit adds driver implementation for a generic PECI hwmon.
>>
>> Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>
> 
>> +static int xfer_peci_msg(int cmd, void *pmsg)
>> +{
>> +       int rc;
>> +
>> +       mutex_lock(&peci_hwmon_lock);
>> +       rc = peci_ioctl(NULL, cmd, (unsigned long)pmsg);
>> +       mutex_unlock(&peci_hwmon_lock);
>> +
>> +       return rc;
>> +}
> 
> I said earlier that peci_ioctl() looked unused, that was obviously
> wrong, but what you have here
> is not a proper way to abstract a bus.
> 
> Maybe this can be done more like an i2c bus: make the peci controller
> a bus device
> and register all known target/index pairs as devices with the peci bus
> type, and have
> them probed from DT. The driver can then bind to each of those individually.
> Not sure if that is getting to granular at that point, I'd have to
> understand better
> how it is expected to get used, and what the variances are between
> implementations.
> 
>         Arnd
> 

Thanks for sharing your opinion. In fact, this was also suggested by 
openbmc community so I should consider of redesigning it. I'm currently 
thinking about adding a new PECI device class as an abstract layer and 
any BMC chipset specific driver could be attached to the PECI class 
driver. Then, each CPU client could be registered as an individual 
device as you suggested. Will consider your suggestion.

Thanks a lot!
Jae
Arnd Bergmann Jan. 11, 2018, 1:22 p.m. UTC | #4
On Thu, Jan 11, 2018 at 12:45 AM, Jae Hyun Yoo
<jae.hyun.yoo@linux.intel.com> wrote:
> On 1/10/2018 4:29 AM, Arnd Bergmann wrote:
>>
>> On Tue, Jan 9, 2018 at 11:31 PM, Jae Hyun Yoo
>> <jae.hyun.yoo@linux.intel.com> wrote:
>>>
>>> This commit adds driver implementation for a generic PECI hwmon.
>>>
>>> Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>
>>
>>
>>> +static int xfer_peci_msg(int cmd, void *pmsg)
>>> +{
>>> +       int rc;
>>> +
>>> +       mutex_lock(&peci_hwmon_lock);
>>> +       rc = peci_ioctl(NULL, cmd, (unsigned long)pmsg);
>>> +       mutex_unlock(&peci_hwmon_lock);
>>> +
>>> +       return rc;
>>> +}
>>
>>
>> I said earlier that peci_ioctl() looked unused, that was obviously
>> wrong, but what you have here
>> is not a proper way to abstract a bus.
>>
>> Maybe this can be done more like an i2c bus: make the peci controller
>> a bus device
>> and register all known target/index pairs as devices with the peci bus
>> type, and have
>> them probed from DT. The driver can then bind to each of those
>> individually.
>> Not sure if that is getting to granular at that point, I'd have to
>> understand better
>> how it is expected to get used, and what the variances are between
>> implementations.
>>
>
> Thanks for sharing your opinion. In fact, this was also suggested by openbmc
> community so I should consider of redesigning it. I'm currently thinking
> about adding a new PECI device class as an abstract layer and any BMC
> chipset specific driver could be attached to the PECI class driver. Then,
> each CPU client could be registered as an individual device as you
> suggested. Will consider your suggestion.

Another idea might be to pretend that PECI was I2C. We already have a few
drivers for hardware that is not I2C but whose software interface looks
similar enough that it just works. No idea if that is the case for PECI, but
xfer_peci_msg might be close enough to i2c_xfer to make it work. If you
are able to do that, then the PECI controller would just register itself
as an i2c controller and it can be accessed using /dev/i2c from user space
or a high-level i2c_driver.

      Arnd
Jae Hyun Yoo Jan. 11, 2018, 7:47 p.m. UTC | #5
On 1/10/2018 1:47 PM, Guenter Roeck wrote:
> On Tue, Jan 09, 2018 at 02:31:26PM -0800, Jae Hyun Yoo wrote:
>> This commit adds driver implementation for a generic PECI hwmon.
>>
>> Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>
>> ---
>>   drivers/hwmon/Kconfig      |   6 +
>>   drivers/hwmon/Makefile     |   1 +
>>   drivers/hwmon/peci-hwmon.c | 953 +++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 960 insertions(+)
>>   create mode 100644 drivers/hwmon/peci-hwmon.c
>>
>> diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
>> index 9256dd0..3a62c60 100644
>> --- a/drivers/hwmon/Kconfig
>> +++ b/drivers/hwmon/Kconfig
>> @@ -1234,6 +1234,12 @@ config SENSORS_NCT7904
>>   	  This driver can also be built as a module.  If so, the module
>>   	  will be called nct7904.
>>   
>> +config SENSORS_PECI_HWMON
>> +	tristate "PECI hwmon support"
>> +	depends on ASPEED_PECI
>> +	help
>> +	  If you say yes here you get support for the generic PECI hwmon driver.
>> +
>>   config SENSORS_NSA320
>>   	tristate "ZyXEL NSA320 and compatible fan speed and temperature sensors"
>>   	depends on GPIOLIB && OF
>> diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
>> index 98000fc..41d43a5 100644
>> --- a/drivers/hwmon/Makefile
>> +++ b/drivers/hwmon/Makefile
>> @@ -131,6 +131,7 @@ obj-$(CONFIG_SENSORS_NCT7802)	+= nct7802.o
>>   obj-$(CONFIG_SENSORS_NCT7904)	+= nct7904.o
>>   obj-$(CONFIG_SENSORS_NSA320)	+= nsa320-hwmon.o
>>   obj-$(CONFIG_SENSORS_NTC_THERMISTOR)	+= ntc_thermistor.o
>> +obj-$(CONFIG_SENSORS_PECI_HWMON)	+= peci-hwmon.o
>>   obj-$(CONFIG_SENSORS_PC87360)	+= pc87360.o
>>   obj-$(CONFIG_SENSORS_PC87427)	+= pc87427.o
>>   obj-$(CONFIG_SENSORS_PCF8591)	+= pcf8591.o
>> diff --git a/drivers/hwmon/peci-hwmon.c b/drivers/hwmon/peci-hwmon.c
>> new file mode 100644
>> index 0000000..2d2a288
>> --- /dev/null
>> +++ b/drivers/hwmon/peci-hwmon.c
>> @@ -0,0 +1,953 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +// Copyright (c) 2017 Intel Corporation
>> +
>> +#include <linux/delay.h>
>> +#include <linux/hwmon.h>
>> +#include <linux/hwmon-sysfs.h>
>> +#include <linux/jiffies.h>
>> +#include <linux/module.h>
>> +#include <linux/mutex.h>
>> +#include <linux/of_platform.h>
>> +#include <linux/of_device.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/syscalls.h>
>> +#include <misc/peci.h>
> 
> misc, not linux ? That seems wrong.
> 

You are right. I'll fix it.

>> +
>> +#define DEVICE_NAME "peci-hwmon"
>> +#define HWMON_NAME "peci_hwmon"
>> +
>> +#define CPU_ID_MAX           8   /* Max CPU number configured by socket ID */
>> +#define DIMM_NUMS_MAX        16  /* Max DIMM numbers (channel ranks x 2) */
>> +#define CORE_NUMS_MAX        28  /* Max core numbers (max on SKX Platinum) */
> 
> I won't insist, but it would be better if this were dynamic,
> otherwise we'll end up having to increase the defines in the future.
> 

Right. As you said, these values should be manually adjusted in the 
future if CPU architecture has been changed so better implement it as 
dynamic. I will check again a way of getting these values from client 
CPU thru PECI connection.

>> +#define TEMP_TYPE_PECI       6   /* Sensor type 6: Intel PECI */
>> +#define CORE_INDEX_OFFSET    100 /* sysfs filename start offset for core temp */
>> +#define DIMM_INDEX_OFFSET    200 /* sysfs filename start offset for DIMM temp */
> 
> Did you test with the "sensors" command to ensure that this works,
> with the large gaps in index values ?
> 
> Overall, I am not very happy with the indexing. Since each sensor as
> a label, it might be better to just make it dynamic.
> 

Okay, that makes sense. Since all attributes has its own label, indexing 
gap wouldn't be needed even in case of CPU architecture change happens. 
I'll remove the indexing gap.

>> +#define TEMP_NAME_HEADER_LEN 4   /* sysfs temp type header length */
>> +#define OF_DIMM_NUMS_DEFAULT 16  /* default dimm-nums setting */
>> +
>> +#define CORE_TEMP_ATTRS      5
>> +#define DIMM_TEMP_ATTRS      2
>> +#define ATTR_NAME_LEN        24
>> +
>> +#define UPDATE_INTERVAL_MIN  HZ
>> +
>> +enum sign_t {
>> +	POS,
>> +	NEG
>> +};
>> +
>> +struct cpuinfo_t {
>> +	bool valid;
>> +	u32  dib;
>> +	u8   cpuid;
>> +	u8   platform_id;
>> +	u32  microcode;
>> +	u8   logical_thread_nums;
>> +};
>> +
>> +struct temp_data_t {
>> +	bool valid;
>> +	s32  value;
>> +	unsigned long last_updated;
>> +};
>> +
>> +struct temp_group_t {
>> +	struct temp_data_t tjmax;
>> +	struct temp_data_t tcontrol;
>> +	struct temp_data_t tthrottle;
>> +	struct temp_data_t dts_margin;
>> +	struct temp_data_t die;
>> +	struct temp_data_t core[CORE_NUMS_MAX];
>> +	struct temp_data_t dimm[DIMM_NUMS_MAX];
>> +};
>> +
>> +struct core_temp_attr_group_t {
>> +	struct sensor_device_attribute sd_attrs[CORE_NUMS_MAX][CORE_TEMP_ATTRS];
>> +	char attr_name[CORE_NUMS_MAX][CORE_TEMP_ATTRS][ATTR_NAME_LEN];
>> +	struct attribute *attrs[CORE_NUMS_MAX][CORE_TEMP_ATTRS + 1];
>> +	struct attribute_group attr_group[CORE_NUMS_MAX];
>> +};
>> +
>> +struct dimm_temp_attr_group_t {
>> +	struct sensor_device_attribute sd_attrs[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS];
>> +	char attr_name[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS][ATTR_NAME_LEN];
>> +	struct attribute *attrs[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS + 1];
>> +	struct attribute_group attr_group[DIMM_NUMS_MAX];
>> +};
>> +
>> +struct peci_hwmon {
>> +	struct device *dev;
>> +	struct device *hwmon_dev;
>> +	char name[NAME_MAX];
>> +	const struct attribute_group **groups;
>> +	struct cpuinfo_t cpuinfo;
>> +	struct temp_group_t temp;
>> +	u32 cpu_id;
>> +	bool show_core;
>> +	u32 core_nums;
>> +	u32 dimm_nums;
>> +	atomic_t core_group_created;
>> +	struct core_temp_attr_group_t core;
>> +	struct dimm_temp_attr_group_t dimm;
>> +};
>> +
>> +enum label_t {
>> +	L_DIE,
>> +	L_DTS,
>> +	L_TCONTROL,
>> +	L_TTHROTTLE,
>> +	L_MAX
>> +};
>> +
>> +static const char *peci_label[L_MAX] = {
>> +	"Die temperature\n",
>> +	"DTS thermal margin to Tcontrol\n",
>> +	"Tcontrol temperature\n",
>> +	"Tthrottle temperature\n",
> 
> "temperature" is redundant for a temperature label.
> 

Agreed. Will remove the redundant string.

>> +};
>> +
>> +static DEFINE_MUTEX(peci_hwmon_lock);
>> +
>> +static int create_core_temp_group(struct peci_hwmon *priv, int core_no);
> 
> Please avoid forward declarations.
> 

Will fix it.

>> +
>> +
> 
> Please run your patches throuch checkpatch --strict and fix what it reports,
> or provide a reason why you don't.
> 

Thanks for the tip. I didn't know about the --strict option before. 
Checked that the option reports more check points such as this multiple 
blank line case. I will run all patches again using --strict option.

>> +static int xfer_peci_msg(int cmd, void *pmsg)
>> +{
>> +	int rc;
>> +
>> +	mutex_lock(&peci_hwmon_lock);
>> +	rc = peci_ioctl(NULL, cmd, (unsigned long)pmsg);
>> +	mutex_unlock(&peci_hwmon_lock);
>> +
>> +	return rc;
>> +}
>> +
>> +static int get_cpuinfo(struct peci_hwmon *priv)
>> +{
>> +	struct peci_get_dib_msg dib_msg;
>> +	struct peci_rd_pkg_cfg_msg cfg_msg;
>> +	int rc, i;
>> +
>> +	if (!priv->cpuinfo.valid) {
>> +		dib_msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +
>> +		rc = xfer_peci_msg(PECI_IOC_GET_DIB, (void *)&dib_msg);
>> +		if (rc < 0)
>> +			return rc;
>> +
>> +		priv->cpuinfo.dib = dib_msg.dib;
>> +
>> +		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +		cfg_msg.index = MBX_INDEX_CPU_ID;
>> +		cfg_msg.param = 0;
>> +		cfg_msg.rx_len = 4;
>> +
>> +		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
>> +		if (rc < 0)
>> +			return rc;
>> +
>> +		priv->cpuinfo.cpuid = cfg_msg.pkg_config[0];
>> +
>> +		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +		cfg_msg.index = MBX_INDEX_CPU_ID;
>> +		cfg_msg.param = 1;
>> +		cfg_msg.rx_len = 4;
>> +
>> +		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
>> +		if (rc < 0)
>> +			return rc;
>> +
>> +		priv->cpuinfo.platform_id = cfg_msg.pkg_config[0];
>> +
>> +		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +		cfg_msg.index = MBX_INDEX_CPU_ID;
>> +		cfg_msg.param = 3;
>> +		cfg_msg.rx_len = 4;
>> +
>> +		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
>> +		if (rc < 0)
>> +			return rc;
>> +
>> +		priv->cpuinfo.logical_thread_nums = cfg_msg.pkg_config[0] + 1;
>> +
>> +		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +		cfg_msg.index = MBX_INDEX_CPU_ID;
>> +		cfg_msg.param = 4;
>> +		cfg_msg.rx_len = 4;
>> +
>> +		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
>> +		if (rc < 0)
>> +			return rc;
>> +
>> +		priv->cpuinfo.microcode = (cfg_msg.pkg_config[3] << 24) |
>> +					  (cfg_msg.pkg_config[2] << 16) |
>> +					  (cfg_msg.pkg_config[1] << 8) |
>> +					  cfg_msg.pkg_config[0];
>> +
>> +		priv->core_nums = priv->cpuinfo.logical_thread_nums / 2;
> 
> This seems to assume a 1:2 relationship between number of threads and
> number of CPUs, which is incorrect.
> 

You are right. This can't cover all CPUs. Will find a proper way.

>> +
>> +		if (priv->show_core &&
>> +		    atomic_inc_return(&priv->core_group_created) == 1) {
>> +			for (i = 0; i < priv->core_nums; i++) {
>> +				rc = create_core_temp_group(priv, i);
> 
> This is messy. Sensor groups should be created before or during
> hwmon registration, not at some arbitrary later time.
> 
> I don't know the logic behind this, but if it is supposed to track CPUs
> coming online and going offline it is the wrong approach.
> 

Agreed. This driver wouldn't make communication with CPUs at all if CPUs 
are powered down so we don't need to leave this driver as inserted. As 
you commented below, if a agent does insert/remove this driver module 
while tracking CPU power state, this delayed creation logic wouldn't be 
needed. I will rewrite it.

>> +				if (rc != 0) {
>> +					dev_err(priv->dev,
>> +						"Failed to create core temp group\n");
>> +					for (--i; i >= 0; i--) {
>> +						sysfs_remove_group(
>> +						     &priv->hwmon_dev->kobj,
>> +						     &priv->core.attr_group[i]);
>> +					}
>> +					atomic_set(&priv->core_group_created,
>> +						   0);
>> +					return rc;
>> +				}
>> +			}
>> +		}
>> +
>> +		priv->cpuinfo.valid = true;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int get_tjmax(struct peci_hwmon *priv)
>> +{
>> +	struct peci_rd_pkg_cfg_msg msg;
>> +	int rc;
>> +
>> +	rc = get_cpuinfo(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	if (!priv->temp.tjmax.valid) {
>> +		msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +		msg.index = MBX_INDEX_TEMP_TARGET;
>> +		msg.param = 0;
>> +		msg.rx_len = 4;
>> +
>> +		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> +		if (rc < 0)
>> +			return rc;
>> +
>> +		priv->temp.tjmax.value = (s32)msg.pkg_config[2] * 1000;
>> +		priv->temp.tjmax.valid = true;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int get_tcontrol(struct peci_hwmon *priv)
>> +{
>> +	struct peci_rd_pkg_cfg_msg msg;
>> +	s32 tcontrol_margin;
>> +	int rc;
>> +
>> +	if (priv->temp.tcontrol.valid &&
>> +	    time_before(jiffies, priv->temp.tcontrol.last_updated +
>> +				 UPDATE_INTERVAL_MIN))
>> +		return 0;
>> +
> 
> Is the delay necessary ? Otherwise I would suggest to drop it.
> It adds a lot of complexity to the driver. Also, if the user polls
> values more often, that is presumably on purpose.
> 

I was intended to reduce traffic on PECI bus because it's low speed 
single wired bus, and temperature values don't change frequently because 
the value is sampled and averaged in CPU itself. I'll keep this.

>> +	rc = get_tjmax(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +	msg.index = MBX_INDEX_TEMP_TARGET;
>> +	msg.param = 0;
>> +	msg.rx_len = 4;
>> +
>> +	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	tcontrol_margin = msg.pkg_config[1];
>> +	tcontrol_margin = ((tcontrol_margin ^ 0x80) - 0x80) * 1000;
>> +
>> +	priv->temp.tcontrol.value = priv->temp.tjmax.value - tcontrol_margin;
>> +
>> +	if (!priv->temp.tcontrol.valid) {
>> +		priv->temp.tcontrol.last_updated = INITIAL_JIFFIES;
>> +		priv->temp.tcontrol.valid = true;
>> +	} else {
>> +		priv->temp.tcontrol.last_updated = jiffies;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int get_tthrottle(struct peci_hwmon *priv)
>> +{
>> +	struct peci_rd_pkg_cfg_msg msg;
>> +	s32 tthrottle_offset;
>> +	int rc;
>> +
>> +	if (priv->temp.tthrottle.valid &&
>> +	    time_before(jiffies, priv->temp.tthrottle.last_updated +
>> +				 UPDATE_INTERVAL_MIN))
>> +		return 0;
>> +
>> +	rc = get_tjmax(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +	msg.index = MBX_INDEX_TEMP_TARGET;
>> +	msg.param = 0;
>> +	msg.rx_len = 4;
>> +
>> +	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	tthrottle_offset = (msg.pkg_config[3] & 0x2f) * 1000;
>> +	priv->temp.tthrottle.value = priv->temp.tjmax.value - tthrottle_offset;
>> +
>> +	if (!priv->temp.tthrottle.valid) {
>> +		priv->temp.tthrottle.last_updated = INITIAL_JIFFIES;
>> +		priv->temp.tthrottle.valid = true;
>> +	} else {
>> +		priv->temp.tthrottle.last_updated = jiffies;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int get_die_temp(struct peci_hwmon *priv)
>> +{
>> +	struct peci_get_temp_msg msg;
>> +	int rc;
>> +
>> +	if (priv->temp.die.valid &&
>> +	    time_before(jiffies, priv->temp.die.last_updated +
>> +				 UPDATE_INTERVAL_MIN))
>> +		return 0;
>> +
>> +	rc = get_tjmax(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +
>> +	rc = xfer_peci_msg(PECI_IOC_GET_TEMP, (void *)&msg);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	priv->temp.die.value = priv->temp.tjmax.value +
>> +			       ((s32)msg.temp_raw * 1000 / 64);
>> +
>> +	if (!priv->temp.die.valid) {
>> +		priv->temp.die.last_updated = INITIAL_JIFFIES;
>> +		priv->temp.die.valid = true;
>> +	} else {
>> +		priv->temp.die.last_updated = jiffies;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int get_dts_margin(struct peci_hwmon *priv)
>> +{
>> +	struct peci_rd_pkg_cfg_msg msg;
>> +	s32 dts_margin;
>> +	int rc;
>> +
>> +	if (priv->temp.dts_margin.valid &&
>> +	    time_before(jiffies, priv->temp.dts_margin.last_updated +
>> +				 UPDATE_INTERVAL_MIN))
>> +		return 0;
>> +
> Are all those values expected to change dynamically, or are some static ?
> Static values do not have to be re-read repeatedly but can be cached
> permanently.
> 

All values expected to change dynamically except the Tjmax value which 
is fused in CPU package, but the Tjmax varies on each CPU package so we 
need to read the Tjmax at least once. Current implementation uses cached 
Tjmax value after the first reading on the value.

>> +	rc = get_cpuinfo(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +	msg.index = MBX_INDEX_DTS_MARGIN;
>> +	msg.param = 0;
>> +	msg.rx_len = 4;
>> +
>> +	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	dts_margin = (msg.pkg_config[1] << 8) | msg.pkg_config[0];
>> +
>> +	/*
>> +	 * Processors return a value of DTS reading in 10.6 format
>> +	 * (10 bits signed decimal, 6 bits fractional).
>> +	 * Error codes:
>> +	 *   0x8000: General sensor error
>> +	 *   0x8001: Reserved
>> +	 *   0x8002: Underflow on reading value
>> +	 *   0x8003-0x81ff: Reserved
>> +	 */
>> +	if (dts_margin >= 0x8000 && dts_margin <= 0x81ff)
>> +		return -1;
>> +
>> +	dts_margin = ((dts_margin ^ 0x8000) - 0x8000) * 1000 / 64;
>> +
> The above code is repeated several times. Please consider moving it
> into a function to reduce duplication.
> 

Okay. I will move it to a function.

>> +	priv->temp.dts_margin.value = dts_margin;
>> +
>> +	if (!priv->temp.dts_margin.valid) {
>> +		priv->temp.dts_margin.last_updated = INITIAL_JIFFIES;
>> +		priv->temp.dts_margin.valid = true;
>> +	} else {
>> +		priv->temp.dts_margin.last_updated = jiffies;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int get_core_temp(struct peci_hwmon *priv, int core_index)
>> +{
>> +	struct peci_rd_pkg_cfg_msg msg;
>> +	s32 core_dts_margin;
>> +	int rc;
>> +
>> +	if (priv->temp.core[core_index].valid &&
>> +	    time_before(jiffies, priv->temp.core[core_index].last_updated +
>> +				 UPDATE_INTERVAL_MIN))
>> +		return 0;
>> +
>> +	rc = get_tjmax(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +	msg.index = MBX_INDEX_PER_CORE_DTS_TEMP;
>> +	msg.param = core_index;
>> +	msg.rx_len = 4;
>> +
>> +	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	core_dts_margin = (msg.pkg_config[1] << 8) | msg.pkg_config[0];
>> +
>> +	/*
>> +	 * Processors return a value of the core DTS reading in 10.6 format
>> +	 * (10 bits signed decimal, 6 bits fractional).
>> +	 * Error codes:
>> +	 *   0x8000: General sensor error
>> +	 *   0x8001: Reserved
>> +	 *   0x8002: Underflow on reading value
>> +	 *   0x8003-0x81ff: Reserved
>> +	 */
>> +	if (core_dts_margin >= 0x8000 && core_dts_margin <= 0x81ff)
>> +		return -1;
>> +
>> +	core_dts_margin = ((core_dts_margin ^ 0x8000) - 0x8000) * 1000 / 64;
>> +
>> +	priv->temp.core[core_index].value = priv->temp.tjmax.value +
>> +					    core_dts_margin;
>> +
>> +	if (!priv->temp.core[core_index].valid) {
>> +		priv->temp.core[core_index].last_updated = INITIAL_JIFFIES;
>> +		priv->temp.core[core_index].valid = true;
>> +	} else {
>> +		priv->temp.core[core_index].last_updated = jiffies;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int get_dimm_temp(struct peci_hwmon *priv, int dimm_index)
>> +{
>> +	struct peci_rd_pkg_cfg_msg msg;
>> +	int channel_rank = dimm_index / 2;
>> +	int dimm_order = dimm_index % 2;
>> +	int rc;
>> +
>> +	if (priv->temp.core[dimm_index].valid &&
>> +	    time_before(jiffies, priv->temp.core[dimm_index].last_updated +
>> +				 UPDATE_INTERVAL_MIN))
>> +		return 0;
>> +
>> +	rc = get_cpuinfo(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +	msg.index = MBX_INDEX_DDR_DIMM_TEMP;
>> +	msg.param = channel_rank;
>> +	msg.rx_len = 4;
>> +
>> +	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	priv->temp.dimm[dimm_index].value = msg.pkg_config[dimm_order] * 1000;
>> +
>> +	if (!priv->temp.dimm[dimm_index].valid) {
>> +		priv->temp.dimm[dimm_index].last_updated = INITIAL_JIFFIES;
>> +		priv->temp.dimm[dimm_index].valid = true;
>> +	} else {
>> +		priv->temp.dimm[dimm_index].last_updated = jiffies;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static ssize_t show_info(struct device *dev,
>> +			 struct device_attribute *attr,
>> +			 char *buf)
>> +{
>> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
>> +	int rc;
>> +
>> +	rc = get_cpuinfo(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	return sprintf(buf, "dib         : 0x%08x\n"
>> +			    "cpuid       : 0x%x\n"
>> +			    "platform id : %d\n"
>> +			    "stepping    : %d\n"
>> +			    "microcode   : 0x%08x\n"
>> +			    "logical thread nums : %d\n",
>> +			    priv->cpuinfo.dib,
>> +			    priv->cpuinfo.cpuid,
>> +			    priv->cpuinfo.platform_id,
>> +			    priv->cpuinfo.cpuid & 0xf,
>> +			    priv->cpuinfo.microcode,
>> +			    priv->cpuinfo.logical_thread_nums);
>> +}
> 
> Please no non-standard attributes, much less attributes not following sysfs
> attribute rules. If you want to display such information, consider using
> debugfs. Besides, this information specifically appears to duplicate
> the content of /proc/cpuid, which doesn't really add any value at all.
> 

Got it. Will drop this non-standard attribute.

>> +
>> +static ssize_t show_tcontrol(struct device *dev,
>> +			     struct device_attribute *attr,
>> +			     char *buf)
>> +{
>> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
>> +	int rc;
>> +
>> +	rc = get_tcontrol(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	return sprintf(buf, "%d\n", priv->temp.tcontrol.value);
>> +}
>> +
>> +static ssize_t show_tcontrol_margin(struct device *dev,
>> +				    struct device_attribute *attr,
>> +				    char *buf)
>> +{
>> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
>> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +	int rc;
>> +
>> +	rc = get_tcontrol(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	return sprintf(buf, "%d\n", sensor_attr->index == POS ?
>> +				    priv->temp.tjmax.value -
>> +				    priv->temp.tcontrol.value :
>> +				    priv->temp.tcontrol.value -
>> +				    priv->temp.tjmax.value);
>> +}
>> +
>> +static ssize_t show_tthrottle(struct device *dev,
>> +			      struct device_attribute *attr,
>> +			      char *buf)
>> +{
>> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
>> +	int rc;
>> +
>> +	rc = get_tthrottle(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	return sprintf(buf, "%d\n", priv->temp.tthrottle.value);
>> +}
>> +
>> +static ssize_t show_tjmax(struct device *dev,
>> +			  struct device_attribute *attr,
>> +			  char *buf)
>> +{
>> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
>> +	int rc;
>> +
>> +	rc = get_tjmax(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	return sprintf(buf, "%d\n", priv->temp.tjmax.value);
>> +}
>> +
>> +static ssize_t show_die_temp(struct device *dev,
>> +			     struct device_attribute *attr,
>> +			     char *buf)
>> +{
>> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
>> +	int rc;
>> +
>> +	rc = get_die_temp(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	return sprintf(buf, "%d\n", priv->temp.die.value);
>> +}
>> +
>> +static ssize_t show_dts_therm_margin(struct device *dev,
>> +				     struct device_attribute *attr,
>> +				     char *buf)
>> +{
>> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
>> +	int rc;
>> +
>> +	rc = get_dts_margin(priv);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	return sprintf(buf, "%d\n", priv->temp.dts_margin.value);
>> +}
>> +
>> +static ssize_t show_core_temp(struct device *dev,
>> +			      struct device_attribute *attr,
>> +			      char *buf)
>> +{
>> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
>> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +	int core_index = sensor_attr->index;
>> +	int rc;
>> +
>> +	rc = get_core_temp(priv, core_index);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	return sprintf(buf, "%d\n", priv->temp.core[core_index].value);
>> +}
>> +
>> +static ssize_t show_dimm_temp(struct device *dev,
>> +			      struct device_attribute *attr,
>> +			      char *buf)
>> +{
>> +	struct peci_hwmon *priv = dev_get_drvdata(dev);
>> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +	int dimm_index = sensor_attr->index;
>> +	int rc;
>> +
>> +	rc = get_dimm_temp(priv, dimm_index);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	return sprintf(buf, "%d\n", priv->temp.dimm[dimm_index].value);
>> +}
>> +
>> +static ssize_t show_value(struct device *dev,
>> +			  struct device_attribute *attr,
>> +			  char *buf)
>> +{
>> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +
>> +	return sprintf(buf, "%d\n", sensor_attr->index);
>> +}
>> +
>> +static ssize_t show_label(struct device *dev,
>> +			  struct device_attribute *attr,
>> +			  char *buf)
>> +{
>> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +
>> +	return sprintf(buf, peci_label[sensor_attr->index]);
>> +}
>> +
>> +static ssize_t show_core_label(struct device *dev,
>> +			       struct device_attribute *attr,
>> +			       char *buf)
>> +{
>> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +
>> +	return sprintf(buf, "Core #%d temperature\n", sensor_attr->index);
>> +}
> 
> Your label strings are quite long. How does that look like with the
> sensors command ?
> 
> Plus, again, "temperature" in a temperature label is redundant.
> 

Agreed. Will remove the redundant string.

>> +
>> +static ssize_t show_dimm_label(struct device *dev,
>> +			       struct device_attribute *attr,
>> +			       char *buf)
>> +{
>> +	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +
>> +	char channel = 'A' + (sensor_attr->index / 2);
>> +	int index = sensor_attr->index % 2;
>> +
>> +	return sprintf(buf, "Channel Rank %c DDR DIMM #%d temperature\n",
>> +		       channel, index);
>> +}
>> +
>> +/* Die temperature */
>> +static SENSOR_DEVICE_ATTR(temp1_label, 0444, show_label, NULL, L_DIE);
>> +static SENSOR_DEVICE_ATTR(temp1_input, 0444, show_die_temp, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp1_max, 0444, show_tcontrol, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp1_crit, 0444, show_tjmax, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp1_crit_hyst, 0444, show_tcontrol_margin, NULL,
>> +			  POS);
>> +
>> +static struct attribute *die_temp_attrs[] = {
>> +	&sensor_dev_attr_temp1_label.dev_attr.attr,
>> +	&sensor_dev_attr_temp1_input.dev_attr.attr,
>> +	&sensor_dev_attr_temp1_max.dev_attr.attr,
>> +	&sensor_dev_attr_temp1_crit.dev_attr.attr,
>> +	&sensor_dev_attr_temp1_crit_hyst.dev_attr.attr,
>> +	NULL
>> +};
>> +
>> +static const struct attribute_group die_temp_attr_group = {
>> +	.attrs = die_temp_attrs,
>> +};
>> +
>> +/* DTS thermal margin temperature */
>> +static SENSOR_DEVICE_ATTR(temp2_label, 0444, show_label, NULL, L_DTS);
>> +static SENSOR_DEVICE_ATTR(temp2_input, 0444, show_dts_therm_margin, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp2_min, 0444, show_value, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp2_lcrit, 0444, show_tcontrol_margin, NULL, NEG);
>> +
>> +static struct attribute *dts_margin_temp_attrs[] = {
>> +	&sensor_dev_attr_temp2_label.dev_attr.attr,
>> +	&sensor_dev_attr_temp2_input.dev_attr.attr,
>> +	&sensor_dev_attr_temp2_min.dev_attr.attr,
>> +	&sensor_dev_attr_temp2_lcrit.dev_attr.attr,
>> +	NULL
>> +};
>> +
>> +static const struct attribute_group dts_margin_temp_attr_group = {
>> +	.attrs = dts_margin_temp_attrs,
>> +};
>> +
>> +/* Tcontrol temperature */
>> +static SENSOR_DEVICE_ATTR(temp3_label, 0444, show_label, NULL, L_TCONTROL);
>> +static SENSOR_DEVICE_ATTR(temp3_input, 0444, show_tcontrol, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp3_crit, 0444, show_tjmax, NULL, 0);
>> +
>> +static struct attribute *tcontrol_temp_attrs[] = {
>> +	&sensor_dev_attr_temp3_label.dev_attr.attr,
>> +	&sensor_dev_attr_temp3_input.dev_attr.attr,
>> +	&sensor_dev_attr_temp3_crit.dev_attr.attr,
>> +	NULL
>> +};
>> +
>> +static const struct attribute_group tcontrol_temp_attr_group = {
>> +	.attrs = tcontrol_temp_attrs,
>> +};
>> +
>> +/* Tthrottle temperature */
>> +static SENSOR_DEVICE_ATTR(temp4_label, 0444, show_label, NULL, L_TTHROTTLE);
>> +static SENSOR_DEVICE_ATTR(temp4_input, 0444, show_tthrottle, NULL, 0);
>> +
>> +static struct attribute *tthrottle_temp_attrs[] = {
>> +	&sensor_dev_attr_temp4_label.dev_attr.attr,
>> +	&sensor_dev_attr_temp4_input.dev_attr.attr,
>> +	NULL
>> +};
>> +
>> +static const struct attribute_group tthrottle_temp_attr_group = {
>> +	.attrs = tthrottle_temp_attrs,
>> +};
>> +
>> +/* CPU info */
>> +static SENSOR_DEVICE_ATTR(info, 0444, show_info, NULL, 0);
>> +
>> +static struct attribute *info_attrs[] = {
>> +	&sensor_dev_attr_info.dev_attr.attr,
>> +	NULL
>> +};
>> +
>> +static const struct attribute_group info_attr_group = {
>> +	.attrs = info_attrs,
>> +};
>> +
>> +const struct attribute_group *peci_hwmon_attr_groups[] = {
>> +	&info_attr_group,
>> +	&die_temp_attr_group,
>> +	&dts_margin_temp_attr_group,
>> +	&tcontrol_temp_attr_group,
>> +	&tthrottle_temp_attr_group,
>> +	NULL
>> +};
>> +
>> +static ssize_t (*const core_show_fn[CORE_TEMP_ATTRS]) (struct device *dev,
>> +		struct device_attribute *devattr, char *buf) = {
>> +	show_core_label,
>> +	show_core_temp,
>> +	show_tcontrol,
>> +	show_tjmax,
>> +	show_tcontrol_margin,
>> +};
>> +
>> +static const char *const core_suffix[CORE_TEMP_ATTRS] = {
>> +	"label",
>> +	"input",
>> +	"max",
>> +	"crit",
>> +	"crit_hyst",
>> +};
>> +
>> +static int create_core_temp_group(struct peci_hwmon *priv, int core_no)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < CORE_TEMP_ATTRS; i++) {
>> +		snprintf(priv->core.attr_name[core_no][i],
>> +			 ATTR_NAME_LEN, "temp%d_%s",
>> +			 CORE_INDEX_OFFSET + core_no, core_suffix[i]);
>> +		sysfs_attr_init(
>> +			    &priv->core.sd_attrs[core_no][i].dev_attr.attr);
>> +		priv->core.sd_attrs[core_no][i].dev_attr.attr.name =
>> +					       priv->core.attr_name[core_no][i];
>> +		priv->core.sd_attrs[core_no][i].dev_attr.attr.mode = 0444;
>> +		priv->core.sd_attrs[core_no][i].dev_attr.show = core_show_fn[i];
>> +		if (i == 0 || i == 1) /* label or temp */
>> +			priv->core.sd_attrs[core_no][i].index = core_no;
>> +		priv->core.attrs[core_no][i] =
>> +				 &priv->core.sd_attrs[core_no][i].dev_attr.attr;
>> +	}
>> +
>> +	priv->core.attr_group[core_no].attrs = priv->core.attrs[core_no];
>> +
>> +	return sysfs_create_group(&priv->hwmon_dev->kobj,
>> +				  &priv->core.attr_group[core_no]);
>> +}
>> +
>> +static ssize_t (*const dimm_show_fn[DIMM_TEMP_ATTRS]) (struct device *dev,
>> +		struct device_attribute *devattr, char *buf) = {
>> +	show_dimm_label,
>> +	show_dimm_temp,
>> +};
>> +
>> +static const char *const dimm_suffix[DIMM_TEMP_ATTRS] = {
>> +	"label",
>> +	"input",
>> +};
>> +
>> +static int create_dimm_temp_group(struct peci_hwmon *priv, int dimm_no)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < DIMM_TEMP_ATTRS; i++) {
>> +		snprintf(priv->dimm.attr_name[dimm_no][i],
>> +			 ATTR_NAME_LEN, "temp%d_%s",
>> +			 DIMM_INDEX_OFFSET + dimm_no, dimm_suffix[i]);
>> +		sysfs_attr_init(&priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr);
>> +		priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr.name =
>> +					       priv->dimm.attr_name[dimm_no][i];
>> +		priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr.mode = 0444;
>> +		priv->dimm.sd_attrs[dimm_no][i].dev_attr.show = dimm_show_fn[i];
>> +		priv->dimm.sd_attrs[dimm_no][i].index = dimm_no;
>> +		priv->dimm.attrs[dimm_no][i] =
>> +				 &priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr;
>> +	}
>> +
>> +	priv->dimm.attr_group[dimm_no].attrs = priv->dimm.attrs[dimm_no];
>> +
>> +	return sysfs_create_group(&priv->hwmon_dev->kobj,
>> +				  &priv->dimm.attr_group[dimm_no]);
>> +}
>> +
>> +static int peci_hwmon_probe(struct platform_device *pdev)
>> +{
>> +	struct device *dev = &pdev->dev;
>> +	struct device_node *np = dev->of_node;
>> +	struct peci_hwmon *priv;
>> +	struct device *hwmon;
>> +	int rc, i;
>> +
>> +	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
>> +	if (!priv)
>> +		return -ENOMEM;
>> +
>> +	dev_set_drvdata(dev, priv);
>> +	priv->dev = dev;
>> +
>> +	rc = of_property_read_u32(np, "cpu-id", &priv->cpu_id);
> 
> What entity determines cpu-id ?
> 

CPU ID numbering is determined by hardware SOCKET_ID strap pins. In this 
driver implementation, cpu-id is being used as CPU client indexing.

>> +	if (rc || priv->cpu_id >= CPU_ID_MAX) {
>> +		dev_err(dev, "Invalid cpu-id configuration\n");
>> +		return rc;
>> +	}
>> +
>> +	rc = of_property_read_u32(np, "dimm-nums", &priv->dimm_nums);
> 
> This is an odd devicetree attribute. Normally the number of DIMMs
> is dynamic. Isn't there a means to get all that information dynamically
> instead of having to set it through devicetree ? What if someone adds
> or removes a DIMM ? Who updates the devicetree ?
> 

It means the number of DIMM slots each CPU has, doesn't mean the number 
of currently installed DIMM components. If a DIMM is inserted a slot, 
CPU reports its actual temperature but on empty slot, CPU reports 0 
instead of reporting an error so it is the reason why this driver 
enumerates all DIMM slots' attribute.

>> +	if (rc || priv->dimm_nums > DIMM_NUMS_MAX) {
>> +		dev_warn(dev, "Invalid dimm-nums : %u. Use default : %u\n",
>> +			 priv->dimm_nums, OF_DIMM_NUMS_DEFAULT);
>> +		priv->dimm_nums = OF_DIMM_NUMS_DEFAULT;
>> +	}
>> +
>> +	priv->show_core = of_property_read_bool(np, "show-core");
> 
> This does not look like an appropriate devicetree attribute.
> 

Okay. I will remove this devicetree attribute and make this driver 
enable core temperature attributes always.

>> +
>> +	priv->groups = peci_hwmon_attr_groups;
>> +
> 
> This assignment (and the ->groups variable) is quite pointless.
> 

Will fix it.

>> +	snprintf(priv->name, NAME_MAX, HWMON_NAME ".cpu%d", priv->cpu_id);
>> +
>> +	hwmon = devm_hwmon_device_register_with_groups(dev,
>> +						       priv->name,
>> +						       priv, priv->groups);
> 
> Please rewrite the driver to use devm_hwmon_device_register_with_info(),
> and avoid dynamic attributes.
> 

I will rewrite it.

>> +
>> +	rc = PTR_ERR_OR_ZERO(hwmon);
>> +	if (rc != 0) {
>> +		dev_err(dev, "Failed to register peci hwmon\n");
>> +		return rc;
>> +	}
>> +
>> +	priv->hwmon_dev = hwmon;
> 
> Something is logically wrong if you need to store hwmon_dev in the
> private data structure. Specifically, creating attributes dynamically
> after hwmon registration is wrong.
> 
>> +
>> +	for (i = 0; i < priv->dimm_nums; i++) {
>> +		rc = create_dimm_temp_group(priv, i);
> 
> No. See earlier comments. All attribute groups must be created during
> registration (or before, but I am not inclined to accept a new driver
> doing that).
> 
>> +		if (rc != 0) {
>> +			dev_err(dev, "Failed to create dimm temp group\n");
>> +			for (--i; i >= 0; i--) {
>> +				sysfs_remove_group(&priv->hwmon_dev->kobj,
>> +						   &priv->dimm.attr_group[i]);
>> +			}
>> +			return rc;
>> +		}
>> +	}
>> +
>> +	/*
>> +	 * Try to create core temp group now. It will be created if CPU is
>> +	 * curretnly online or it will be created after the first reading of
>> +	 * cpuinfo from the online CPU otherwise.
> 
> This is not how CPUs are supposed to be detected, and it does not handle CPUs
> taken offline. If the driver is instantiated as a CPU comes online, or as it
> goes offline, the driver should use the appropriate kernel interfaces to
> trigger that instantiation or removal. However, if so, it may be inappropriate
> to associate CPU temperatures with other system temperatures in the same
> instance of the driver; after all, those are all independent of each other.
> 
> Overall, I suspect that there should be a callback or some other mechanism
> in the peci core to trigger instantiation and removal of this driver, and
> I am not sure if any of the devicetree properties makes sense at all.
> 
> For example, if an instance of this driver is associated with a PECI
> agent (with assorted CPU/DIMM temperature reporting), the instantiation
> could be triggered as soon as the PECI core detects that the agent is
> available, and the PECI core could report what exactly that instance
> supports.
> 

Thanks for sharing your detailed comment. In fact, PECI doesn't have any 
mechanism to get a callback for checking whether CPU is online or not. 
The only way PECI can do is, polling a CPU using the Ping PECI command. 
Also, a BMC controller can't make any PECI communication with offline 
CPU so this implementation uses delayed creation for some attributes 
which is messy as you said.

Like you suggested, a PECI agent could check CPU power state using some 
other mechanism and this driver module could be dynamically 
inserted/removed by the agent according to the CPU power state. This way 
could make all attributes creation possible at probing time so no need 
to use the delayed creation.

Also, I'll check feasibility of dynamic checking for maximum supportable 
numbers on each component type so that we can dynamically set the values 
as you suggested above.

>> +	 */
>> +	if (priv->show_core)
>> +		(void) get_cpuinfo(priv);
>> +
>> +	dev_info(dev, "peci hwmon for CPU#%d registered\n", priv->cpu_id);
> 
> Is this logging noise necessary ? Besides, some of it is redundant.
> 

No, it isn't necessary. I will remove it.

>> +
>> +	return rc;
>> +}
>> +
>> +static int peci_hwmon_remove(struct platform_device *pdev)
>> +{
>> +	struct peci_hwmon *priv = dev_get_drvdata(&pdev->dev);
>> +	int i;
>> +
>> +	if (atomic_read(&priv->core_group_created))
>> +		for (i = 0; i < priv->core_nums; i++) {
>> +			sysfs_remove_group(&priv->hwmon_dev->kobj,
>> +					   &priv->core.attr_group[i]);
>> +		}
>> +
>> +	for (i = 0; i < priv->dimm_nums; i++) {
>> +		sysfs_remove_group(&priv->hwmon_dev->kobj,
>> +				   &priv->dimm.attr_group[i]);
>> +	}
> 
> If you need to call sysfs_remove_group from here,
> something is conceptually wrong in your driver.
> 

I'll remove it while rewriting the driver.

>> +
>> +	return 0;
>> +}
>> +
>> +static const struct of_device_id peci_of_table[] = {
>> +	{ .compatible = "peci-hwmon", },
> 
> This does not look like a reference to some piece of hardware.
> 

This driver provides generic PECI hwmon function to which controller has 
PECI HW such as Aspeed or Nuvoton BMC chip so it's not dependant on a 
specific hardware. Should I remove this or any suggestion?

>> +	{ }
>> +};
>> +MODULE_DEVICE_TABLE(of, peci_of_table);
>> +
>> +static struct platform_driver peci_hwmon_driver = {
>> +	.probe = peci_hwmon_probe,
>> +	.remove = peci_hwmon_remove,
>> +	.driver = {
>> +		.name           = DEVICE_NAME,
>> +		.of_match_table = peci_of_table,
>> +	},
>> +};
>> +
>> +module_platform_driver(peci_hwmon_driver);
>> +MODULE_AUTHOR("Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>");
>> +MODULE_DESCRIPTION("PECI hwmon driver");
>> +MODULE_LICENSE("GPL v2");
Jae Hyun Yoo Jan. 11, 2018, 8:49 p.m. UTC | #6
On 1/11/2018 5:22 AM, Arnd Bergmann wrote:
> On Thu, Jan 11, 2018 at 12:45 AM, Jae Hyun Yoo
> <jae.hyun.yoo@linux.intel.com> wrote:
>> On 1/10/2018 4:29 AM, Arnd Bergmann wrote:
>>>
>>> On Tue, Jan 9, 2018 at 11:31 PM, Jae Hyun Yoo
>>> <jae.hyun.yoo@linux.intel.com> wrote:
>>>>
>>>> This commit adds driver implementation for a generic PECI hwmon.
>>>>
>>>> Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>
>>>
>>>
>>>> +static int xfer_peci_msg(int cmd, void *pmsg)
>>>> +{
>>>> +       int rc;
>>>> +
>>>> +       mutex_lock(&peci_hwmon_lock);
>>>> +       rc = peci_ioctl(NULL, cmd, (unsigned long)pmsg);
>>>> +       mutex_unlock(&peci_hwmon_lock);
>>>> +
>>>> +       return rc;
>>>> +}
>>>
>>>
>>> I said earlier that peci_ioctl() looked unused, that was obviously
>>> wrong, but what you have here
>>> is not a proper way to abstract a bus.
>>>
>>> Maybe this can be done more like an i2c bus: make the peci controller
>>> a bus device
>>> and register all known target/index pairs as devices with the peci bus
>>> type, and have
>>> them probed from DT. The driver can then bind to each of those
>>> individually.
>>> Not sure if that is getting to granular at that point, I'd have to
>>> understand better
>>> how it is expected to get used, and what the variances are between
>>> implementations.
>>>
>>
>> Thanks for sharing your opinion. In fact, this was also suggested by openbmc
>> community so I should consider of redesigning it. I'm currently thinking
>> about adding a new PECI device class as an abstract layer and any BMC
>> chipset specific driver could be attached to the PECI class driver. Then,
>> each CPU client could be registered as an individual device as you
>> suggested. Will consider your suggestion.
> 
> Another idea might be to pretend that PECI was I2C. We already have a few
> drivers for hardware that is not I2C but whose software interface looks
> similar enough that it just works. No idea if that is the case for PECI, but
> xfer_peci_msg might be close enough to i2c_xfer to make it work. If you
> are able to do that, then the PECI controller would just register itself
> as an i2c controller and it can be accessed using /dev/i2c from user space
> or a high-level i2c_driver.
> 
>        Arnd
> 

Thanks for the good idea. It looks like one of possible options. I'll 
check this idea as well. :)

Thanks,
Jae
Guenter Roeck Jan. 11, 2018, 9:40 p.m. UTC | #7
On Thu, Jan 11, 2018 at 11:47:01AM -0800, Jae Hyun Yoo wrote:
> On 1/10/2018 1:47 PM, Guenter Roeck wrote:
> >On Tue, Jan 09, 2018 at 02:31:26PM -0800, Jae Hyun Yoo wrote:
> >>This commit adds driver implementation for a generic PECI hwmon.
> >>
> >>Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>

[ ... ]

> >>+
> >>+	if (priv->temp.tcontrol.valid &&
> >>+	    time_before(jiffies, priv->temp.tcontrol.last_updated +
> >>+				 UPDATE_INTERVAL_MIN))
> >>+		return 0;
> >>+
> >
> >Is the delay necessary ? Otherwise I would suggest to drop it.
> >It adds a lot of complexity to the driver. Also, if the user polls
> >values more often, that is presumably on purpose.
> >
> 
> I was intended to reduce traffic on PECI bus because it's low speed single
> wired bus, and temperature values don't change frequently because the value
> is sampled and averaged in CPU itself. I'll keep this.
> 
Then please try to move the common code into a single function.

[ ... ]

> >>+
> >>+	rc = of_property_read_u32(np, "cpu-id", &priv->cpu_id);
> >
> >What entity determines cpu-id ?
> >
> 
> CPU ID numbering is determined by hardware SOCKET_ID strap pins. In this
> driver implementation, cpu-id is being used as CPU client indexing.
> 
Seems to me the necessary information to identify a given CPU should
be provided by the PECI core. Also, there are already "cpu" nodes
in devicetree which, if I recall correctly, may include information
such as CPU Ids.

> >>+	if (rc || priv->cpu_id >= CPU_ID_MAX) {
> >>+		dev_err(dev, "Invalid cpu-id configuration\n");
> >>+		return rc;
> >>+	}
> >>+
> >>+	rc = of_property_read_u32(np, "dimm-nums", &priv->dimm_nums);
> >
> >This is an odd devicetree attribute. Normally the number of DIMMs
> >is dynamic. Isn't there a means to get all that information dynamically
> >instead of having to set it through devicetree ? What if someone adds
> >or removes a DIMM ? Who updates the devicetree ?
> >
> 
> It means the number of DIMM slots each CPU has, doesn't mean the number of
> currently installed DIMM components. If a DIMM is inserted a slot, CPU
> reports its actual temperature but on empty slot, CPU reports 0 instead of
> reporting an error so it is the reason why this driver enumerates all DIMM
> slots' attribute.
> 
And there is no other means to get the number of DIMM slots per CPU ?
It just seems to be that this is the wrong location to provide such
information.

[ ... ]

> >>+
> >>+static const struct of_device_id peci_of_table[] = {
> >>+	{ .compatible = "peci-hwmon", },
> >
> >This does not look like a reference to some piece of hardware.
> >
> 
> This driver provides generic PECI hwmon function to which controller has
> PECI HW such as Aspeed or Nuvoton BMC chip so it's not dependant on a
> specific hardware. Should I remove this or any suggestion?
> 

I don't really know enough about the system to make a recommendation.
It seems to me that the PECI core should identify which functionality
it supports and instantiate the necessary driver(s). Maybe there should
be sub-nodes to the peci node with relevant information. Those sub-nodes
should specify the supported functionality in more detail, though -
such as indicating the supported CPU and/or DIMM sensors.

Guenter
Andrew Lunn Jan. 11, 2018, 10:18 p.m. UTC | #8
> > >>+static const struct of_device_id peci_of_table[] = {
> > >>+	{ .compatible = "peci-hwmon", },
> > >
> > >This does not look like a reference to some piece of hardware.
> > >
> > 
> > This driver provides generic PECI hwmon function to which controller has
> > PECI HW such as Aspeed or Nuvoton BMC chip so it's not dependant on a
> > specific hardware. Should I remove this or any suggestion?

PECI seems to be an Intel thing. So at least it should be

 { .compatible = "intel,peci-hwmon", }

assuming it is actually compatible with the Intel specification.

	 Andrew
Jae Hyun Yoo Jan. 11, 2018, 11:03 p.m. UTC | #9
On 1/11/2018 1:40 PM, Guenter Roeck wrote:
> On Thu, Jan 11, 2018 at 11:47:01AM -0800, Jae Hyun Yoo wrote:
>> On 1/10/2018 1:47 PM, Guenter Roeck wrote:
>>> On Tue, Jan 09, 2018 at 02:31:26PM -0800, Jae Hyun Yoo wrote:
>>>> This commit adds driver implementation for a generic PECI hwmon.
>>>>
>>>> Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>
> 
> [ ... ]
> 
>>>> +
>>>> +	if (priv->temp.tcontrol.valid &&
>>>> +	    time_before(jiffies, priv->temp.tcontrol.last_updated +
>>>> +				 UPDATE_INTERVAL_MIN))
>>>> +		return 0;
>>>> +
>>>
>>> Is the delay necessary ? Otherwise I would suggest to drop it.
>>> It adds a lot of complexity to the driver. Also, if the user polls
>>> values more often, that is presumably on purpose.
>>>
>>
>> I was intended to reduce traffic on PECI bus because it's low speed single
>> wired bus, and temperature values don't change frequently because the value
>> is sampled and averaged in CPU itself. I'll keep this.
>>
> Then please try to move the common code into a single function.
> 

That makes sense. I'll move the common code into a single function.

> [ ... ]
> 
>>>> +
>>>> +	rc = of_property_read_u32(np, "cpu-id", &priv->cpu_id);
>>>
>>> What entity determines cpu-id ?
>>>
>>
>> CPU ID numbering is determined by hardware SOCKET_ID strap pins. In this
>> driver implementation, cpu-id is being used as CPU client indexing.
>>
> Seems to me the necessary information to identify a given CPU should
> be provided by the PECI core. Also, there are already "cpu" nodes
> in devicetree which, if I recall correctly, may include information
> such as CPU Ids.
> 

This driver is implemented to support only BMC (Baseboard Management 
Controllers) chipset which is running on a separated linux kernel from a 
host server system. Through a PECI connection between them, this driver 
collects the host server system's CPU and DIMM temperature information 
which is running on a separated OS. That could be a linux, a Windows OS 
or any other OSes. I mean, there is no shared devicetree data between 
them so it is why the 'cpu_id' was added into dt configuration of this 
driver.

Using quite limited hardware connections such as PECI, KSC, eSPI and 
SMBus, the BMC manages the host server and this hwmon is one of features 
of BMC.

>>>> +	if (rc || priv->cpu_id >= CPU_ID_MAX) {
>>>> +		dev_err(dev, "Invalid cpu-id configuration\n");
>>>> +		return rc;
>>>> +	}
>>>> +
>>>> +	rc = of_property_read_u32(np, "dimm-nums", &priv->dimm_nums);
>>>
>>> This is an odd devicetree attribute. Normally the number of DIMMs
>>> is dynamic. Isn't there a means to get all that information dynamically
>>> instead of having to set it through devicetree ? What if someone adds
>>> or removes a DIMM ? Who updates the devicetree ?
>>>
>>
>> It means the number of DIMM slots each CPU has, doesn't mean the number of
>> currently installed DIMM components. If a DIMM is inserted a slot, CPU
>> reports its actual temperature but on empty slot, CPU reports 0 instead of
>> reporting an error so it is the reason why this driver enumerates all DIMM
>> slots' attribute.
>>
> And there is no other means to get the number of DIMM slots per CPU ?
> It just seems to be that this is the wrong location to provide such
> information.
> 

This devicetree attribute will be configured at runtime using dt overlay 
based on the host server's hardware configuration which will be parsed 
and managed by a userspace BMC service.

> [ ... ]
> 
>>>> +
>>>> +static const struct of_device_id peci_of_table[] = {
>>>> +	{ .compatible = "peci-hwmon", },
>>>
>>> This does not look like a reference to some piece of hardware.
>>>
>>
>> This driver provides generic PECI hwmon function to which controller has
>> PECI HW such as Aspeed or Nuvoton BMC chip so it's not dependant on a
>> specific hardware. Should I remove this or any suggestion?
>>
> 
> I don't really know enough about the system to make a recommendation.
> It seems to me that the PECI core should identify which functionality
> it supports and instantiate the necessary driver(s). Maybe there should
> be sub-nodes to the peci node with relevant information. Those sub-nodes
> should specify the supported functionality in more detail, though -
> such as indicating the supported CPU and/or DIMM sensors.
> 
> Guenter
> 

As I explained above, BMC and host server are running on separated OSes 
so this driver cannot be (actually, doesn't need to be) directly 
associated with other kernel modules in the BMC side OS for identifying 
the host server system's functionality. My thought is, this driver will 
use PECI only for identifying the host server's CPU and DIMM information 
and the userspace BMC service could deliver any additional host server 
information thru dt overlay if needed before the BMC service initiates 
this driver at runtime.

Thanks a lot,

Jae
Jae Hyun Yoo Jan. 11, 2018, 11:14 p.m. UTC | #10
On 1/11/2018 2:18 PM, Andrew Lunn wrote:
>>>>> +static const struct of_device_id peci_of_table[] = {
>>>>> +	{ .compatible = "peci-hwmon", },
>>>>
>>>> This does not look like a reference to some piece of hardware.
>>>>
>>>
>>> This driver provides generic PECI hwmon function to which controller has
>>> PECI HW such as Aspeed or Nuvoton BMC chip so it's not dependant on a
>>> specific hardware. Should I remove this or any suggestion?
> 
> PECI seems to be an Intel thing. So at least it should be
> 
>   { .compatible = "intel,peci-hwmon", }
> 
> assuming it is actually compatible with the Intel specification.
> 
> 	 Andrew
> 

Yes, PECI is an Intel thing but this driver is running on an ARM kernel 
on Aspeed or Nuvoton chipsets for now. This driver will be monitoring a 
host server's Intel CPU and DIMM which is running on a separated OS.

Thanks,
Jae
Andrew Lunn Jan. 11, 2018, 11:53 p.m. UTC | #11
On Thu, Jan 11, 2018 at 03:14:37PM -0800, Jae Hyun Yoo wrote:
> On 1/11/2018 2:18 PM, Andrew Lunn wrote:
> >>>>>+static const struct of_device_id peci_of_table[] = {
> >>>>>+	{ .compatible = "peci-hwmon", },
> >>>>
> >>>>This does not look like a reference to some piece of hardware.
> >>>>
> >>>
> >>>This driver provides generic PECI hwmon function to which controller has
> >>>PECI HW such as Aspeed or Nuvoton BMC chip so it's not dependant on a
> >>>specific hardware. Should I remove this or any suggestion?
> >
> >PECI seems to be an Intel thing. So at least it should be
> >
> >  { .compatible = "intel,peci-hwmon", }
> >
> >assuming it is actually compatible with the Intel specification.
> >
> >	 Andrew
> >
> 
> Yes, PECI is an Intel thing but this driver is running on an ARM kernel on
> Aspeed or Nuvoton chipsets for now. This driver will be monitoring a host
> server's Intel CPU and DIMM which is running on a separated OS.

Hi Jae

You need to be careful with the name then. You should not claim the
name 'peci' in case somebody actually implements a PECI driver which
is compatible with Intel PECI.

However, looking at other comments, it seems like this part is going
away, if you turn your code into a bus driver.

      Andrew
Jae Hyun Yoo Jan. 12, 2018, 12:26 a.m. UTC | #12
On 1/11/2018 3:53 PM, Andrew Lunn wrote:
> On Thu, Jan 11, 2018 at 03:14:37PM -0800, Jae Hyun Yoo wrote:
>> On 1/11/2018 2:18 PM, Andrew Lunn wrote:
>>>>>>> +static const struct of_device_id peci_of_table[] = {
>>>>>>> +	{ .compatible = "peci-hwmon", },
>>>>>>
>>>>>> This does not look like a reference to some piece of hardware.
>>>>>>
>>>>>
>>>>> This driver provides generic PECI hwmon function to which controller has
>>>>> PECI HW such as Aspeed or Nuvoton BMC chip so it's not dependant on a
>>>>> specific hardware. Should I remove this or any suggestion?
>>>
>>> PECI seems to be an Intel thing. So at least it should be
>>>
>>>   { .compatible = "intel,peci-hwmon", }
>>>
>>> assuming it is actually compatible with the Intel specification.
>>>
>>> 	 Andrew
>>>
>>
>> Yes, PECI is an Intel thing but this driver is running on an ARM kernel on
>> Aspeed or Nuvoton chipsets for now. This driver will be monitoring a host
>> server's Intel CPU and DIMM which is running on a separated OS.
> 
> Hi Jae
> 
> You need to be careful with the name then. You should not claim the
> name 'peci' in case somebody actually implements a PECI driver which
> is compatible with Intel PECI.
> 
> However, looking at other comments, it seems like this part is going
> away, if you turn your code into a bus driver.
> 
>        Andrew
> 

Hi Andrew,

I see. I'll keep that in mind and will keep finding if there is any 
better way. Thanks for sharing your thought.

Jae
diff mbox series

Patch

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 9256dd0..3a62c60 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -1234,6 +1234,12 @@  config SENSORS_NCT7904
 	  This driver can also be built as a module.  If so, the module
 	  will be called nct7904.
 
+config SENSORS_PECI_HWMON
+	tristate "PECI hwmon support"
+	depends on ASPEED_PECI
+	help
+	  If you say yes here you get support for the generic PECI hwmon driver.
+
 config SENSORS_NSA320
 	tristate "ZyXEL NSA320 and compatible fan speed and temperature sensors"
 	depends on GPIOLIB && OF
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index 98000fc..41d43a5 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -131,6 +131,7 @@  obj-$(CONFIG_SENSORS_NCT7802)	+= nct7802.o
 obj-$(CONFIG_SENSORS_NCT7904)	+= nct7904.o
 obj-$(CONFIG_SENSORS_NSA320)	+= nsa320-hwmon.o
 obj-$(CONFIG_SENSORS_NTC_THERMISTOR)	+= ntc_thermistor.o
+obj-$(CONFIG_SENSORS_PECI_HWMON)	+= peci-hwmon.o
 obj-$(CONFIG_SENSORS_PC87360)	+= pc87360.o
 obj-$(CONFIG_SENSORS_PC87427)	+= pc87427.o
 obj-$(CONFIG_SENSORS_PCF8591)	+= pcf8591.o
diff --git a/drivers/hwmon/peci-hwmon.c b/drivers/hwmon/peci-hwmon.c
new file mode 100644
index 0000000..2d2a288
--- /dev/null
+++ b/drivers/hwmon/peci-hwmon.c
@@ -0,0 +1,953 @@ 
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017 Intel Corporation
+
+#include <linux/delay.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of_platform.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/syscalls.h>
+#include <misc/peci.h>
+
+#define DEVICE_NAME "peci-hwmon"
+#define HWMON_NAME "peci_hwmon"
+
+#define CPU_ID_MAX           8   /* Max CPU number configured by socket ID */
+#define DIMM_NUMS_MAX        16  /* Max DIMM numbers (channel ranks x 2) */
+#define CORE_NUMS_MAX        28  /* Max core numbers (max on SKX Platinum) */
+#define TEMP_TYPE_PECI       6   /* Sensor type 6: Intel PECI */
+#define CORE_INDEX_OFFSET    100 /* sysfs filename start offset for core temp */
+#define DIMM_INDEX_OFFSET    200 /* sysfs filename start offset for DIMM temp */
+#define TEMP_NAME_HEADER_LEN 4   /* sysfs temp type header length */
+#define OF_DIMM_NUMS_DEFAULT 16  /* default dimm-nums setting */
+
+#define CORE_TEMP_ATTRS      5
+#define DIMM_TEMP_ATTRS      2
+#define ATTR_NAME_LEN        24
+
+#define UPDATE_INTERVAL_MIN  HZ
+
+enum sign_t {
+	POS,
+	NEG
+};
+
+struct cpuinfo_t {
+	bool valid;
+	u32  dib;
+	u8   cpuid;
+	u8   platform_id;
+	u32  microcode;
+	u8   logical_thread_nums;
+};
+
+struct temp_data_t {
+	bool valid;
+	s32  value;
+	unsigned long last_updated;
+};
+
+struct temp_group_t {
+	struct temp_data_t tjmax;
+	struct temp_data_t tcontrol;
+	struct temp_data_t tthrottle;
+	struct temp_data_t dts_margin;
+	struct temp_data_t die;
+	struct temp_data_t core[CORE_NUMS_MAX];
+	struct temp_data_t dimm[DIMM_NUMS_MAX];
+};
+
+struct core_temp_attr_group_t {
+	struct sensor_device_attribute sd_attrs[CORE_NUMS_MAX][CORE_TEMP_ATTRS];
+	char attr_name[CORE_NUMS_MAX][CORE_TEMP_ATTRS][ATTR_NAME_LEN];
+	struct attribute *attrs[CORE_NUMS_MAX][CORE_TEMP_ATTRS + 1];
+	struct attribute_group attr_group[CORE_NUMS_MAX];
+};
+
+struct dimm_temp_attr_group_t {
+	struct sensor_device_attribute sd_attrs[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS];
+	char attr_name[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS][ATTR_NAME_LEN];
+	struct attribute *attrs[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS + 1];
+	struct attribute_group attr_group[DIMM_NUMS_MAX];
+};
+
+struct peci_hwmon {
+	struct device *dev;
+	struct device *hwmon_dev;
+	char name[NAME_MAX];
+	const struct attribute_group **groups;
+	struct cpuinfo_t cpuinfo;
+	struct temp_group_t temp;
+	u32 cpu_id;
+	bool show_core;
+	u32 core_nums;
+	u32 dimm_nums;
+	atomic_t core_group_created;
+	struct core_temp_attr_group_t core;
+	struct dimm_temp_attr_group_t dimm;
+};
+
+enum label_t {
+	L_DIE,
+	L_DTS,
+	L_TCONTROL,
+	L_TTHROTTLE,
+	L_MAX
+};
+
+static const char *peci_label[L_MAX] = {
+	"Die temperature\n",
+	"DTS thermal margin to Tcontrol\n",
+	"Tcontrol temperature\n",
+	"Tthrottle temperature\n",
+};
+
+static DEFINE_MUTEX(peci_hwmon_lock);
+
+static int create_core_temp_group(struct peci_hwmon *priv, int core_no);
+
+
+static int xfer_peci_msg(int cmd, void *pmsg)
+{
+	int rc;
+
+	mutex_lock(&peci_hwmon_lock);
+	rc = peci_ioctl(NULL, cmd, (unsigned long)pmsg);
+	mutex_unlock(&peci_hwmon_lock);
+
+	return rc;
+}
+
+static int get_cpuinfo(struct peci_hwmon *priv)
+{
+	struct peci_get_dib_msg dib_msg;
+	struct peci_rd_pkg_cfg_msg cfg_msg;
+	int rc, i;
+
+	if (!priv->cpuinfo.valid) {
+		dib_msg.target = PECI_BASE_ADDR + priv->cpu_id;
+
+		rc = xfer_peci_msg(PECI_IOC_GET_DIB, (void *)&dib_msg);
+		if (rc < 0)
+			return rc;
+
+		priv->cpuinfo.dib = dib_msg.dib;
+
+		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
+		cfg_msg.index = MBX_INDEX_CPU_ID;
+		cfg_msg.param = 0;
+		cfg_msg.rx_len = 4;
+
+		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
+		if (rc < 0)
+			return rc;
+
+		priv->cpuinfo.cpuid = cfg_msg.pkg_config[0];
+
+		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
+		cfg_msg.index = MBX_INDEX_CPU_ID;
+		cfg_msg.param = 1;
+		cfg_msg.rx_len = 4;
+
+		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
+		if (rc < 0)
+			return rc;
+
+		priv->cpuinfo.platform_id = cfg_msg.pkg_config[0];
+
+		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
+		cfg_msg.index = MBX_INDEX_CPU_ID;
+		cfg_msg.param = 3;
+		cfg_msg.rx_len = 4;
+
+		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
+		if (rc < 0)
+			return rc;
+
+		priv->cpuinfo.logical_thread_nums = cfg_msg.pkg_config[0] + 1;
+
+		cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
+		cfg_msg.index = MBX_INDEX_CPU_ID;
+		cfg_msg.param = 4;
+		cfg_msg.rx_len = 4;
+
+		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
+		if (rc < 0)
+			return rc;
+
+		priv->cpuinfo.microcode = (cfg_msg.pkg_config[3] << 24) |
+					  (cfg_msg.pkg_config[2] << 16) |
+					  (cfg_msg.pkg_config[1] << 8) |
+					  cfg_msg.pkg_config[0];
+
+		priv->core_nums = priv->cpuinfo.logical_thread_nums / 2;
+
+		if (priv->show_core &&
+		    atomic_inc_return(&priv->core_group_created) == 1) {
+			for (i = 0; i < priv->core_nums; i++) {
+				rc = create_core_temp_group(priv, i);
+				if (rc != 0) {
+					dev_err(priv->dev,
+						"Failed to create core temp group\n");
+					for (--i; i >= 0; i--) {
+						sysfs_remove_group(
+						     &priv->hwmon_dev->kobj,
+						     &priv->core.attr_group[i]);
+					}
+					atomic_set(&priv->core_group_created,
+						   0);
+					return rc;
+				}
+			}
+		}
+
+		priv->cpuinfo.valid = true;
+	}
+
+	return 0;
+}
+
+static int get_tjmax(struct peci_hwmon *priv)
+{
+	struct peci_rd_pkg_cfg_msg msg;
+	int rc;
+
+	rc = get_cpuinfo(priv);
+	if (rc < 0)
+		return rc;
+
+	if (!priv->temp.tjmax.valid) {
+		msg.target = PECI_BASE_ADDR + priv->cpu_id;
+		msg.index = MBX_INDEX_TEMP_TARGET;
+		msg.param = 0;
+		msg.rx_len = 4;
+
+		rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+		if (rc < 0)
+			return rc;
+
+		priv->temp.tjmax.value = (s32)msg.pkg_config[2] * 1000;
+		priv->temp.tjmax.valid = true;
+	}
+
+	return 0;
+}
+
+static int get_tcontrol(struct peci_hwmon *priv)
+{
+	struct peci_rd_pkg_cfg_msg msg;
+	s32 tcontrol_margin;
+	int rc;
+
+	if (priv->temp.tcontrol.valid &&
+	    time_before(jiffies, priv->temp.tcontrol.last_updated +
+				 UPDATE_INTERVAL_MIN))
+		return 0;
+
+	rc = get_tjmax(priv);
+	if (rc < 0)
+		return rc;
+
+	msg.target = PECI_BASE_ADDR + priv->cpu_id;
+	msg.index = MBX_INDEX_TEMP_TARGET;
+	msg.param = 0;
+	msg.rx_len = 4;
+
+	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+	if (rc < 0)
+		return rc;
+
+	tcontrol_margin = msg.pkg_config[1];
+	tcontrol_margin = ((tcontrol_margin ^ 0x80) - 0x80) * 1000;
+
+	priv->temp.tcontrol.value = priv->temp.tjmax.value - tcontrol_margin;
+
+	if (!priv->temp.tcontrol.valid) {
+		priv->temp.tcontrol.last_updated = INITIAL_JIFFIES;
+		priv->temp.tcontrol.valid = true;
+	} else {
+		priv->temp.tcontrol.last_updated = jiffies;
+	}
+
+	return 0;
+}
+
+static int get_tthrottle(struct peci_hwmon *priv)
+{
+	struct peci_rd_pkg_cfg_msg msg;
+	s32 tthrottle_offset;
+	int rc;
+
+	if (priv->temp.tthrottle.valid &&
+	    time_before(jiffies, priv->temp.tthrottle.last_updated +
+				 UPDATE_INTERVAL_MIN))
+		return 0;
+
+	rc = get_tjmax(priv);
+	if (rc < 0)
+		return rc;
+
+	msg.target = PECI_BASE_ADDR + priv->cpu_id;
+	msg.index = MBX_INDEX_TEMP_TARGET;
+	msg.param = 0;
+	msg.rx_len = 4;
+
+	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+	if (rc < 0)
+		return rc;
+
+	tthrottle_offset = (msg.pkg_config[3] & 0x2f) * 1000;
+	priv->temp.tthrottle.value = priv->temp.tjmax.value - tthrottle_offset;
+
+	if (!priv->temp.tthrottle.valid) {
+		priv->temp.tthrottle.last_updated = INITIAL_JIFFIES;
+		priv->temp.tthrottle.valid = true;
+	} else {
+		priv->temp.tthrottle.last_updated = jiffies;
+	}
+
+	return 0;
+}
+
+static int get_die_temp(struct peci_hwmon *priv)
+{
+	struct peci_get_temp_msg msg;
+	int rc;
+
+	if (priv->temp.die.valid &&
+	    time_before(jiffies, priv->temp.die.last_updated +
+				 UPDATE_INTERVAL_MIN))
+		return 0;
+
+	rc = get_tjmax(priv);
+	if (rc < 0)
+		return rc;
+
+	msg.target = PECI_BASE_ADDR + priv->cpu_id;
+
+	rc = xfer_peci_msg(PECI_IOC_GET_TEMP, (void *)&msg);
+	if (rc < 0)
+		return rc;
+
+	priv->temp.die.value = priv->temp.tjmax.value +
+			       ((s32)msg.temp_raw * 1000 / 64);
+
+	if (!priv->temp.die.valid) {
+		priv->temp.die.last_updated = INITIAL_JIFFIES;
+		priv->temp.die.valid = true;
+	} else {
+		priv->temp.die.last_updated = jiffies;
+	}
+
+	return 0;
+}
+
+static int get_dts_margin(struct peci_hwmon *priv)
+{
+	struct peci_rd_pkg_cfg_msg msg;
+	s32 dts_margin;
+	int rc;
+
+	if (priv->temp.dts_margin.valid &&
+	    time_before(jiffies, priv->temp.dts_margin.last_updated +
+				 UPDATE_INTERVAL_MIN))
+		return 0;
+
+	rc = get_cpuinfo(priv);
+	if (rc < 0)
+		return rc;
+
+	msg.target = PECI_BASE_ADDR + priv->cpu_id;
+	msg.index = MBX_INDEX_DTS_MARGIN;
+	msg.param = 0;
+	msg.rx_len = 4;
+
+	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+	if (rc < 0)
+		return rc;
+
+	dts_margin = (msg.pkg_config[1] << 8) | msg.pkg_config[0];
+
+	/*
+	 * Processors return a value of DTS reading in 10.6 format
+	 * (10 bits signed decimal, 6 bits fractional).
+	 * Error codes:
+	 *   0x8000: General sensor error
+	 *   0x8001: Reserved
+	 *   0x8002: Underflow on reading value
+	 *   0x8003-0x81ff: Reserved
+	 */
+	if (dts_margin >= 0x8000 && dts_margin <= 0x81ff)
+		return -1;
+
+	dts_margin = ((dts_margin ^ 0x8000) - 0x8000) * 1000 / 64;
+
+	priv->temp.dts_margin.value = dts_margin;
+
+	if (!priv->temp.dts_margin.valid) {
+		priv->temp.dts_margin.last_updated = INITIAL_JIFFIES;
+		priv->temp.dts_margin.valid = true;
+	} else {
+		priv->temp.dts_margin.last_updated = jiffies;
+	}
+
+	return 0;
+}
+
+static int get_core_temp(struct peci_hwmon *priv, int core_index)
+{
+	struct peci_rd_pkg_cfg_msg msg;
+	s32 core_dts_margin;
+	int rc;
+
+	if (priv->temp.core[core_index].valid &&
+	    time_before(jiffies, priv->temp.core[core_index].last_updated +
+				 UPDATE_INTERVAL_MIN))
+		return 0;
+
+	rc = get_tjmax(priv);
+	if (rc < 0)
+		return rc;
+
+	msg.target = PECI_BASE_ADDR + priv->cpu_id;
+	msg.index = MBX_INDEX_PER_CORE_DTS_TEMP;
+	msg.param = core_index;
+	msg.rx_len = 4;
+
+	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+	if (rc < 0)
+		return rc;
+
+	core_dts_margin = (msg.pkg_config[1] << 8) | msg.pkg_config[0];
+
+	/*
+	 * Processors return a value of the core DTS reading in 10.6 format
+	 * (10 bits signed decimal, 6 bits fractional).
+	 * Error codes:
+	 *   0x8000: General sensor error
+	 *   0x8001: Reserved
+	 *   0x8002: Underflow on reading value
+	 *   0x8003-0x81ff: Reserved
+	 */
+	if (core_dts_margin >= 0x8000 && core_dts_margin <= 0x81ff)
+		return -1;
+
+	core_dts_margin = ((core_dts_margin ^ 0x8000) - 0x8000) * 1000 / 64;
+
+	priv->temp.core[core_index].value = priv->temp.tjmax.value +
+					    core_dts_margin;
+
+	if (!priv->temp.core[core_index].valid) {
+		priv->temp.core[core_index].last_updated = INITIAL_JIFFIES;
+		priv->temp.core[core_index].valid = true;
+	} else {
+		priv->temp.core[core_index].last_updated = jiffies;
+	}
+
+	return 0;
+}
+
+static int get_dimm_temp(struct peci_hwmon *priv, int dimm_index)
+{
+	struct peci_rd_pkg_cfg_msg msg;
+	int channel_rank = dimm_index / 2;
+	int dimm_order = dimm_index % 2;
+	int rc;
+
+	if (priv->temp.core[dimm_index].valid &&
+	    time_before(jiffies, priv->temp.core[dimm_index].last_updated +
+				 UPDATE_INTERVAL_MIN))
+		return 0;
+
+	rc = get_cpuinfo(priv);
+	if (rc < 0)
+		return rc;
+
+	msg.target = PECI_BASE_ADDR + priv->cpu_id;
+	msg.index = MBX_INDEX_DDR_DIMM_TEMP;
+	msg.param = channel_rank;
+	msg.rx_len = 4;
+
+	rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+	if (rc < 0)
+		return rc;
+
+	priv->temp.dimm[dimm_index].value = msg.pkg_config[dimm_order] * 1000;
+
+	if (!priv->temp.dimm[dimm_index].valid) {
+		priv->temp.dimm[dimm_index].last_updated = INITIAL_JIFFIES;
+		priv->temp.dimm[dimm_index].valid = true;
+	} else {
+		priv->temp.dimm[dimm_index].last_updated = jiffies;
+	}
+
+	return 0;
+}
+
+static ssize_t show_info(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct peci_hwmon *priv = dev_get_drvdata(dev);
+	int rc;
+
+	rc = get_cpuinfo(priv);
+	if (rc < 0)
+		return rc;
+
+	return sprintf(buf, "dib         : 0x%08x\n"
+			    "cpuid       : 0x%x\n"
+			    "platform id : %d\n"
+			    "stepping    : %d\n"
+			    "microcode   : 0x%08x\n"
+			    "logical thread nums : %d\n",
+			    priv->cpuinfo.dib,
+			    priv->cpuinfo.cpuid,
+			    priv->cpuinfo.platform_id,
+			    priv->cpuinfo.cpuid & 0xf,
+			    priv->cpuinfo.microcode,
+			    priv->cpuinfo.logical_thread_nums);
+}
+
+static ssize_t show_tcontrol(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct peci_hwmon *priv = dev_get_drvdata(dev);
+	int rc;
+
+	rc = get_tcontrol(priv);
+	if (rc < 0)
+		return rc;
+
+	return sprintf(buf, "%d\n", priv->temp.tcontrol.value);
+}
+
+static ssize_t show_tcontrol_margin(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct peci_hwmon *priv = dev_get_drvdata(dev);
+	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+	int rc;
+
+	rc = get_tcontrol(priv);
+	if (rc < 0)
+		return rc;
+
+	return sprintf(buf, "%d\n", sensor_attr->index == POS ?
+				    priv->temp.tjmax.value -
+				    priv->temp.tcontrol.value :
+				    priv->temp.tcontrol.value -
+				    priv->temp.tjmax.value);
+}
+
+static ssize_t show_tthrottle(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct peci_hwmon *priv = dev_get_drvdata(dev);
+	int rc;
+
+	rc = get_tthrottle(priv);
+	if (rc < 0)
+		return rc;
+
+	return sprintf(buf, "%d\n", priv->temp.tthrottle.value);
+}
+
+static ssize_t show_tjmax(struct device *dev,
+			  struct device_attribute *attr,
+			  char *buf)
+{
+	struct peci_hwmon *priv = dev_get_drvdata(dev);
+	int rc;
+
+	rc = get_tjmax(priv);
+	if (rc < 0)
+		return rc;
+
+	return sprintf(buf, "%d\n", priv->temp.tjmax.value);
+}
+
+static ssize_t show_die_temp(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct peci_hwmon *priv = dev_get_drvdata(dev);
+	int rc;
+
+	rc = get_die_temp(priv);
+	if (rc < 0)
+		return rc;
+
+	return sprintf(buf, "%d\n", priv->temp.die.value);
+}
+
+static ssize_t show_dts_therm_margin(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct peci_hwmon *priv = dev_get_drvdata(dev);
+	int rc;
+
+	rc = get_dts_margin(priv);
+	if (rc < 0)
+		return rc;
+
+	return sprintf(buf, "%d\n", priv->temp.dts_margin.value);
+}
+
+static ssize_t show_core_temp(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct peci_hwmon *priv = dev_get_drvdata(dev);
+	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+	int core_index = sensor_attr->index;
+	int rc;
+
+	rc = get_core_temp(priv, core_index);
+	if (rc < 0)
+		return rc;
+
+	return sprintf(buf, "%d\n", priv->temp.core[core_index].value);
+}
+
+static ssize_t show_dimm_temp(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct peci_hwmon *priv = dev_get_drvdata(dev);
+	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+	int dimm_index = sensor_attr->index;
+	int rc;
+
+	rc = get_dimm_temp(priv, dimm_index);
+	if (rc < 0)
+		return rc;
+
+	return sprintf(buf, "%d\n", priv->temp.dimm[dimm_index].value);
+}
+
+static ssize_t show_value(struct device *dev,
+			  struct device_attribute *attr,
+			  char *buf)
+{
+	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+
+	return sprintf(buf, "%d\n", sensor_attr->index);
+}
+
+static ssize_t show_label(struct device *dev,
+			  struct device_attribute *attr,
+			  char *buf)
+{
+	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+
+	return sprintf(buf, peci_label[sensor_attr->index]);
+}
+
+static ssize_t show_core_label(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+
+	return sprintf(buf, "Core #%d temperature\n", sensor_attr->index);
+}
+
+static ssize_t show_dimm_label(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+
+	char channel = 'A' + (sensor_attr->index / 2);
+	int index = sensor_attr->index % 2;
+
+	return sprintf(buf, "Channel Rank %c DDR DIMM #%d temperature\n",
+		       channel, index);
+}
+
+/* Die temperature */
+static SENSOR_DEVICE_ATTR(temp1_label, 0444, show_label, NULL, L_DIE);
+static SENSOR_DEVICE_ATTR(temp1_input, 0444, show_die_temp, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp1_max, 0444, show_tcontrol, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp1_crit, 0444, show_tjmax, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp1_crit_hyst, 0444, show_tcontrol_margin, NULL,
+			  POS);
+
+static struct attribute *die_temp_attrs[] = {
+	&sensor_dev_attr_temp1_label.dev_attr.attr,
+	&sensor_dev_attr_temp1_input.dev_attr.attr,
+	&sensor_dev_attr_temp1_max.dev_attr.attr,
+	&sensor_dev_attr_temp1_crit.dev_attr.attr,
+	&sensor_dev_attr_temp1_crit_hyst.dev_attr.attr,
+	NULL
+};
+
+static const struct attribute_group die_temp_attr_group = {
+	.attrs = die_temp_attrs,
+};
+
+/* DTS thermal margin temperature */
+static SENSOR_DEVICE_ATTR(temp2_label, 0444, show_label, NULL, L_DTS);
+static SENSOR_DEVICE_ATTR(temp2_input, 0444, show_dts_therm_margin, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp2_min, 0444, show_value, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp2_lcrit, 0444, show_tcontrol_margin, NULL, NEG);
+
+static struct attribute *dts_margin_temp_attrs[] = {
+	&sensor_dev_attr_temp2_label.dev_attr.attr,
+	&sensor_dev_attr_temp2_input.dev_attr.attr,
+	&sensor_dev_attr_temp2_min.dev_attr.attr,
+	&sensor_dev_attr_temp2_lcrit.dev_attr.attr,
+	NULL
+};
+
+static const struct attribute_group dts_margin_temp_attr_group = {
+	.attrs = dts_margin_temp_attrs,
+};
+
+/* Tcontrol temperature */
+static SENSOR_DEVICE_ATTR(temp3_label, 0444, show_label, NULL, L_TCONTROL);
+static SENSOR_DEVICE_ATTR(temp3_input, 0444, show_tcontrol, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp3_crit, 0444, show_tjmax, NULL, 0);
+
+static struct attribute *tcontrol_temp_attrs[] = {
+	&sensor_dev_attr_temp3_label.dev_attr.attr,
+	&sensor_dev_attr_temp3_input.dev_attr.attr,
+	&sensor_dev_attr_temp3_crit.dev_attr.attr,
+	NULL
+};
+
+static const struct attribute_group tcontrol_temp_attr_group = {
+	.attrs = tcontrol_temp_attrs,
+};
+
+/* Tthrottle temperature */
+static SENSOR_DEVICE_ATTR(temp4_label, 0444, show_label, NULL, L_TTHROTTLE);
+static SENSOR_DEVICE_ATTR(temp4_input, 0444, show_tthrottle, NULL, 0);
+
+static struct attribute *tthrottle_temp_attrs[] = {
+	&sensor_dev_attr_temp4_label.dev_attr.attr,
+	&sensor_dev_attr_temp4_input.dev_attr.attr,
+	NULL
+};
+
+static const struct attribute_group tthrottle_temp_attr_group = {
+	.attrs = tthrottle_temp_attrs,
+};
+
+/* CPU info */
+static SENSOR_DEVICE_ATTR(info, 0444, show_info, NULL, 0);
+
+static struct attribute *info_attrs[] = {
+	&sensor_dev_attr_info.dev_attr.attr,
+	NULL
+};
+
+static const struct attribute_group info_attr_group = {
+	.attrs = info_attrs,
+};
+
+const struct attribute_group *peci_hwmon_attr_groups[] = {
+	&info_attr_group,
+	&die_temp_attr_group,
+	&dts_margin_temp_attr_group,
+	&tcontrol_temp_attr_group,
+	&tthrottle_temp_attr_group,
+	NULL
+};
+
+static ssize_t (*const core_show_fn[CORE_TEMP_ATTRS]) (struct device *dev,
+		struct device_attribute *devattr, char *buf) = {
+	show_core_label,
+	show_core_temp,
+	show_tcontrol,
+	show_tjmax,
+	show_tcontrol_margin,
+};
+
+static const char *const core_suffix[CORE_TEMP_ATTRS] = {
+	"label",
+	"input",
+	"max",
+	"crit",
+	"crit_hyst",
+};
+
+static int create_core_temp_group(struct peci_hwmon *priv, int core_no)
+{
+	int i;
+
+	for (i = 0; i < CORE_TEMP_ATTRS; i++) {
+		snprintf(priv->core.attr_name[core_no][i],
+			 ATTR_NAME_LEN, "temp%d_%s",
+			 CORE_INDEX_OFFSET + core_no, core_suffix[i]);
+		sysfs_attr_init(
+			    &priv->core.sd_attrs[core_no][i].dev_attr.attr);
+		priv->core.sd_attrs[core_no][i].dev_attr.attr.name =
+					       priv->core.attr_name[core_no][i];
+		priv->core.sd_attrs[core_no][i].dev_attr.attr.mode = 0444;
+		priv->core.sd_attrs[core_no][i].dev_attr.show = core_show_fn[i];
+		if (i == 0 || i == 1) /* label or temp */
+			priv->core.sd_attrs[core_no][i].index = core_no;
+		priv->core.attrs[core_no][i] =
+				 &priv->core.sd_attrs[core_no][i].dev_attr.attr;
+	}
+
+	priv->core.attr_group[core_no].attrs = priv->core.attrs[core_no];
+
+	return sysfs_create_group(&priv->hwmon_dev->kobj,
+				  &priv->core.attr_group[core_no]);
+}
+
+static ssize_t (*const dimm_show_fn[DIMM_TEMP_ATTRS]) (struct device *dev,
+		struct device_attribute *devattr, char *buf) = {
+	show_dimm_label,
+	show_dimm_temp,
+};
+
+static const char *const dimm_suffix[DIMM_TEMP_ATTRS] = {
+	"label",
+	"input",
+};
+
+static int create_dimm_temp_group(struct peci_hwmon *priv, int dimm_no)
+{
+	int i;
+
+	for (i = 0; i < DIMM_TEMP_ATTRS; i++) {
+		snprintf(priv->dimm.attr_name[dimm_no][i],
+			 ATTR_NAME_LEN, "temp%d_%s",
+			 DIMM_INDEX_OFFSET + dimm_no, dimm_suffix[i]);
+		sysfs_attr_init(&priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr);
+		priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr.name =
+					       priv->dimm.attr_name[dimm_no][i];
+		priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr.mode = 0444;
+		priv->dimm.sd_attrs[dimm_no][i].dev_attr.show = dimm_show_fn[i];
+		priv->dimm.sd_attrs[dimm_no][i].index = dimm_no;
+		priv->dimm.attrs[dimm_no][i] =
+				 &priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr;
+	}
+
+	priv->dimm.attr_group[dimm_no].attrs = priv->dimm.attrs[dimm_no];
+
+	return sysfs_create_group(&priv->hwmon_dev->kobj,
+				  &priv->dimm.attr_group[dimm_no]);
+}
+
+static int peci_hwmon_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct device_node *np = dev->of_node;
+	struct peci_hwmon *priv;
+	struct device *hwmon;
+	int rc, i;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	dev_set_drvdata(dev, priv);
+	priv->dev = dev;
+
+	rc = of_property_read_u32(np, "cpu-id", &priv->cpu_id);
+	if (rc || priv->cpu_id >= CPU_ID_MAX) {
+		dev_err(dev, "Invalid cpu-id configuration\n");
+		return rc;
+	}
+
+	rc = of_property_read_u32(np, "dimm-nums", &priv->dimm_nums);
+	if (rc || priv->dimm_nums > DIMM_NUMS_MAX) {
+		dev_warn(dev, "Invalid dimm-nums : %u. Use default : %u\n",
+			 priv->dimm_nums, OF_DIMM_NUMS_DEFAULT);
+		priv->dimm_nums = OF_DIMM_NUMS_DEFAULT;
+	}
+
+	priv->show_core = of_property_read_bool(np, "show-core");
+
+	priv->groups = peci_hwmon_attr_groups;
+
+	snprintf(priv->name, NAME_MAX, HWMON_NAME ".cpu%d", priv->cpu_id);
+
+	hwmon = devm_hwmon_device_register_with_groups(dev,
+						       priv->name,
+						       priv, priv->groups);
+
+	rc = PTR_ERR_OR_ZERO(hwmon);
+	if (rc != 0) {
+		dev_err(dev, "Failed to register peci hwmon\n");
+		return rc;
+	}
+
+	priv->hwmon_dev = hwmon;
+
+	for (i = 0; i < priv->dimm_nums; i++) {
+		rc = create_dimm_temp_group(priv, i);
+		if (rc != 0) {
+			dev_err(dev, "Failed to create dimm temp group\n");
+			for (--i; i >= 0; i--) {
+				sysfs_remove_group(&priv->hwmon_dev->kobj,
+						   &priv->dimm.attr_group[i]);
+			}
+			return rc;
+		}
+	}
+
+	/*
+	 * Try to create core temp group now. It will be created if CPU is
+	 * curretnly online or it will be created after the first reading of
+	 * cpuinfo from the online CPU otherwise.
+	 */
+	if (priv->show_core)
+		(void) get_cpuinfo(priv);
+
+	dev_info(dev, "peci hwmon for CPU#%d registered\n", priv->cpu_id);
+
+	return rc;
+}
+
+static int peci_hwmon_remove(struct platform_device *pdev)
+{
+	struct peci_hwmon *priv = dev_get_drvdata(&pdev->dev);
+	int i;
+
+	if (atomic_read(&priv->core_group_created))
+		for (i = 0; i < priv->core_nums; i++) {
+			sysfs_remove_group(&priv->hwmon_dev->kobj,
+					   &priv->core.attr_group[i]);
+		}
+
+	for (i = 0; i < priv->dimm_nums; i++) {
+		sysfs_remove_group(&priv->hwmon_dev->kobj,
+				   &priv->dimm.attr_group[i]);
+	}
+
+	return 0;
+}
+
+static const struct of_device_id peci_of_table[] = {
+	{ .compatible = "peci-hwmon", },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, peci_of_table);
+
+static struct platform_driver peci_hwmon_driver = {
+	.probe = peci_hwmon_probe,
+	.remove = peci_hwmon_remove,
+	.driver = {
+		.name           = DEVICE_NAME,
+		.of_match_table = peci_of_table,
+	},
+};
+
+module_platform_driver(peci_hwmon_driver);
+MODULE_AUTHOR("Jae Hyun Yoo <jae.hyun.yoo@linux.intel.com>");
+MODULE_DESCRIPTION("PECI hwmon driver");
+MODULE_LICENSE("GPL v2");