diff mbox series

[v2,3/3] occ: Add support for GPU presence detection

Message ID 5344ef39f38b70a6c994654bc9f86f362ac858a7.1529466573.git-series.andrew.donnellan@au1.ibm.com
State Accepted
Headers show
Series occ: GPU presence detection | expand

Commit Message

Andrew Donnellan June 20, 2018, 3:49 a.m. UTC
On the Witherspoon platform, we need to distinguish between NVLink GPUs and
OpenCAPI accelerators. In order to do this, we first need to find out
whether the SXM2 socket is populated.

On Witherspoon, the SXM2 socket's presence detection pin is only visible
via I2C from the APSS, and thus can only be exposed to the host via the
OCC. The OCC, per OCC Firmware Interface Specification for POWER9 version
0.22, now exposes this to skiboot through a field in the dynamic data
shared memory.

Add the necessary dynamic data changes required to read the version and
GPU presence fields. Add a function, occ_get_gpu_presence(), that can be
used to check GPU presence.

If the OCC isn't reporting presence (old OCC firmware, or some other
reason), we default to assuming there is a device present and wait for
link training to fail.

This will be used in later patches to fix up the NPU2 probe path for
OpenCAPI support on Witherspoon.

Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
---
 hw/occ.c      | 23 ++++++++++++++++++++---
 include/occ.h |  4 ++++
 2 files changed, 24 insertions(+), 3 deletions(-)

Comments

Shilpasri G Bhat June 20, 2018, 5:40 a.m. UTC | #1
On 06/20/2018 09:19 AM, Andrew Donnellan wrote:
> On the Witherspoon platform, we need to distinguish between NVLink GPUs and
> OpenCAPI accelerators. In order to do this, we first need to find out
> whether the SXM2 socket is populated.
> 
> On Witherspoon, the SXM2 socket's presence detection pin is only visible
> via I2C from the APSS, and thus can only be exposed to the host via the
> OCC. The OCC, per OCC Firmware Interface Specification for POWER9 version
> 0.22, now exposes this to skiboot through a field in the dynamic data
> shared memory.
> 
> Add the necessary dynamic data changes required to read the version and
> GPU presence fields. Add a function, occ_get_gpu_presence(), that can be
> used to check GPU presence.
> 
> If the OCC isn't reporting presence (old OCC firmware, or some other
> reason), we default to assuming there is a device present and wait until
> link training to fail.
> 
> This will be used in later patches to fix up the NPU2 probe path for
> OpenCAPI support on Witherspoon.
> 
> Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

Reviewed-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>

> ---
>  hw/occ.c      | 23 ++++++++++++++++++++---
>  include/occ.h |  4 ++++
>  2 files changed, 24 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/occ.c b/hw/occ.c
> index fc95d3926bb0..10b2de07dd7a 100644
> --- a/hw/occ.c
> +++ b/hw/occ.c
> @@ -229,10 +229,10 @@ struct occ_response_buffer {
>   */
>  struct occ_dynamic_data {
>  	u8 occ_state;
> +	u8 major_version;
> +	u8 minor_version;
> +	u8 gpus_present;
>  	u8 spare1;
> -	u8 spare2;
> -	u8 spare3;
> -	u8 spare4;
>  	u8 cpu_throttle;
>  	u8 mem_throttle;
>  	u8 quick_pwr_drop;
> @@ -1230,6 +1230,23 @@ exit:
>  	unlock(&chip->queue_lock);
>  }
>  
> +bool occ_get_gpu_presence(struct proc_chip *chip, int gpu_num)
> +{
> +	struct occ_dynamic_data *ddata;
> +
> +	assert(gpu_num <= 2);
> +
> +	ddata = get_occ_dynamic_data(chip);
> +
> +	if (ddata->major_version != 0 || ddata->minor_version < 1) {
> +		prlog(PR_INFO, "OCC: OCC not reporting GPU slot presence, "
> +		      "assuming device is present\n");
> +		return true;
> +	}
> +
> +	return (bool)(ddata->gpus_present & 1 << gpu_num);
> +}
> +
>  static void occ_add_powercap_sensors(struct dt_node *power_mgt);
>  static void occ_add_psr_sensors(struct dt_node *power_mgt);
>  
> diff --git a/include/occ.h b/include/occ.h
> index c9faef9fdfb8..a46b9219fc70 100644
> --- a/include/occ.h
> +++ b/include/occ.h
> @@ -14,6 +14,8 @@
>   * limitations under the License.
>   */
>  
> +#include <chip.h>
> +
>  /* OCC Functions */
>  
>  extern void occ_pstates_init(void);
> @@ -36,6 +38,8 @@ enum pnor_owner {
>  };
>  extern void occ_pnor_set_owner(enum pnor_owner owner);
>  
> +/* GPU presence detection */
> +bool occ_get_gpu_presence(struct proc_chip *chip, int gpu_num);
>  
>  /* OCC Inband Sensors */
>  extern bool occ_sensors_init(void);
>
diff mbox series

Patch

diff --git a/hw/occ.c b/hw/occ.c
index fc95d3926bb0..10b2de07dd7a 100644
--- a/hw/occ.c
+++ b/hw/occ.c
@@ -229,10 +229,10 @@  struct occ_response_buffer {
  */
 struct occ_dynamic_data {
 	u8 occ_state;
+	u8 major_version;
+	u8 minor_version;
+	u8 gpus_present;
 	u8 spare1;
-	u8 spare2;
-	u8 spare3;
-	u8 spare4;
 	u8 cpu_throttle;
 	u8 mem_throttle;
 	u8 quick_pwr_drop;
@@ -1230,6 +1230,23 @@  exit:
 	unlock(&chip->queue_lock);
 }
 
+/* True if the OCC reports GPU gpu_num (0-2) present, or cannot report it. */
+bool occ_get_gpu_presence(struct proc_chip *chip, int gpu_num)
+{
+	struct occ_dynamic_data *ddata;
+
+	/* Negative indices must be rejected: a negative shift count is UB. */
+	assert(gpu_num >= 0 && gpu_num <= 2);
+	ddata = get_occ_dynamic_data(chip);
+
+	if (ddata->major_version != 0 || ddata->minor_version < 1) {
+		prlog(PR_INFO, "OCC: OCC not reporting GPU slot presence, "
+		      "assuming device is present\n");
+		return true;
+	}
+	return (bool)(ddata->gpus_present & (1 << gpu_num));
+}
+
 static void occ_add_powercap_sensors(struct dt_node *power_mgt);
 static void occ_add_psr_sensors(struct dt_node *power_mgt);
 
diff --git a/include/occ.h b/include/occ.h
index c9faef9fdfb8..a46b9219fc70 100644
--- a/include/occ.h
+++ b/include/occ.h
@@ -14,6 +14,8 @@ 
  * limitations under the License.
  */
 
+#include <chip.h>
+
 /* OCC Functions */
 
 extern void occ_pstates_init(void);
@@ -36,6 +38,8 @@  enum pnor_owner {
 };
 extern void occ_pnor_set_owner(enum pnor_owner owner);
 
+/* GPU presence detection */
+bool occ_get_gpu_presence(struct proc_chip *chip, int gpu_num);
 
 /* OCC Inband Sensors */
 extern bool occ_sensors_init(void);