npu2: Allow ATSD for LPAR other than 0
diff mbox series

Message ID 20181205045222.40053-1-aik@ozlabs.ru
State Accepted
Headers show
Series
  • npu2: Allow ATSD for LPAR other than 0
Related show

Checks

Context Check Description
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied

Commit Message

Alexey Kardashevskiy Dec. 5, 2018, 4:52 a.m. UTC
Each XTS MMIO ATSD# register is accompanied by another register -
XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
transactions.

When a host system passes a GPU through to a guest, we need to enable
some ATSD for an LPAR. At the moment the host assigns one ATSD to
a NVLink bridge and this maps it to an LPAR when GPU is assigned to
the LPAR. The link number is used for an ATSD index.

ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to be
acceptable price for the simplicity.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 include/npu2-regs.h |  2 ++
 hw/npu2.c           | 22 ++++++++++++++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

Comments

Alistair Popple Dec. 11, 2018, 2:53 a.m. UTC | #1
On Wednesday, 5 December 2018 3:52:22 PM AEDT Alexey Kardashevskiy wrote:
> Each XTS MMIO ATSD# register is accompanied by another register -
> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
> transactions.
> 
> When a host system passes a GPU through to a guest, we need to enable
> some ATSD for an LPAR. At the moment the host assigns one ATSD to
> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
> the LPAR. The link number is used for an ATSD index.
> 
> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to be
> acceptable price for the simplicity.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  include/npu2-regs.h |  2 ++
>  hw/npu2.c           | 22 ++++++++++++++++++----
>  2 files changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index 8273b2b..ae5e225 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -547,6 +547,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>  #define NPU2_XTS_MMIO_ATSD5_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
> NPU2_BLOCK_XTS, 0x128) #define
> NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
> NPU2_BLOCK_XTS, 0x130) #define
> NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
> NPU2_BLOCK_XTS, 0x138) +#define   NPU2_XTS_MMIO_ATSD_MSR_HV		PPC_BIT(51)
> +#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
>  #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC, 
NPU2_BLOCK_XTS,
> 0x4000) #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
>  #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
> diff --git a/hw/npu2.c b/hw/npu2.c
> index 41563b4..767306f 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c
> @@ -2255,9 +2255,14 @@ static int opal_npu_map_lpar(uint64_t phb_id,
> uint64_t bdf, uint64_t lparid, struct phb *phb = pci_get_phb(phb_id);
>  	struct npu2 *p;
>  	struct npu2_dev *ndev = NULL;
> -	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
> +	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
>  	int i;
>  	int id;
> +	static uint64_t atsd_lpar_regs[] = {
> +		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
> +		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
> +		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
> +		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
> 
>  	if (!phb || phb->phb_type != phb_type_npu_v2)
>  		return OPAL_PARAMETER;
> @@ -2297,11 +2302,20 @@ static int opal_npu_map_lpar(uint64_t phb_id,
> uint64_t bdf, uint64_t lparid, xts_bdf_lpar =
> SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid); xts_bdf_lpar =
> SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
> 
> -	/* Need to find an NVLink to send the ATSDs for this device over */
> +	/*
> +	 * Need to find an NVLink to send the ATSDs for this device over.
> +	 * Also, the host allocates an ATSD per NVLink, enable filtering now.
> +	 */
> +	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
> +	if (!lparid)
> +		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
> +
>  	for (i = 0; i < p->total_devices; i++) {
>  		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
> -			ndev = &p->devices[i];
> -			break;
> +			if (!ndev)
> +				ndev = &p->devices[i];
> +			if (i < ARRAY_SIZE(atsd_lpar_regs))
> +				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);

I'm not sure I really like this as ATSD resources should be allocated and 
passed through to guests by the hypervisor independently of the NVLink devices 
themselves, and a single ATSD register can serve more than one device.

This also creates an implicit assumption that the order of NVLink devices in 
p->devices[] matches ATSD register numbers in some way that the hypervisor 
assumes, which seems like it would be especially brittle as devices[] is 
essentially unordered.

How does the hypervisor know which of the 8 ATSD registers has been assigned 
to the LPARID when calling opal_npu_map_lpar? It seems like you might need a 
seperate OPAL call for this along the lines of npu2_map_mmio_atsd(atsd_index, 
lparid).

- Alistair

>  		}
>  	}
Alexey Kardashevskiy Dec. 11, 2018, 5:45 a.m. UTC | #2
On 11/12/2018 13:53, Alistair Popple wrote:
> On Wednesday, 5 December 2018 3:52:22 PM AEDT Alexey Kardashevskiy wrote:
>> Each XTS MMIO ATSD# register is accompanied by another register -
>> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
>> transactions.
>>
>> When a host system passes a GPU through to a guest, we need to enable
>> some ATSD for an LPAR. At the moment the host assigns one ATSD to
>> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
>> the LPAR. The link number is used for an ATSD index.
>>
>> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to be
>> acceptable price for the simplicity.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  include/npu2-regs.h |  2 ++
>>  hw/npu2.c           | 22 ++++++++++++++++++----
>>  2 files changed, 20 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>> index 8273b2b..ae5e225 100644
>> --- a/include/npu2-regs.h
>> +++ b/include/npu2-regs.h
>> @@ -547,6 +547,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>>  #define NPU2_XTS_MMIO_ATSD5_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>> NPU2_BLOCK_XTS, 0x128) #define
>> NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>> NPU2_BLOCK_XTS, 0x130) #define
>> NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>> NPU2_BLOCK_XTS, 0x138) +#define   NPU2_XTS_MMIO_ATSD_MSR_HV		PPC_BIT(51)
>> +#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
>>  #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC, 
> NPU2_BLOCK_XTS,
>> 0x4000) #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
>>  #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
>> diff --git a/hw/npu2.c b/hw/npu2.c
>> index 41563b4..767306f 100644
>> --- a/hw/npu2.c
>> +++ b/hw/npu2.c
>> @@ -2255,9 +2255,14 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>> uint64_t bdf, uint64_t lparid, struct phb *phb = pci_get_phb(phb_id);
>>  	struct npu2 *p;
>>  	struct npu2_dev *ndev = NULL;
>> -	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
>> +	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
>>  	int i;
>>  	int id;
>> +	static uint64_t atsd_lpar_regs[] = {
>> +		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
>> +		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
>> +		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
>> +		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
>>
>>  	if (!phb || phb->phb_type != phb_type_npu_v2)
>>  		return OPAL_PARAMETER;
>> @@ -2297,11 +2302,20 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>> uint64_t bdf, uint64_t lparid, xts_bdf_lpar =
>> SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid); xts_bdf_lpar =
>> SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
>>
>> -	/* Need to find an NVLink to send the ATSDs for this device over */
>> +	/*
>> +	 * Need to find an NVLink to send the ATSDs for this device over.
>> +	 * Also, the host allocates an ATSD per NVLink, enable filtering now.
>> +	 */
>> +	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
>> +	if (!lparid)
>> +		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
>> +
>>  	for (i = 0; i < p->total_devices; i++) {
>>  		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
>> -			ndev = &p->devices[i];
>> -			break;
>> +			if (!ndev)
>> +				ndev = &p->devices[i];
>> +			if (i < ARRAY_SIZE(atsd_lpar_regs))
>> +				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);
> 
> I'm not sure I really like this as ATSD resources should be allocated and 
> passed through to guests by the hypervisor independently of the NVLink devices 
> themselves, and a single ATSD register can serve more than one device.

Sure it can serve more, and all passed ATSDs are assembled in one vPHB
property and the guest then does not make distinction between them.  I
just pass maximum 6 of 8 ATSDs but until recently it was just a single
ATSD per PHB for all devices and yet nobody complained.

The problem here is that I need to mmap() ATSD to QEMU, then map them to
KVM and then present this somehow via the device tree.

Now, I can do mmap() on a fd such as a VFIO device fd or a IOMMU
container fd.  If I choose the dev fd, VFIO in QEMU can mmap it, store
the pointer in the QEMU PCI device, and SPAPR vPHB can put atsd property
in vPHB. If I choose the container fd, then VFIO in QEMU can mmap it but
SPAPR vPHB has no concept of IOMMU/VFIO groups/containers and I'd rather
leave like this.


> This also creates an implicit assumption that the order of NVLink devices in 
> p->devices[] matches ATSD register numbers in some way that the hypervisor 
> assumes, which seems like it would be especially brittle as devices[] is 
> essentially unordered.

The order does not matter here, all that matters is that the order does
not change - any ATSD works with any NVLink bridge on that NPU PHB.


> How does the hypervisor know which of the 8 ATSD registers has been assigned 
> to the LPARID when calling opal_npu_map_lpar?


One ATSD register is exposed to the userspace via mmap() on a
"corresponding" NVLink bridge VFIO device fd, register index == link
index. An NVLink device belongs to an IOMMU group. A KVM pointer (i.e.
LPID) is set to the IOMMU group from an ioctl() called by QEMU.


> It seems like you might need a 
> seperate OPAL call for this along the lines of npu2_map_mmio_atsd(atsd_index, 
> lparid).


I do not see what it buys us. I need an allocator on the host or skiboot
anyway, either static or dynamic, dynamic seems excessive as the
devices[] order or the number of ATSDs do not change.



> 
> - Alistair
> 
>>  		}
>>  	}
> 
>
Stewart Smith Dec. 11, 2018, 6:29 a.m. UTC | #3
Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> Each XTS MMIO ATSD# register is accompanied by another register -
> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
> transactions.
>
> When a host system passes a GPU through to a guest, we need to enable
> some ATSD for an LPAR. At the moment the host assigns one ATSD to
> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
> the LPAR. The link number is used for an ATSD index.
>
> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to be
> acceptable price for the simplicity.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Thanks, merged to master as of d8b161f4b361f70a7bb43be47d4a32b8f937287a
Stewart Smith Dec. 11, 2018, 7:07 a.m. UTC | #4
Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> On 11/12/2018 13:53, Alistair Popple wrote:
>> On Wednesday, 5 December 2018 3:52:22 PM AEDT Alexey Kardashevskiy wrote:
>>> Each XTS MMIO ATSD# register is accompanied by another register -
>>> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
>>> transactions.
>>>
>>> When a host system passes a GPU through to a guest, we need to enable
>>> some ATSD for an LPAR. At the moment the host assigns one ATSD to
>>> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
>>> the LPAR. The link number is used for an ATSD index.
>>>
>>> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to be
>>> acceptable price for the simplicity.
>>>
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> ---
>>>  include/npu2-regs.h |  2 ++
>>>  hw/npu2.c           | 22 ++++++++++++++++++----
>>>  2 files changed, 20 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>>> index 8273b2b..ae5e225 100644
>>> --- a/include/npu2-regs.h
>>> +++ b/include/npu2-regs.h
>>> @@ -547,6 +547,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>>>  #define NPU2_XTS_MMIO_ATSD5_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>> NPU2_BLOCK_XTS, 0x128) #define
>>> NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>> NPU2_BLOCK_XTS, 0x130) #define
>>> NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>> NPU2_BLOCK_XTS, 0x138) +#define   NPU2_XTS_MMIO_ATSD_MSR_HV		PPC_BIT(51)
>>> +#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
>>>  #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC, 
>> NPU2_BLOCK_XTS,
>>> 0x4000) #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
>>>  #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
>>> diff --git a/hw/npu2.c b/hw/npu2.c
>>> index 41563b4..767306f 100644
>>> --- a/hw/npu2.c
>>> +++ b/hw/npu2.c
>>> @@ -2255,9 +2255,14 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>>> uint64_t bdf, uint64_t lparid, struct phb *phb = pci_get_phb(phb_id);
>>>  	struct npu2 *p;
>>>  	struct npu2_dev *ndev = NULL;
>>> -	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
>>> +	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
>>>  	int i;
>>>  	int id;
>>> +	static uint64_t atsd_lpar_regs[] = {
>>> +		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
>>> +		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
>>> +		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
>>> +		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
>>>
>>>  	if (!phb || phb->phb_type != phb_type_npu_v2)
>>>  		return OPAL_PARAMETER;
>>> @@ -2297,11 +2302,20 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>>> uint64_t bdf, uint64_t lparid, xts_bdf_lpar =
>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid); xts_bdf_lpar =
>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
>>>
>>> -	/* Need to find an NVLink to send the ATSDs for this device over */
>>> +	/*
>>> +	 * Need to find an NVLink to send the ATSDs for this device over.
>>> +	 * Also, the host allocates an ATSD per NVLink, enable filtering now.
>>> +	 */
>>> +	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
>>> +	if (!lparid)
>>> +		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
>>> +
>>>  	for (i = 0; i < p->total_devices; i++) {
>>>  		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
>>> -			ndev = &p->devices[i];
>>> -			break;
>>> +			if (!ndev)
>>> +				ndev = &p->devices[i];
>>> +			if (i < ARRAY_SIZE(atsd_lpar_regs))
>>> +				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);
>> 
>> I'm not sure I really like this as ATSD resources should be allocated and 
>> passed through to guests by the hypervisor independently of the NVLink devices 
>> themselves, and a single ATSD register can serve more than one device.
>
> Sure it can serve more, and all passed ATSDs are assembled in one vPHB
> property and the guest then does not make distinction between them.  I
> just pass maximum 6 of 8 ATSDs but until recently it was just a single
> ATSD per PHB for all devices and yet nobody complained.
>
> The problem here is that I need to mmap() ATSD to QEMU, then map them to
> KVM and then present this somehow via the device tree.
>
> Now, I can do mmap() on a fd such as a VFIO device fd or a IOMMU
> container fd.  If I choose the dev fd, VFIO in QEMU can mmap it, store
> the pointer in the QEMU PCI device, and SPAPR vPHB can put atsd property
> in vPHB. If I choose the container fd, then VFIO in QEMU can mmap it but
> SPAPR vPHB has no concept of IOMMU/VFIO groups/containers and I'd rather
> leave like this.
>
>
>> This also creates an implicit assumption that the order of NVLink devices in 
>> p->devices[] matches ATSD register numbers in some way that the hypervisor 
>> assumes, which seems like it would be especially brittle as devices[] is 
>> essentially unordered.
>
> The order does not matter here, all that matters is that the order does
> not change - any ATSD works with any NVLink bridge on that NPU PHB.
>
>
>> How does the hypervisor know which of the 8 ATSD registers has been assigned 
>> to the LPARID when calling opal_npu_map_lpar?
>
>
> One ATSD register is exposed to the userspace via mmap() on a
> "corresponding" NVLink bridge VFIO device fd, register index == link
> index. An NVLink device belongs to an IOMMU group. A KVM pointer (i.e.
> LPID) is set to the IOMMU group from an ioctl() called by QEMU.
>
>
>> It seems like you might need a 
>> seperate OPAL call for this along the lines of npu2_map_mmio_atsd(atsd_index, 
>> lparid).
>
>
> I do not see what it buys us. I need an allocator on the host or skiboot
> anyway, either static or dynamic, dynamic seems excessive as the
> devices[] order or the number of ATSDs do not change.

So, of course I spot discussion after having hit the merge
button[1]. Maybe I should care/look closer at all the NPU2 things?
(it'd help if there was a sane way to test things that I could plug into
op-test)

Any recommendation what I should do from here, is keeping the patch okay?

[1] not an actual button.
Alexey Kardashevskiy Dec. 12, 2018, 12:16 a.m. UTC | #5
On 11/12/2018 18:07, Stewart Smith wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>> On 11/12/2018 13:53, Alistair Popple wrote:
>>> On Wednesday, 5 December 2018 3:52:22 PM AEDT Alexey Kardashevskiy wrote:
>>>> Each XTS MMIO ATSD# register is accompanied by another register -
>>>> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
>>>> transactions.
>>>>
>>>> When a host system passes a GPU through to a guest, we need to enable
>>>> some ATSD for an LPAR. At the moment the host assigns one ATSD to
>>>> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
>>>> the LPAR. The link number is used for an ATSD index.
>>>>
>>>> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to be
>>>> acceptable price for the simplicity.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>  include/npu2-regs.h |  2 ++
>>>>  hw/npu2.c           | 22 ++++++++++++++++++----
>>>>  2 files changed, 20 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>>>> index 8273b2b..ae5e225 100644
>>>> --- a/include/npu2-regs.h
>>>> +++ b/include/npu2-regs.h
>>>> @@ -547,6 +547,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>>>>  #define NPU2_XTS_MMIO_ATSD5_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>> NPU2_BLOCK_XTS, 0x128) #define
>>>> NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>> NPU2_BLOCK_XTS, 0x130) #define
>>>> NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>> NPU2_BLOCK_XTS, 0x138) +#define   NPU2_XTS_MMIO_ATSD_MSR_HV		PPC_BIT(51)
>>>> +#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
>>>>  #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC, 
>>> NPU2_BLOCK_XTS,
>>>> 0x4000) #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
>>>>  #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
>>>> diff --git a/hw/npu2.c b/hw/npu2.c
>>>> index 41563b4..767306f 100644
>>>> --- a/hw/npu2.c
>>>> +++ b/hw/npu2.c
>>>> @@ -2255,9 +2255,14 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>>>> uint64_t bdf, uint64_t lparid, struct phb *phb = pci_get_phb(phb_id);
>>>>  	struct npu2 *p;
>>>>  	struct npu2_dev *ndev = NULL;
>>>> -	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
>>>> +	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
>>>>  	int i;
>>>>  	int id;
>>>> +	static uint64_t atsd_lpar_regs[] = {
>>>> +		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
>>>> +		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
>>>> +		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
>>>> +		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
>>>>
>>>>  	if (!phb || phb->phb_type != phb_type_npu_v2)
>>>>  		return OPAL_PARAMETER;
>>>> @@ -2297,11 +2302,20 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>>>> uint64_t bdf, uint64_t lparid, xts_bdf_lpar =
>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid); xts_bdf_lpar =
>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
>>>>
>>>> -	/* Need to find an NVLink to send the ATSDs for this device over */
>>>> +	/*
>>>> +	 * Need to find an NVLink to send the ATSDs for this device over.
>>>> +	 * Also, the host allocates an ATSD per NVLink, enable filtering now.
>>>> +	 */
>>>> +	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
>>>> +	if (!lparid)
>>>> +		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
>>>> +
>>>>  	for (i = 0; i < p->total_devices; i++) {
>>>>  		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
>>>> -			ndev = &p->devices[i];
>>>> -			break;
>>>> +			if (!ndev)
>>>> +				ndev = &p->devices[i];
>>>> +			if (i < ARRAY_SIZE(atsd_lpar_regs))
>>>> +				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);
>>>
>>> I'm not sure I really like this as ATSD resources should be allocated and 
>>> passed through to guests by the hypervisor independently of the NVLink devices 
>>> themselves, and a single ATSD register can serve more than one device.
>>
>> Sure it can serve more, and all passed ATSDs are assembled in one vPHB
>> property and the guest then does not make distinction between them.  I
>> just pass maximum 6 of 8 ATSDs but until recently it was just a single
>> ATSD per PHB for all devices and yet nobody complained.
>>
>> The problem here is that I need to mmap() ATSD to QEMU, then map them to
>> KVM and then present this somehow via the device tree.
>>
>> Now, I can do mmap() on a fd such as a VFIO device fd or a IOMMU
>> container fd.  If I choose the dev fd, VFIO in QEMU can mmap it, store
>> the pointer in the QEMU PCI device, and SPAPR vPHB can put atsd property
>> in vPHB. If I choose the container fd, then VFIO in QEMU can mmap it but
>> SPAPR vPHB has no concept of IOMMU/VFIO groups/containers and I'd rather
>> leave like this.
>>
>>
>>> This also creates an implicit assumption that the order of NVLink devices in 
>>> p->devices[] matches ATSD register numbers in some way that the hypervisor 
>>> assumes, which seems like it would be especially brittle as devices[] is 
>>> essentially unordered.
>>
>> The order does not matter here, all that matters is that the order does
>> not change - any ATSD works with any NVLink bridge on that NPU PHB.
>>
>>
>>> How does the hypervisor know which of the 8 ATSD registers has been assigned 
>>> to the LPARID when calling opal_npu_map_lpar?
>>
>>
>> One ATSD register is exposed to the userspace via mmap() on a
>> "corresponding" NVLink bridge VFIO device fd, register index == link
>> index. An NVLink device belongs to an IOMMU group. A KVM pointer (i.e.
>> LPID) is set to the IOMMU group from an ioctl() called by QEMU.
>>
>>
>>> It seems like you might need a 
>>> seperate OPAL call for this along the lines of npu2_map_mmio_atsd(atsd_index, 
>>> lparid).
>>
>>
>> I do not see what it buys us. I need an allocator on the host or skiboot
>> anyway, either static or dynamic, dynamic seems excessive as the
>> devices[] order or the number of ATSDs do not change.
> 
> So, of course I spot discussion after having hit the merge
> button[1]. Maybe I should care/look closer at all the NPU2 things?
> (it'd help if there was a sane way to test things that I could plug into
> op-test)

The way to test this is loading nvidia driver and trying ATSD, in both
host and guest. Doable but I have no idea how advanced op-test is.


> Any recommendation what I should do from here, is keeping the patch okay?

I checked with Alistair and yes, please keep it.


> [1] not an actual button.

:)
Alistair Popple Dec. 12, 2018, 12:31 a.m. UTC | #6
On Tuesday, 11 December 2018 6:07:41 PM AEDT Stewart Smith wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> > On 11/12/2018 13:53, Alistair Popple wrote:
> >> On Wednesday, 5 December 2018 3:52:22 PM AEDT Alexey Kardashevskiy wrote:
> >>> Each XTS MMIO ATSD# register is accompanied by another register -
> >>> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
> >>> transactions.
> >>> 
> >>> When a host system passes a GPU through to a guest, we need to enable
> >>> some ATSD for an LPAR. At the moment the host assigns one ATSD to
> >>> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
> >>> the LPAR. The link number is used for an ATSD index.
> >>> 
> >>> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to be
> >>> acceptable price for the simplicity.
> >>> 
> >>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>> ---
> >>> 
> >>>  include/npu2-regs.h |  2 ++
> >>>  hw/npu2.c           | 22 ++++++++++++++++++----
> >>>  2 files changed, 20 insertions(+), 4 deletions(-)
> >>> 
> >>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> >>> index 8273b2b..ae5e225 100644
> >>> --- a/include/npu2-regs.h
> >>> +++ b/include/npu2-regs.h
> >>> @@ -547,6 +547,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t
> >>> scom_base,
> >>> 
> >>>  #define NPU2_XTS_MMIO_ATSD5_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
> >>> 
> >>> NPU2_BLOCK_XTS, 0x128) #define
> >>> NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
> >>> NPU2_BLOCK_XTS, 0x130) #define
> >>> NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
> >>> NPU2_BLOCK_XTS, 0x138) +#define   NPU2_XTS_MMIO_ATSD_MSR_HV		
PPC_BIT(51)
> >>> +#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
> >>> 
> >>>  #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC,
> >> 
> >> NPU2_BLOCK_XTS,
> >> 
> >>> 0x4000) #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
> >>> 
> >>>  #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
> >>> 
> >>> diff --git a/hw/npu2.c b/hw/npu2.c
> >>> index 41563b4..767306f 100644
> >>> --- a/hw/npu2.c
> >>> +++ b/hw/npu2.c
> >>> @@ -2255,9 +2255,14 @@ static int opal_npu_map_lpar(uint64_t phb_id,
> >>> uint64_t bdf, uint64_t lparid, struct phb *phb = pci_get_phb(phb_id);
> >>> 
> >>>  	struct npu2 *p;
> >>>  	struct npu2_dev *ndev = NULL;
> >>> 
> >>> -	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
> >>> +	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
> >>> 
> >>>  	int i;
> >>>  	int id;
> >>> 
> >>> +	static uint64_t atsd_lpar_regs[] = {
> >>> +		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
> >>> +		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
> >>> +		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
> >>> +		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
> >>> 
> >>>  	if (!phb || phb->phb_type != phb_type_npu_v2)
> >>>  	
> >>>  		return OPAL_PARAMETER;
> >>> 
> >>> @@ -2297,11 +2302,20 @@ static int opal_npu_map_lpar(uint64_t phb_id,
> >>> uint64_t bdf, uint64_t lparid, xts_bdf_lpar =
> >>> SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid); xts_bdf_lpar =
> >>> SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
> >>> 
> >>> -	/* Need to find an NVLink to send the ATSDs for this device over */
> >>> +	/*
> >>> +	 * Need to find an NVLink to send the ATSDs for this device over.
> >>> +	 * Also, the host allocates an ATSD per NVLink, enable filtering now.
> >>> +	 */
> >>> +	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
> >>> +	if (!lparid)
> >>> +		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
> >>> +
> >>> 
> >>>  	for (i = 0; i < p->total_devices; i++) {
> >>>  	
> >>>  		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
> >>> 
> >>> -			ndev = &p->devices[i];
> >>> -			break;
> >>> +			if (!ndev)
> >>> +				ndev = &p->devices[i];
> >>> +			if (i < ARRAY_SIZE(atsd_lpar_regs))
> >>> +				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);
> >> 
> >> I'm not sure I really like this as ATSD resources should be allocated and
> >> passed through to guests by the hypervisor independently of the NVLink
> >> devices themselves, and a single ATSD register can serve more than one
> >> device.> 
> > Sure it can serve more, and all passed ATSDs are assembled in one vPHB
> > property and the guest then does not make distinction between them.  I
> > just pass maximum 6 of 8 ATSDs but until recently it was just a single
> > ATSD per PHB for all devices and yet nobody complained.
> > 
> > The problem here is that I need to mmap() ATSD to QEMU, then map them to
> > KVM and then present this somehow via the device tree.
> > 
> > Now, I can do mmap() on a fd such as a VFIO device fd or a IOMMU
> > container fd.  If I choose the dev fd, VFIO in QEMU can mmap it, store
> > the pointer in the QEMU PCI device, and SPAPR vPHB can put atsd property
> > in vPHB. If I choose the container fd, then VFIO in QEMU can mmap it but
> > SPAPR vPHB has no concept of IOMMU/VFIO groups/containers and I'd rather
> > leave like this.
> > 
> >> This also creates an implicit assumption that the order of NVLink devices
> >> in p->devices[] matches ATSD register numbers in some way that the
> >> hypervisor assumes, which seems like it would be especially brittle as
> >> devices[] is essentially unordered.
> > 
> > The order does not matter here, all that matters is that the order does
> > not change - any ATSD works with any NVLink bridge on that NPU PHB.
> >
> >> How does the hypervisor know which of the 8 ATSD registers has been
> >> assigned to the LPARID when calling opal_npu_map_lpar?
> > 
> > One ATSD register is exposed to the userspace via mmap() on a
> > "corresponding" NVLink bridge VFIO device fd, register index == link
> > index. An NVLink device belongs to an IOMMU group. A KVM pointer (i.e.
> > LPID) is set to the IOMMU group from an ioctl() called by QEMU.

Right, but doesn't the hypervisor have to know which of the 8 ATSD registers 
to pass through to mmap to the guest? Only one of them gets mapped to the LPAR 
in opal_npu_map_lpar(), so how do you work out which one to mmap?

> >> It seems like you might need a
> >> seperate OPAL call for this along the lines of
> >> npu2_map_mmio_atsd(atsd_index, lparid).
> > 
> > I do not see what it buys us. I need an allocator on the host or skiboot
> > anyway, either static or dynamic, dynamic seems excessive as the
> > devices[] order or the number of ATSDs do not change.
> 
> So, of course I spot discussion after having hit the merge
> button[1]. Maybe I should care/look closer at all the NPU2 things?

I could of course review things in a more timely and consistent manner :-)

> (it'd help if there was a sane way to test things that I could plug into
> op-test)
> 
> Any recommendation what I should do from here, is keeping the patch okay?

I thought it was ok but after re-reading the discussion I'm less convinced 
again.

- Alistair

> [1] not an actual button.
Alexey Kardashevskiy Dec. 12, 2018, 12:52 a.m. UTC | #7
On 12/12/2018 11:31, Alistair Popple wrote:
> On Tuesday, 11 December 2018 6:07:41 PM AEDT Stewart Smith wrote:
>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>>> On 11/12/2018 13:53, Alistair Popple wrote:
>>>> On Wednesday, 5 December 2018 3:52:22 PM AEDT Alexey Kardashevskiy wrote:
>>>>> Each XTS MMIO ATSD# register is accompanied by another register -
>>>>> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
>>>>> transactions.
>>>>>
>>>>> When a host system passes a GPU through to a guest, we need to enable
>>>>> some ATSD for an LPAR. At the moment the host assigns one ATSD to
>>>>> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
>>>>> the LPAR. The link number is used for an ATSD index.
>>>>>
>>>>> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to be
>>>>> acceptable price for the simplicity.
>>>>>
>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>> ---
>>>>>
>>>>>  include/npu2-regs.h |  2 ++
>>>>>  hw/npu2.c           | 22 ++++++++++++++++++----
>>>>>  2 files changed, 20 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>>>>> index 8273b2b..ae5e225 100644
>>>>> --- a/include/npu2-regs.h
>>>>> +++ b/include/npu2-regs.h
>>>>> @@ -547,6 +547,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>>>>> scom_base,
>>>>>
>>>>>  #define NPU2_XTS_MMIO_ATSD5_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>>
>>>>> NPU2_BLOCK_XTS, 0x128) #define
>>>>> NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>> NPU2_BLOCK_XTS, 0x130) #define
>>>>> NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>> NPU2_BLOCK_XTS, 0x138) +#define   NPU2_XTS_MMIO_ATSD_MSR_HV		
> PPC_BIT(51)
>>>>> +#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
>>>>>
>>>>>  #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>
>>>> NPU2_BLOCK_XTS,
>>>>
>>>>> 0x4000) #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
>>>>>
>>>>>  #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
>>>>>
>>>>> diff --git a/hw/npu2.c b/hw/npu2.c
>>>>> index 41563b4..767306f 100644
>>>>> --- a/hw/npu2.c
>>>>> +++ b/hw/npu2.c
>>>>> @@ -2255,9 +2255,14 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>>>>> uint64_t bdf, uint64_t lparid, struct phb *phb = pci_get_phb(phb_id);
>>>>>
>>>>>  	struct npu2 *p;
>>>>>  	struct npu2_dev *ndev = NULL;
>>>>>
>>>>> -	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
>>>>> +	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
>>>>>
>>>>>  	int i;
>>>>>  	int id;
>>>>>
>>>>> +	static uint64_t atsd_lpar_regs[] = {
>>>>> +		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
>>>>> +		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
>>>>> +		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
>>>>> +		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
>>>>>
>>>>>  	if (!phb || phb->phb_type != phb_type_npu_v2)
>>>>>  	
>>>>>  		return OPAL_PARAMETER;
>>>>>
>>>>> @@ -2297,11 +2302,20 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>>>>> uint64_t bdf, uint64_t lparid, xts_bdf_lpar =
>>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid); xts_bdf_lpar =
>>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
>>>>>
>>>>> -	/* Need to find an NVLink to send the ATSDs for this device over */
>>>>> +	/*
>>>>> +	 * Need to find an NVLink to send the ATSDs for this device over.
>>>>> +	 * Also, the host allocates an ATSD per NVLink, enable filtering now.
>>>>> +	 */
>>>>> +	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
>>>>> +	if (!lparid)
>>>>> +		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
>>>>> +
>>>>>
>>>>>  	for (i = 0; i < p->total_devices; i++) {
>>>>>  	
>>>>>  		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
>>>>>
>>>>> -			ndev = &p->devices[i];
>>>>> -			break;
>>>>> +			if (!ndev)
>>>>> +				ndev = &p->devices[i];
>>>>> +			if (i < ARRAY_SIZE(atsd_lpar_regs))
>>>>> +				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);
>>>>
>>>> I'm not sure I really like this as ATSD resources should be allocated and
>>>> passed through to guests by the hypervisor independently of the NVLink
>>>> devices themselves, and a single ATSD register can serve more than one
>>>> device.> 
>>> Sure it can serve more, and all passed ATSDs are assembled in one vPHB
>>> property and the guest then does not make distinction between them.  I
>>> just pass maximum 6 of 8 ATSDs but until recently it was just a single
>>> ATSD per PHB for all devices and yet nobody complained.
>>>
>>> The problem here is that I need to mmap() ATSD to QEMU, then map them to
>>> KVM and then present this somehow via the device tree.
>>>
>>> Now, I can do mmap() on a fd such as a VFIO device fd or a IOMMU
>>> container fd.  If I choose the dev fd, VFIO in QEMU can mmap it, store
>>> the pointer in the QEMU PCI device, and SPAPR vPHB can put atsd property
>>> in vPHB. If I choose the container fd, then VFIO in QEMU can mmap it but
>>> SPAPR vPHB has no concept of IOMMU/VFIO groups/containers and I'd rather
>>> leave like this.
>>>
>>>> This also creates an implicit assumption that the order of NVLink devices
>>>> in p->devices[] matches ATSD register numbers in some way that the
>>>> hypervisor assumes, which seems like it would be especially brittle as
>>>> devices[] is essentially unordered.
>>>
>>> The order does not matter here, all that matters is that the order does
>>> not change - any ATSD works with any NVLink bridge on that NPU PHB.
>>>
>>>> How does the hypervisor know which of the 8 ATSD registers has been
>>>> assigned to the LPARID when calling opal_npu_map_lpar?
>>>
>>> One ATSD register is exposed to the userspace via mmap() on a
>>> "corresponding" NVLink bridge VFIO device fd, register index == link
>>> index. An NVLink device belongs to an IOMMU group. A KVM pointer (i.e.
>>> LPID) is set to the IOMMU group from an ioctl() called by QEMU.
> 
> Right, but doesn't the hypervisor have to know which of the 8 ATSD registers 
> to pass through to mmap to the guest? Only one of them gets mapped to the LPAR 
> in opal_npu_map_lpar(), so how do you work out which one to mmap?


You got me there :) It is "ibm,npu-link-index" in the hypervisor/vfio
but nothing says the index from npu2::devices[index] is the same thing,
I need this conversion here as "index" is not visible to the host system :-/



> 
>>>> It seems like you might need a
>>>> seperate OPAL call for this along the lines of
>>>> npu2_map_mmio_atsd(atsd_index, lparid).
>>>
>>> I do not see what it buys us. I need an allocator on the host or skiboot
>>> anyway, either static or dynamic, dynamic seems excessive as the
>>> devices[] order or the number of ATSDs do not change.
>>
>> So, of course I spot discussion after having hit the merge
>> button[1]. Maybe I should care/look closer at all the NPU2 things?
> 
> I could of course review things in a more timely and consistent manner :-)
> 
>> (it'd help if there was a sane way to test things that I could plug into
>> op-test)
>>
>> Any recommendation what I should do from here, is keeping the patch okay?
> 
> I thought it was ok but after re-reading the discussion I'm less convinced 
> again.

Yeah, Stewart needs to remove the patch for now.



> 
> - Alistair
> 
>> [1] not an actual button.
> 
>
Alistair Popple Dec. 12, 2018, 2:07 a.m. UTC | #8
On Wednesday, 12 December 2018 11:52:48 AM AEDT Alexey Kardashevskiy wrote:
> On 12/12/2018 11:31, Alistair Popple wrote:
> > On Tuesday, 11 December 2018 6:07:41 PM AEDT Stewart Smith wrote:
> >> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> >>> On 11/12/2018 13:53, Alistair Popple wrote:
> >>>> On Wednesday, 5 December 2018 3:52:22 PM AEDT Alexey Kardashevskiy 
wrote:
> >>>>> Each XTS MMIO ATSD# register is accompanied by another register -
> >>>>> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
> >>>>> transactions.
> >>>>> 
> >>>>> When a host system passes a GPU through to a guest, we need to enable
> >>>>> some ATSD for an LPAR. At the moment the host assigns one ATSD to
> >>>>> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
> >>>>> the LPAR. The link number is used for an ATSD index.
> >>>>> 
> >>>>> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to
> >>>>> be
> >>>>> acceptable price for the simplicity.
> >>>>> 
> >>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>> ---
> >>>>> 
> >>>>>  include/npu2-regs.h |  2 ++
> >>>>>  hw/npu2.c           | 22 ++++++++++++++++++----
> >>>>>  2 files changed, 20 insertions(+), 4 deletions(-)
> >>>>> 
> >>>>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> >>>>> index 8273b2b..ae5e225 100644
> >>>>> --- a/include/npu2-regs.h
> >>>>> +++ b/include/npu2-regs.h
> >>>>> @@ -547,6 +547,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t
> >>>>> scom_base,
> >>>>> 
> >>>>>  #define NPU2_XTS_MMIO_ATSD5_LPARID		
NPU2_REG_OFFSET(NPU2_STACK_MISC,
> >>>>> 
> >>>>> NPU2_BLOCK_XTS, 0x128) #define
> >>>>> NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
> >>>>> NPU2_BLOCK_XTS, 0x130) #define
> >>>>> NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
> >>>>> NPU2_BLOCK_XTS, 0x138) +#define   NPU2_XTS_MMIO_ATSD_MSR_HV
> > 
> > PPC_BIT(51)
> > 
> >>>>> +#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
> >>>>> 
> >>>>>  #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC,
> >>>> 
> >>>> NPU2_BLOCK_XTS,
> >>>> 
> >>>>> 0x4000) #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
> >>>>> 
> >>>>>  #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
> >>>>> 
> >>>>> diff --git a/hw/npu2.c b/hw/npu2.c
> >>>>> index 41563b4..767306f 100644
> >>>>> --- a/hw/npu2.c
> >>>>> +++ b/hw/npu2.c
> >>>>> @@ -2255,9 +2255,14 @@ static int opal_npu_map_lpar(uint64_t phb_id,
> >>>>> uint64_t bdf, uint64_t lparid, struct phb *phb = pci_get_phb(phb_id);
> >>>>> 
> >>>>>  	struct npu2 *p;
> >>>>>  	struct npu2_dev *ndev = NULL;
> >>>>> 
> >>>>> -	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
> >>>>> +	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
> >>>>> 
> >>>>>  	int i;
> >>>>>  	int id;
> >>>>> 
> >>>>> +	static uint64_t atsd_lpar_regs[] = {
> >>>>> +		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
> >>>>> +		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
> >>>>> +		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
> >>>>> +		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
> >>>>> 
> >>>>>  	if (!phb || phb->phb_type != phb_type_npu_v2)
> >>>>>  	
> >>>>>  		return OPAL_PARAMETER;
> >>>>> 
> >>>>> @@ -2297,11 +2302,20 @@ static int opal_npu_map_lpar(uint64_t phb_id,
> >>>>> uint64_t bdf, uint64_t lparid, xts_bdf_lpar =
> >>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid); xts_bdf_lpar
> >>>>> =
> >>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
> >>>>> 
> >>>>> -	/* Need to find an NVLink to send the ATSDs for this device over */
> >>>>> +	/*
> >>>>> +	 * Need to find an NVLink to send the ATSDs for this device over.
> >>>>> +	 * Also, the host allocates an ATSD per NVLink, enable filtering
> >>>>> now.
> >>>>> +	 */
> >>>>> +	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
> >>>>> +	if (!lparid)
> >>>>> +		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
> >>>>> +
> >>>>> 
> >>>>>  	for (i = 0; i < p->total_devices; i++) {
> >>>>>  	
> >>>>>  		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
> >>>>> 
> >>>>> -			ndev = &p->devices[i];
> >>>>> -			break;
> >>>>> +			if (!ndev)
> >>>>> +				ndev = &p->devices[i];
> >>>>> +			if (i < ARRAY_SIZE(atsd_lpar_regs))
> >>>>> +				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);
> >>>> 
> >>>> I'm not sure I really like this as ATSD resources should be allocated
> >>>> and
> >>>> passed through to guests by the hypervisor independently of the NVLink
> >>>> devices themselves, and a single ATSD register can serve more than one
> >>>> device.>
> >>> 
> >>> Sure it can serve more, and all passed ATSDs are assembled in one vPHB
> >>> property and the guest then does not make distinction between them.  I
> >>> just pass maximum 6 of 8 ATSDs but until recently it was just a single
> >>> ATSD per PHB for all devices and yet nobody complained.
> >>> 
> >>> The problem here is that I need to mmap() ATSD to QEMU, then map them to
> >>> KVM and then present this somehow via the device tree.
> >>> 
> >>> Now, I can do mmap() on a fd such as a VFIO device fd or a IOMMU
> >>> container fd.  If I choose the dev fd, VFIO in QEMU can mmap it, store
> >>> the pointer in the QEMU PCI device, and SPAPR vPHB can put atsd property
> >>> in vPHB. If I choose the container fd, then VFIO in QEMU can mmap it but
> >>> SPAPR vPHB has no concept of IOMMU/VFIO groups/containers and I'd rather
> >>> leave like this.
> >>> 
> >>>> This also creates an implicit assumption that the order of NVLink
> >>>> devices
> >>>> in p->devices[] matches ATSD register numbers in some way that the
> >>>> hypervisor assumes, which seems like it would be especially brittle as
> >>>> devices[] is essentially unordered.
> >>> 
> >>> The order does not matter here, all that matters is that the order does
> >>> not change - any ATSD works with any NVLink bridge on that NPU PHB.
> >>> 
> >>>> How does the hypervisor know which of the 8 ATSD registers has been
> >>>> assigned to the LPARID when calling opal_npu_map_lpar?
> >>> 
> >>> One ATSD register is exposed to the userspace via mmap() on a
> >>> "corresponding" NVLink bridge VFIO device fd, register index == link
> >>> index. An NVLink device belongs to an IOMMU group. A KVM pointer (i.e.
> >>> LPID) is set to the IOMMU group from an ioctl() called by QEMU.
> > 
> > Right, but doesn't the hypervisor have to know which of the 8 ATSD
> > registers to pass through to mmap to the guest? Only one of them gets
> > mapped to the LPAR in opal_npu_map_lpar(), so how do you work out which
> > one to mmap?
> 
> You got me there :) It is "ibm,npu-link-index" in the hypervisor/vfio
> but nothing says the index from npu2::devices[index] is the same thing,
> I need this conversion here as "index" is not visible to the host system :-/

Yeah, that is what I was getting at. You could just use dev->link_index (which 
is the same as "ibm,npu-link-index") instead of npu2::devices[index]. That 
would make me somewhat happier. Theorectically there is nothing in HW that 
guarantees there will be a matching ATSD register for every npu-link-index, 
but in practice there is and it looks likely that won't change in future so it 
should be ok.

> >>>> It seems like you might need a
> >>>> seperate OPAL call for this along the lines of
> >>>> npu2_map_mmio_atsd(atsd_index, lparid).
> >>> 
> >>> I do not see what it buys us. I need an allocator on the host or skiboot
> >>> anyway, either static or dynamic, dynamic seems excessive as the
> >>> devices[] order or the number of ATSDs do not change.
> >> 
> >> So, of course I spot discussion after having hit the merge
> >> button[1]. Maybe I should care/look closer at all the NPU2 things?
> > 
> > I could of course review things in a more timely and consistent manner :-)
> > 
> >> (it'd help if there was a sane way to test things that I could plug into
> >> op-test)
> >> 
> >> Any recommendation what I should do from here, is keeping the patch okay?
> > 
> > I thought it was ok but after re-reading the discussion I'm less convinced
> > again.
> 
> Yeah, Stewart needs to remove the patch for now.
> 
> > - Alistair
> > 
> >> [1] not an actual button.
Stewart Smith Dec. 12, 2018, 3:50 a.m. UTC | #9
Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> On 12/12/2018 11:31, Alistair Popple wrote:
>> On Tuesday, 11 December 2018 6:07:41 PM AEDT Stewart Smith wrote:
>>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>>>> On 11/12/2018 13:53, Alistair Popple wrote:
>>>>> On Wednesday, 5 December 2018 3:52:22 PM AEDT Alexey Kardashevskiy wrote:
>>>>>> Each XTS MMIO ATSD# register is accompanied by another register -
>>>>>> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
>>>>>> transactions.
>>>>>>
>>>>>> When a host system passes a GPU through to a guest, we need to enable
>>>>>> some ATSD for an LPAR. At the moment the host assigns one ATSD to
>>>>>> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
>>>>>> the LPAR. The link number is used for an ATSD index.
>>>>>>
>>>>>> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to be
>>>>>> acceptable price for the simplicity.
>>>>>>
>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>> ---
>>>>>>
>>>>>>  include/npu2-regs.h |  2 ++
>>>>>>  hw/npu2.c           | 22 ++++++++++++++++++----
>>>>>>  2 files changed, 20 insertions(+), 4 deletions(-)
>>>>>>
>>>>>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>>>>>> index 8273b2b..ae5e225 100644
>>>>>> --- a/include/npu2-regs.h
>>>>>> +++ b/include/npu2-regs.h
>>>>>> @@ -547,6 +547,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>>>>>> scom_base,
>>>>>>
>>>>>>  #define NPU2_XTS_MMIO_ATSD5_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>>>
>>>>>> NPU2_BLOCK_XTS, 0x128) #define
>>>>>> NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>>> NPU2_BLOCK_XTS, 0x130) #define
>>>>>> NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>>> NPU2_BLOCK_XTS, 0x138) +#define   NPU2_XTS_MMIO_ATSD_MSR_HV		
>> PPC_BIT(51)
>>>>>> +#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
>>>>>>
>>>>>>  #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>>
>>>>> NPU2_BLOCK_XTS,
>>>>>
>>>>>> 0x4000) #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
>>>>>>
>>>>>>  #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
>>>>>>
>>>>>> diff --git a/hw/npu2.c b/hw/npu2.c
>>>>>> index 41563b4..767306f 100644
>>>>>> --- a/hw/npu2.c
>>>>>> +++ b/hw/npu2.c
>>>>>> @@ -2255,9 +2255,14 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>>>>>> uint64_t bdf, uint64_t lparid, struct phb *phb = pci_get_phb(phb_id);
>>>>>>
>>>>>>  	struct npu2 *p;
>>>>>>  	struct npu2_dev *ndev = NULL;
>>>>>>
>>>>>> -	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
>>>>>> +	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
>>>>>>
>>>>>>  	int i;
>>>>>>  	int id;
>>>>>>
>>>>>> +	static uint64_t atsd_lpar_regs[] = {
>>>>>> +		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
>>>>>> +		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
>>>>>> +		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
>>>>>> +		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
>>>>>>
>>>>>>  	if (!phb || phb->phb_type != phb_type_npu_v2)
>>>>>>  	
>>>>>>  		return OPAL_PARAMETER;
>>>>>>
>>>>>> @@ -2297,11 +2302,20 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>>>>>> uint64_t bdf, uint64_t lparid, xts_bdf_lpar =
>>>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid); xts_bdf_lpar =
>>>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
>>>>>>
>>>>>> -	/* Need to find an NVLink to send the ATSDs for this device over */
>>>>>> +	/*
>>>>>> +	 * Need to find an NVLink to send the ATSDs for this device over.
>>>>>> +	 * Also, the host allocates an ATSD per NVLink, enable filtering now.
>>>>>> +	 */
>>>>>> +	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
>>>>>> +	if (!lparid)
>>>>>> +		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
>>>>>> +
>>>>>>
>>>>>>  	for (i = 0; i < p->total_devices; i++) {
>>>>>>  	
>>>>>>  		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
>>>>>>
>>>>>> -			ndev = &p->devices[i];
>>>>>> -			break;
>>>>>> +			if (!ndev)
>>>>>> +				ndev = &p->devices[i];
>>>>>> +			if (i < ARRAY_SIZE(atsd_lpar_regs))
>>>>>> +				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);
>>>>>
>>>>> I'm not sure I really like this as ATSD resources should be allocated and
>>>>> passed through to guests by the hypervisor independently of the NVLink
>>>>> devices themselves, and a single ATSD register can serve more than one
>>>>> device.> 
>>>> Sure it can serve more, and all passed ATSDs are assembled in one vPHB
>>>> property and the guest then does not make distinction between them.  I
>>>> just pass maximum 6 of 8 ATSDs but until recently it was just a single
>>>> ATSD per PHB for all devices and yet nobody complained.
>>>>
>>>> The problem here is that I need to mmap() ATSD to QEMU, then map them to
>>>> KVM and then present this somehow via the device tree.
>>>>
>>>> Now, I can do mmap() on a fd such as a VFIO device fd or a IOMMU
>>>> container fd.  If I choose the dev fd, VFIO in QEMU can mmap it, store
>>>> the pointer in the QEMU PCI device, and SPAPR vPHB can put atsd property
>>>> in vPHB. If I choose the container fd, then VFIO in QEMU can mmap it but
>>>> SPAPR vPHB has no concept of IOMMU/VFIO groups/containers and I'd rather
>>>> leave like this.
>>>>
>>>>> This also creates an implicit assumption that the order of NVLink devices
>>>>> in p->devices[] matches ATSD register numbers in some way that the
>>>>> hypervisor assumes, which seems like it would be especially brittle as
>>>>> devices[] is essentially unordered.
>>>>
>>>> The order does not matter here, all that matters is that the order does
>>>> not change - any ATSD works with any NVLink bridge on that NPU PHB.
>>>>
>>>>> How does the hypervisor know which of the 8 ATSD registers has been
>>>>> assigned to the LPARID when calling opal_npu_map_lpar?
>>>>
>>>> One ATSD register is exposed to the userspace via mmap() on a
>>>> "corresponding" NVLink bridge VFIO device fd, register index == link
>>>> index. An NVLink device belongs to an IOMMU group. A KVM pointer (i.e.
>>>> LPID) is set to the IOMMU group from an ioctl() called by QEMU.
>> 
>> Right, but doesn't the hypervisor have to know which of the 8 ATSD registers 
>> to pass through to mmap to the guest? Only one of them gets mapped to the LPAR 
>> in opal_npu_map_lpar(), so how do you work out which one to mmap?
>
>
> You got me there :) It is "ibm,npu-link-index" in the hypervisor/vfio
> but nothing says the index from npu2::devices[index] is the same thing,
> I need this conversion here as "index" is not visible to the host system :-/
>
>
>
>> 
>>>>> It seems like you might need a
>>>>> seperate OPAL call for this along the lines of
>>>>> npu2_map_mmio_atsd(atsd_index, lparid).
>>>>
>>>> I do not see what it buys us. I need an allocator on the host or skiboot
>>>> anyway, either static or dynamic, dynamic seems excessive as the
>>>> devices[] order or the number of ATSDs do not change.
>>>
>>> So, of course I spot discussion after having hit the merge
>>> button[1]. Maybe I should care/look closer at all the NPU2 things?
>> 
>> I could of course review things in a more timely and consistent manner :-)
>> 
>>> (it'd help if there was a sane way to test things that I could plug into
>>> op-test)
>>>
>>> Any recommendation what I should do from here, is keeping the patch okay?
>> 
>> I thought it was ok but after re-reading the discussion I'm less convinced 
>> again.
>
> Yeah, Stewart needs to remove the patch for now.

No problem - reverted as of eb0226273239d806f404712aacc42961130c7079
Andrew Donnellan Dec. 12, 2018, 3:57 a.m. UTC | #10
On 12/12/18 1:07 pm, Alistair Popple wrote:
>> You got me there :) It is "ibm,npu-link-index" in the hypervisor/vfio
>> but nothing says the index from npu2::devices[index] is the same thing,
>> I need this conversion here as "index" is not visible to the host system :-/
> 
> Yeah, that is what I was getting at. You could just use dev->link_index (which
> is the same as "ibm,npu-link-index") instead of npu2::devices[index]. That
> would make me somewhat happier. Theorectically there is nothing in HW that
> guarantees there will be a matching ATSD register for every npu-link-index,
> but in practice there is and it looks likely that won't change in future so it
> should be ok.

I didn't see this patch early enough and only noticed it when rebasing 
my current OpenCAPI work.

As part of my Witherspoon OpenCAPI enablement work I am currently 
looking at every place where we're currently indexing into the devices 
array (as that's a place where you probably want to check the type of 
the device you're looking at). There should be no uses of the array 
index at all, IMHO - always check link_index or brick_index explicitly.

Anyway, the v2 of this patch will need to be rebased on top of the 
series I'm about to send :)
Alexey Kardashevskiy Dec. 12, 2018, 4:19 a.m. UTC | #11
On 12/12/2018 14:57, Andrew Donnellan wrote:
> On 12/12/18 1:07 pm, Alistair Popple wrote:
>>> You got me there :) It is "ibm,npu-link-index" in the hypervisor/vfio
>>> but nothing says the index from npu2::devices[index] is the same thing,
>>> I need this conversion here as "index" is not visible to the host
>>> system :-/
>>
>> Yeah, that is what I was getting at. You could just use
>> dev->link_index (which
>> is the same as "ibm,npu-link-index") instead of npu2::devices[index].
>> That
>> would make me somewhat happier. Theorectically there is nothing in HW
>> that
>> guarantees there will be a matching ATSD register for every
>> npu-link-index,
>> but in practice there is and it looks likely that won't change in
>> future so it
>> should be ok.
> 
> I didn't see this patch early enough and only noticed it when rebasing
> my current OpenCAPI work.
> 
> As part of my Witherspoon OpenCAPI enablement work I am currently
> looking at every place where we're currently indexing into the devices
> array (as that's a place where you probably want to check the type of
> the device you're looking at). There should be no uses of the array
> index at all, IMHO - always check link_index or brick_index explicitly.\

The current plan is to ditch dev->link_index and store the device
pointer in npu2::devices[dt_prop_get_u32(link, "ibm,npu-link-index")] as
ibm,npu-link-index is a skiboot made-up thing anyway,

https://github.com/open-power/skiboot/blob/master/hdata/spira.c#L1502

and then the index in devices[] will make sense.

> 
> Anyway, the v2 of this patch will need to be rebased on top of the
> series I'm about to send :)
>
Alistair Popple Dec. 13, 2018, 12:25 a.m. UTC | #12
On Wednesday, 12 December 2018 1:07:45 PM AEDT Alistair Popple wrote:
> On Wednesday, 12 December 2018 11:52:48 AM AEDT Alexey Kardashevskiy wrote:
> > On 12/12/2018 11:31, Alistair Popple wrote:
> > > On Tuesday, 11 December 2018 6:07:41 PM AEDT Stewart Smith wrote:
> > >> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> > >>> On 11/12/2018 13:53, Alistair Popple wrote:
> > >>>> On Wednesday, 5 December 2018 3:52:22 PM AEDT Alexey Kardashevskiy
> 
> wrote:
> > >>>>> Each XTS MMIO ATSD# register is accompanied by another register -
> > >>>>> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
> > >>>>> transactions.
> > >>>>> 
> > >>>>> When a host system passes a GPU through to a guest, we need to
> > >>>>> enable
> > >>>>> some ATSD for an LPAR. At the moment the host assigns one ATSD to
> > >>>>> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
> > >>>>> the LPAR. The link number is used for an ATSD index.
> > >>>>> 
> > >>>>> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to
> > >>>>> be
> > >>>>> acceptable price for the simplicity.
> > >>>>> 
> > >>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > >>>>> ---
> > >>>>> 
> > >>>>>  include/npu2-regs.h |  2 ++
> > >>>>>  hw/npu2.c           | 22 ++++++++++++++++++----
> > >>>>>  2 files changed, 20 insertions(+), 4 deletions(-)
> > >>>>> 
> > >>>>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> > >>>>> index 8273b2b..ae5e225 100644
> > >>>>> --- a/include/npu2-regs.h
> > >>>>> +++ b/include/npu2-regs.h
> > >>>>> @@ -547,6 +547,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t
> > >>>>> scom_base,
> > >>>>> 
> > >>>>>  #define NPU2_XTS_MMIO_ATSD5_LPARID
> 
> NPU2_REG_OFFSET(NPU2_STACK_MISC,
> 
> > >>>>> NPU2_BLOCK_XTS, 0x128) #define
> > >>>>> NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
> > >>>>> NPU2_BLOCK_XTS, 0x130) #define
> > >>>>> NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
> > >>>>> NPU2_BLOCK_XTS, 0x138) +#define   NPU2_XTS_MMIO_ATSD_MSR_HV
> > > 
> > > PPC_BIT(51)
> > > 
> > >>>>> +#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
> > >>>>> 
> > >>>>>  #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC,
> > >>>> 
> > >>>> NPU2_BLOCK_XTS,
> > >>>> 
> > >>>>> 0x4000) #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
> > >>>>> 
> > >>>>>  #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
> > >>>>> 
> > >>>>> diff --git a/hw/npu2.c b/hw/npu2.c
> > >>>>> index 41563b4..767306f 100644
> > >>>>> --- a/hw/npu2.c
> > >>>>> +++ b/hw/npu2.c
> > >>>>> @@ -2255,9 +2255,14 @@ static int opal_npu_map_lpar(uint64_t phb_id,
> > >>>>> uint64_t bdf, uint64_t lparid, struct phb *phb =
> > >>>>> pci_get_phb(phb_id);
> > >>>>> 
> > >>>>>  	struct npu2 *p;
> > >>>>>  	struct npu2_dev *ndev = NULL;
> > >>>>> 
> > >>>>> -	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
> > >>>>> +	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
> > >>>>> 
> > >>>>>  	int i;
> > >>>>>  	int id;
> > >>>>> 
> > >>>>> +	static uint64_t atsd_lpar_regs[] = {
> > >>>>> +		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
> > >>>>> +		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
> > >>>>> +		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
> > >>>>> +		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
> > >>>>> 
> > >>>>>  	if (!phb || phb->phb_type != phb_type_npu_v2)
> > >>>>>  	
> > >>>>>  		return OPAL_PARAMETER;
> > >>>>> 
> > >>>>> @@ -2297,11 +2302,20 @@ static int opal_npu_map_lpar(uint64_t
> > >>>>> phb_id,
> > >>>>> uint64_t bdf, uint64_t lparid, xts_bdf_lpar =
> > >>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid);
> > >>>>> xts_bdf_lpar
> > >>>>> =
> > >>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
> > >>>>> 
> > >>>>> -	/* Need to find an NVLink to send the ATSDs for this device over
> > >>>>> */
> > >>>>> +	/*
> > >>>>> +	 * Need to find an NVLink to send the ATSDs for this device over.
> > >>>>> +	 * Also, the host allocates an ATSD per NVLink, enable filtering
> > >>>>> now.
> > >>>>> +	 */
> > >>>>> +	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
> > >>>>> +	if (!lparid)
> > >>>>> +		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
> > >>>>> +
> > >>>>> 
> > >>>>>  	for (i = 0; i < p->total_devices; i++) {
> > >>>>>  	
> > >>>>>  		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
> > >>>>> 
> > >>>>> -			ndev = &p->devices[i];
> > >>>>> -			break;
> > >>>>> +			if (!ndev)
> > >>>>> +				ndev = &p->devices[i];
> > >>>>> +			if (i < ARRAY_SIZE(atsd_lpar_regs))
> > >>>>> +				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);
> > >>>> 
> > >>>> I'm not sure I really like this as ATSD resources should be allocated
> > >>>> and
> > >>>> passed through to guests by the hypervisor independently of the
> > >>>> NVLink
> > >>>> devices themselves, and a single ATSD register can serve more than
> > >>>> one
> > >>>> device.>
> > >>> 
> > >>> Sure it can serve more, and all passed ATSDs are assembled in one vPHB
> > >>> property and the guest then does not make distinction between them.  I
> > >>> just pass maximum 6 of 8 ATSDs but until recently it was just a single
> > >>> ATSD per PHB for all devices and yet nobody complained.
> > >>> 
> > >>> The problem here is that I need to mmap() ATSD to QEMU, then map them
> > >>> to
> > >>> KVM and then present this somehow via the device tree.
> > >>> 
> > >>> Now, I can do mmap() on a fd such as a VFIO device fd or a IOMMU
> > >>> container fd.  If I choose the dev fd, VFIO in QEMU can mmap it, store
> > >>> the pointer in the QEMU PCI device, and SPAPR vPHB can put atsd
> > >>> property
> > >>> in vPHB. If I choose the container fd, then VFIO in QEMU can mmap it
> > >>> but
> > >>> SPAPR vPHB has no concept of IOMMU/VFIO groups/containers and I'd
> > >>> rather
> > >>> leave like this.
> > >>> 
> > >>>> This also creates an implicit assumption that the order of NVLink
> > >>>> devices
> > >>>> in p->devices[] matches ATSD register numbers in some way that the
> > >>>> hypervisor assumes, which seems like it would be especially brittle
> > >>>> as
> > >>>> devices[] is essentially unordered.
> > >>> 
> > >>> The order does not matter here, all that matters is that the order
> > >>> does
> > >>> not change - any ATSD works with any NVLink bridge on that NPU PHB.
> > >>> 
> > >>>> How does the hypervisor know which of the 8 ATSD registers has been
> > >>>> assigned to the LPARID when calling opal_npu_map_lpar?
> > >>> 
> > >>> One ATSD register is exposed to the userspace via mmap() on a
> > >>> "corresponding" NVLink bridge VFIO device fd, register index == link
> > >>> index. An NVLink device belongs to an IOMMU group. A KVM pointer (i.e.
> > >>> LPID) is set to the IOMMU group from an ioctl() called by QEMU.
> > > 
> > > Right, but doesn't the hypervisor have to know which of the 8 ATSD
> > > registers to pass through to mmap to the guest? Only one of them gets
> > > mapped to the LPAR in opal_npu_map_lpar(), so how do you work out which
> > > one to mmap?
> > 
> > You got me there :) It is "ibm,npu-link-index" in the hypervisor/vfio
> > but nothing says the index from npu2::devices[index] is the same thing,
> > I need this conversion here as "index" is not visible to the host system
> > :-/
> Yeah, that is what I was getting at. You could just use dev->link_index
> (which is the same as "ibm,npu-link-index") instead of
> npu2::devices[index]. That would make me somewhat happier. Theorectically
> there is nothing in HW that guarantees there will be a matching ATSD
> register for every npu-link-index, but in practice there is and it looks
> likely that won't change in future so it should be ok.

One more thought. You're adding features to this function (and others) as part 
of this pass through work, have you given any thought to what happens when 
someone tries to use pass through on firmware versions too old to support all 
the required features? I can't see any way of detecting this at runtime with 
the current design so I guess you're just going to mandate that at least 
version X of Skiboot is required for pass through to work? Pretty sure someone 
will try on an old version though :-)

- Alistair

> > >>>> It seems like you might need a
> > >>>> seperate OPAL call for this along the lines of
> > >>>> npu2_map_mmio_atsd(atsd_index, lparid).
> > >>> 
> > >>> I do not see what it buys us. I need an allocator on the host or
> > >>> skiboot
> > >>> anyway, either static or dynamic, dynamic seems excessive as the
> > >>> devices[] order or the number of ATSDs do not change.
> > >> 
> > >> So, of course I spot discussion after having hit the merge
> > >> button[1]. Maybe I should care/look closer at all the NPU2 things?
> > > 
> > > I could of course review things in a more timely and consistent manner
> > > :-)
> > > 
> > >> (it'd help if there was a sane way to test things that I could plug
> > >> into
> > >> op-test)
> > >> 
> > >> Any recommendation what I should do from here, is keeping the patch
> > >> okay?
> > > 
> > > I thought it was ok but after re-reading the discussion I'm less
> > > convinced
> > > again.
> > 
> > Yeah, Stewart needs to remove the patch for now.
> > 
> > > - Alistair
> > > 
> > >> [1] not an actual button.
Alexey Kardashevskiy Dec. 13, 2018, 2:23 a.m. UTC | #13
On 13/12/2018 11:25, Alistair Popple wrote:
> On Wednesday, 12 December 2018 1:07:45 PM AEDT Alistair Popple wrote:
>> On Wednesday, 12 December 2018 11:52:48 AM AEDT Alexey Kardashevskiy wrote:
>>> On 12/12/2018 11:31, Alistair Popple wrote:
>>>> On Tuesday, 11 December 2018 6:07:41 PM AEDT Stewart Smith wrote:
>>>>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>>>>>> On 11/12/2018 13:53, Alistair Popple wrote:
>>>>>>> On Wednesday, 5 December 2018 3:52:22 PM AEDT Alexey Kardashevskiy
>>
>> wrote:
>>>>>>>> Each XTS MMIO ATSD# register is accompanied by another register -
>>>>>>>> XTS MMIO ATSD0 LPARID# - which controls LPID filtering for ATSD
>>>>>>>> transactions.
>>>>>>>>
>>>>>>>> When a host system passes a GPU through to a guest, we need to
>>>>>>>> enable
>>>>>>>> some ATSD for an LPAR. At the moment the host assigns one ATSD to
>>>>>>>> a NVLink bridge and this maps it to an LPAR when GPU is assigned to
>>>>>>>> the LPAR. The link number is used for an ATSD index.
>>>>>>>>
>>>>>>>> ATSD6&7 stay mapped to the host (LPAR=0) all the time which seems to
>>>>>>>> be
>>>>>>>> acceptable price for the simplicity.
>>>>>>>>
>>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>>> ---
>>>>>>>>
>>>>>>>>  include/npu2-regs.h |  2 ++
>>>>>>>>  hw/npu2.c           | 22 ++++++++++++++++++----
>>>>>>>>  2 files changed, 20 insertions(+), 4 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>>>>>>>> index 8273b2b..ae5e225 100644
>>>>>>>> --- a/include/npu2-regs.h
>>>>>>>> +++ b/include/npu2-regs.h
>>>>>>>> @@ -547,6 +547,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>>>>>>>> scom_base,
>>>>>>>>
>>>>>>>>  #define NPU2_XTS_MMIO_ATSD5_LPARID
>>
>> NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>
>>>>>>>> NPU2_BLOCK_XTS, 0x128) #define
>>>>>>>> NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>>>>> NPU2_BLOCK_XTS, 0x130) #define
>>>>>>>> NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>>>>> NPU2_BLOCK_XTS, 0x138) +#define   NPU2_XTS_MMIO_ATSD_MSR_HV
>>>>
>>>> PPC_BIT(51)
>>>>
>>>>>>>> +#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
>>>>>>>>
>>>>>>>>  #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC,
>>>>>>>
>>>>>>> NPU2_BLOCK_XTS,
>>>>>>>
>>>>>>>> 0x4000) #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
>>>>>>>>
>>>>>>>>  #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
>>>>>>>>
>>>>>>>> diff --git a/hw/npu2.c b/hw/npu2.c
>>>>>>>> index 41563b4..767306f 100644
>>>>>>>> --- a/hw/npu2.c
>>>>>>>> +++ b/hw/npu2.c
>>>>>>>> @@ -2255,9 +2255,14 @@ static int opal_npu_map_lpar(uint64_t phb_id,
>>>>>>>> uint64_t bdf, uint64_t lparid, struct phb *phb =
>>>>>>>> pci_get_phb(phb_id);
>>>>>>>>
>>>>>>>>  	struct npu2 *p;
>>>>>>>>  	struct npu2_dev *ndev = NULL;
>>>>>>>>
>>>>>>>> -	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
>>>>>>>> +	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
>>>>>>>>
>>>>>>>>  	int i;
>>>>>>>>  	int id;
>>>>>>>>
>>>>>>>> +	static uint64_t atsd_lpar_regs[] = {
>>>>>>>> +		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
>>>>>>>> +		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
>>>>>>>> +		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
>>>>>>>> +		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
>>>>>>>>
>>>>>>>>  	if (!phb || phb->phb_type != phb_type_npu_v2)
>>>>>>>>  	
>>>>>>>>  		return OPAL_PARAMETER;
>>>>>>>>
>>>>>>>> @@ -2297,11 +2302,20 @@ static int opal_npu_map_lpar(uint64_t
>>>>>>>> phb_id,
>>>>>>>> uint64_t bdf, uint64_t lparid, xts_bdf_lpar =
>>>>>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid);
>>>>>>>> xts_bdf_lpar
>>>>>>>> =
>>>>>>>> SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
>>>>>>>>
>>>>>>>> -	/* Need to find an NVLink to send the ATSDs for this device over
>>>>>>>> */
>>>>>>>> +	/*
>>>>>>>> +	 * Need to find an NVLink to send the ATSDs for this device over.
>>>>>>>> +	 * Also, the host allocates an ATSD per NVLink, enable filtering
>>>>>>>> now.
>>>>>>>> +	 */
>>>>>>>> +	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
>>>>>>>> +	if (!lparid)
>>>>>>>> +		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
>>>>>>>> +
>>>>>>>>
>>>>>>>>  	for (i = 0; i < p->total_devices; i++) {
>>>>>>>>  	
>>>>>>>>  		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
>>>>>>>>
>>>>>>>> -			ndev = &p->devices[i];
>>>>>>>> -			break;
>>>>>>>> +			if (!ndev)
>>>>>>>> +				ndev = &p->devices[i];
>>>>>>>> +			if (i < ARRAY_SIZE(atsd_lpar_regs))
>>>>>>>> +				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);
>>>>>>>
>>>>>>> I'm not sure I really like this as ATSD resources should be allocated
>>>>>>> and
>>>>>>> passed through to guests by the hypervisor independently of the
>>>>>>> NVLink
>>>>>>> devices themselves, and a single ATSD register can serve more than
>>>>>>> one
>>>>>>> device.>
>>>>>>
>>>>>> Sure it can serve more, and all passed ATSDs are assembled in one vPHB
>>>>>> property and the guest then does not make distinction between them.  I
>>>>>> just pass maximum 6 of 8 ATSDs but until recently it was just a single
>>>>>> ATSD per PHB for all devices and yet nobody complained.
>>>>>>
>>>>>> The problem here is that I need to mmap() ATSD to QEMU, then map them
>>>>>> to
>>>>>> KVM and then present this somehow via the device tree.
>>>>>>
>>>>>> Now, I can do mmap() on a fd such as a VFIO device fd or a IOMMU
>>>>>> container fd.  If I choose the dev fd, VFIO in QEMU can mmap it, store
>>>>>> the pointer in the QEMU PCI device, and SPAPR vPHB can put atsd
>>>>>> property
>>>>>> in vPHB. If I choose the container fd, then VFIO in QEMU can mmap it
>>>>>> but
>>>>>> SPAPR vPHB has no concept of IOMMU/VFIO groups/containers and I'd
>>>>>> rather
>>>>>> leave like this.
>>>>>>
>>>>>>> This also creates an implicit assumption that the order of NVLink
>>>>>>> devices
>>>>>>> in p->devices[] matches ATSD register numbers in some way that the
>>>>>>> hypervisor assumes, which seems like it would be especially brittle
>>>>>>> as
>>>>>>> devices[] is essentially unordered.
>>>>>>
>>>>>> The order does not matter here, all that matters is that the order
>>>>>> does
>>>>>> not change - any ATSD works with any NVLink bridge on that NPU PHB.
>>>>>>
>>>>>>> How does the hypervisor know which of the 8 ATSD registers has been
>>>>>>> assigned to the LPARID when calling opal_npu_map_lpar?
>>>>>>
>>>>>> One ATSD register is exposed to the userspace via mmap() on a
>>>>>> "corresponding" NVLink bridge VFIO device fd, register index == link
>>>>>> index. An NVLink device belongs to an IOMMU group. A KVM pointer (i.e.
>>>>>> LPID) is set to the IOMMU group from an ioctl() called by QEMU.
>>>>
>>>> Right, but doesn't the hypervisor have to know which of the 8 ATSD
>>>> registers to pass through to mmap to the guest? Only one of them gets
>>>> mapped to the LPAR in opal_npu_map_lpar(), so how do you work out which
>>>> one to mmap?
>>>
>>> You got me there :) It is "ibm,npu-link-index" in the hypervisor/vfio
>>> but nothing says the index from npu2::devices[index] is the same thing,
>>> I need this conversion here as "index" is not visible to the host system
>>> :-/
>> Yeah, that is what I was getting at. You could just use dev->link_index
>> (which is the same as "ibm,npu-link-index") instead of
>> npu2::devices[index]. That would make me somewhat happier. Theorectically
>> there is nothing in HW that guarantees there will be a matching ATSD
>> register for every npu-link-index, but in practice there is and it looks
>> likely that won't change in future so it should be ok.
> 
> One more thought. You're adding features to this function (and others) as part 
> of this pass through work, have you given any thought to what happens when 
> someone tries to use pass through on firmware versions too old to support all 
> the required features? I can't see any way of detecting this at runtime with 
> the current design so I guess you're just going to mandate that at least 
> version X of Skiboot is required for pass through to work? Pretty sure someone 
> will try on an old version though :-)


If the firmware is old, then ATS/ATSD simply won't work (as they remain
mapped to LPID=0) but that's about it. Since ATSD is the only supported
config, we will be telling people to use skiboot v6.3 when it is out, yes.

Patch
diff mbox series

diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index 8273b2b..ae5e225 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -547,6 +547,8 @@  void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define NPU2_XTS_MMIO_ATSD5_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_XTS, 0x128)
 #define NPU2_XTS_MMIO_ATSD6_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_XTS, 0x130)
 #define NPU2_XTS_MMIO_ATSD7_LPARID		NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_XTS, 0x138)
+#define   NPU2_XTS_MMIO_ATSD_MSR_HV		PPC_BIT(51)
+#define   NPU2_XTS_MMIO_ATSD_LPARID		PPC_BITMASK(52,63)
 #define NPU2_XTS_BDF_MAP			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_XTS, 0x4000)
 #define   NPU2_XTS_BDF_MAP_VALID		PPC_BIT(0)
 #define   NPU2_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
diff --git a/hw/npu2.c b/hw/npu2.c
index 41563b4..767306f 100644
--- a/hw/npu2.c
+++ b/hw/npu2.c
@@ -2255,9 +2255,14 @@  static int opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
 	struct phb *phb = pci_get_phb(phb_id);
 	struct npu2 *p;
 	struct npu2_dev *ndev = NULL;
-	uint64_t xts_bdf_lpar, rc = OPAL_SUCCESS;
+	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
 	int i;
 	int id;
+	static uint64_t atsd_lpar_regs[] = {
+		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
+		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
+		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
+		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID };
 
 	if (!phb || phb->phb_type != phb_type_npu_v2)
 		return OPAL_PARAMETER;
@@ -2297,11 +2302,20 @@  static int opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid);
 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
 
-	/* Need to find an NVLink to send the ATSDs for this device over */
+	/*
+	 * Need to find an NVLink to send the ATSDs for this device over.
+	 * Also, the host allocates an ATSD per NVLink, enable filtering now.
+	 */
+	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
+	if (!lparid)
+		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
+
 	for (i = 0; i < p->total_devices; i++) {
 		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
-			ndev = &p->devices[i];
-			break;
+			if (!ndev)
+				ndev = &p->devices[i];
+			if (i < ARRAY_SIZE(atsd_lpar_regs))
+				npu2_write(p, atsd_lpar_regs[i], atsd_lpar);
 		}
 	}