[05/15] hw/nvme: Add support for SR-IOV

Message ID 20211007162406.1920374-6-lukasz.maniak@linux.intel.com
State New
Series hw/nvme: SR-IOV with Virtualization Enhancements

Commit Message

Lukasz Maniak Oct. 7, 2021, 4:23 p.m. UTC
This patch implements initial support for Single Root I/O Virtualization
on an NVMe device.

Essentially, it allows the maximum number of virtual functions supported
by the NVMe controller to be defined via the sriov_max_vfs parameter.

Passing a non-zero value to sriov_max_vfs causes the physical controller
to report the SR-IOV capability, and both the physical and virtual
function devices to report the ARI capability.

NVMe controllers created via virtual functions currently mirror the
physical controller functionally, which may not be entirely appropriate;
how to limit the capabilities of the VF still needs consideration.

An NVMe subsystem is required for the use of SR-IOV.
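
For example, a minimal invocation enabling SR-IOV might look like this
(a sketch; the IDs, serial, and guest PCI address are illustrative):

    -device nvme-subsys,id=nvme-subsys-0 \
    -device nvme,serial=deadbeef,subsys=nvme-subsys-0,sriov_max_vfs=4

The guest can then enable virtual functions through the standard sysfs
interface and verify that the capabilities are reported:

    # echo 4 > /sys/bus/pci/devices/0000:01:00.0/sriov_numvfs
    # lspci -s 01:00.0 -vv | grep -e SR-IOV -e ARI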

Signed-off-by: Lukasz Maniak <lukasz.maniak@linux.intel.com>
---
 hw/nvme/ctrl.c           | 74 ++++++++++++++++++++++++++++++++++++++--
 hw/nvme/nvme.h           |  1 +
 include/hw/pci/pci_ids.h |  1 +
 3 files changed, 73 insertions(+), 3 deletions(-)

Comments

Klaus Jensen Oct. 20, 2021, 7:07 p.m. UTC | #1
On Oct  7 18:23, Lukasz Maniak wrote:
> [...]
>
> @@ -6321,6 +6349,20 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
>      memory_region_set_enabled(&n->pmr.dev->mr, false);
>  }
>  
> +static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
> +                            uint64_t bar_size)
> +{
> +    uint16_t vf_dev_id = n->params.use_intel_id ?
> +                         PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
> +
> +    pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
> +                       n->params.sriov_max_vfs, n->params.sriov_max_vfs,
> +                       NVME_VF_OFFSET, NVME_VF_STRIDE, NULL);

Did you consider adding a new device for the virtual function device,
"nvmevf"?

Down the road it might help with the variations in capabilities that you
describe.
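
As a rough illustration of the suggestion (a hypothetical sketch only,
assuming the nvme device can be subclassed via QOM; nvme_vf_class_init
and nvme_vf_register_types are illustrative names, not part of this
series):

    /* hypothetical sketch, reusing the definitions in hw/nvme/nvme.h */
    static void nvme_vf_class_init(ObjectClass *oc, void *data)
    {
        /* illustrative: restrict the VF's capabilities here */
    }

    static const TypeInfo nvme_vf_info = {
        .name          = "nvmevf",
        .parent        = TYPE_NVME,          /* inherit the nvme device implementation */
        .instance_size = sizeof(NvmeCtrl),
        .class_init    = nvme_vf_class_init,
    };

    static void nvme_vf_register_types(void)
    {
        type_register_static(&nvme_vf_info);
    }
    type_init(nvme_vf_register_types)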
Lukasz Maniak Oct. 21, 2021, 2:33 p.m. UTC | #2
On Wed, Oct 20, 2021 at 09:07:47PM +0200, Klaus Jensen wrote:
> [...]
>
> Did you consider adding a new device for the virtual function device,
> "nvmevf"?
> 
> Down the road it might help with the variations in capabilities that you
> describe.

Hi Klaus,

A separate nvmevf device was actually the first approach I tried.
However, it came down to copying the nvme device's functions just to
make a few changes that can instead be covered with conditionals.

As for limiting VF capabilities, the problem comes down to cleanly
restricting the command set supported by the VF controller, so using a
separate nvmevf device for that purpose sounds like overkill.

Concerning the restriction of the supported command set, an actual
device would limit the VF's ability to use namespace attachment,
namespace management, virtualization enhancements, and the
corresponding identify commands. However, since implementing secure
virtualization in QEMU would be complex and is not strictly required,
it can be skipped for now.
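
For illustration, such a conditional restriction could look roughly like
this (a sketch only; the opcode list and placement are assumptions, not
part of this series):

    /* hypothetical sketch, e.g. at the top of nvme_admin_cmd() in ctrl.c:
     * reject commands that a real device would not expose on a VF.
     */
    if (pci_is_vf(&n->parent_obj)) {
        switch (req->cmd.opcode) {
        case NVME_ADM_CMD_NS_ATTACHMENT:
            return NVME_INVALID_OPCODE | NVME_DNR;
        default:
            break;
        }
    }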

Kind regards,
Lukasz
Klaus Jensen Nov. 2, 2021, 2:33 p.m. UTC | #3
On Oct  7 18:23, Lukasz Maniak wrote:
> [...]
>
> @@ -6361,8 +6406,12 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
>                            n->reg_size);
>      memory_region_add_subregion(&n->bar0, 0, &n->iomem);
>  
> -    pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
> -                     PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
> +    if (pci_is_vf(pci_dev)) {
> +        pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
> +    } else {
> +        pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
> +                         PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
> +    }

I assume that the assert we are seeing means that the pci_register_bars
in nvme_init_cmb and nvme_init_pmr must be changed similarly to this.
Lukasz Maniak Nov. 2, 2021, 5:33 p.m. UTC | #4
On Tue, Nov 02, 2021 at 03:33:15PM +0100, Klaus Jensen wrote:
> [...]
>
> I assume that the assert we are seeing means that the pci_register_bars
> in nvme_init_cmb and nvme_init_pmr must be changed similarly to this.

The assert will only arise for CMB, as the VF params are initialized
from the PF params:

@@ -6532,6 +6585,15 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
     NvmeCtrl *n = NVME(pci_dev);
     NvmeNamespace *ns;
     Error *local_err = NULL;
+    NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
+
+    if (pci_is_vf(pci_dev)) {
+        /* VFs derive settings from the parent. PF's lifespan exceeds
+         * that of VF's, so it's safe to share params.serial.
+         */
+        memcpy(&n->params, &pn->params, sizeof(NvmeParams));
+        n->subsys = pn->subsys;
+    }
 
     nvme_check_constraints(n, &local_err);
     if (local_err) {

The following simple fix will both resolve the assert and allow each VF
to have its own CMB of the size defined for the PF.

---
 hw/nvme/ctrl.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 19b32dd4da..99daa6290c 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6837,10 +6837,15 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
     n->cmb.buf = g_malloc0(cmb_size);
     memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
                           "nvme-cmb", cmb_size);
-    pci_register_bar(pci_dev, NVME_CMB_BIR,
-                     PCI_BASE_ADDRESS_SPACE_MEMORY |
-                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
-                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
+
+    if (pci_is_vf(pci_dev)) {
+        pcie_sriov_vf_register_bar(pci_dev, NVME_CMB_BIR, &n->cmb.mem);
+    } else {
+        pci_register_bar(pci_dev, NVME_CMB_BIR,
+                        PCI_BASE_ADDRESS_SPACE_MEMORY |
+                        PCI_BASE_ADDRESS_MEM_TYPE_64 |
+                        PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
+    }
 
     NVME_CAP_SET_CMBS(cap, 1);
     stq_le_p(&n->bar.cap, cap);

As for PMR, it is currently only available on the PF, as only the PF is
capable of specifying the memory-backend-file object to use with PMR.
Otherwise, either the VFs would have to share the PMR with their PF, or
a memory-backend-file object would have to be defined for each VF.
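
For context, PMR is currently wired up through a host memory backend
along these lines (a sketch; the id, file path, and size are
illustrative):

    -object memory-backend-file,id=pmr0,share=on,mem-path=pmr.img,size=16M \
    -device nvme,serial=deadbeef,pmrdev=pmr0,...

Supporting PMR on VFs would thus mean either pointing every function at
the same backend or requiring one such -object per VF.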
Lukasz Maniak Nov. 4, 2021, 2:30 p.m. UTC | #5
On Tue, Nov 02, 2021 at 06:33:31PM +0100, Lukasz Maniak wrote:
> [...]
>
> As for PMR, it is currently only available on the PF, as only the PF is
> capable of specifying the memory-backend-file object to use with PMR.
> Otherwise, either the VFs would have to share the PMR with their PF, or
> a memory-backend-file object would have to be defined for each VF.

Hi Klaus,

After some discussion, we decided to prohibit the use of CMB and PMR in
combination with SR-IOV in V2.

While the implementation of CMB with SR-IOV is relatively
straightforward, PMR is not. Since we want the CMB and PMR designs to be
consistent with respect to SR-IOV, we considered it best to disable both
features for now and implement them in separate patches.

Kind regards,
Lukasz
Klaus Jensen Nov. 8, 2021, 7:56 a.m. UTC | #6
On Nov  4 15:30, Lukasz Maniak wrote:
> [...]
>
> Hi Klaus,
> 
> After some discussion, we decided to prohibit the use of CMB and PMR in
> combination with SR-IOV in V2.
> 
> While the implementation of CMB with SR-IOV is relatively
> straightforward, PMR is not. Since we want the CMB and PMR designs to be
> consistent with respect to SR-IOV, we considered it best to disable both
> features for now and implement them in separate patches.
> 

I am completely fine with that. However, since we are copying the
parameters verbatim, it would be nice if `info qtree` reflected this
difference (that parameters such as cmb_size_mb are 0 for the virtual
controllers).
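
For example, something like this for a virtual controller (a sketch;
the `info qtree` output below is abbreviated and illustrative):

    (qemu) info qtree
    ...
      dev: nvme, id ""
        ...
        cmb_size_mb = 0 (0x0)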
Lukasz Maniak Nov. 10, 2021, 1:42 p.m. UTC | #7
On Mon, Nov 08, 2021 at 08:56:43AM +0100, Klaus Jensen wrote:
> [...]
>
> I am completely fine with that. However, since we are copying the
> parameters verbatim, it would be nice if `info qtree` reflected this
> difference (that parameters such as cmb_size_mb are 0 for the virtual
> controllers).
> 

Hi Klaus,

Literal copying will still be correct, and there will be no difference
between the PF and VF parameters, since by "prohibit" we mean disabling
the interaction between SR-IOV and CMB/PMR on the PF as well:

if (params->sriov_max_vfs) {
    if (!n->subsys) {
        error_setg(errp, "subsystem is required for the use of SR-IOV");
        return;
    }

    if (params->sriov_max_vfs > NVME_MAX_VFS) {
        error_setg(errp, "sriov_max_vfs must be between 0 and %d",
                   NVME_MAX_VFS);
        return;
    }

    if (params->cmb_size_mb) {
        error_setg(errp, "CMB is not supported with SR-IOV");
        return;
    }

    if (n->pmr.dev) {
        error_setg(errp, "PMR is not supported with SR-IOV");
        return;
    }

Regards,
Lukasz
Klaus Jensen Nov. 10, 2021, 4:39 p.m. UTC | #8
On Nov 10 14:42, Lukasz Maniak wrote:
> [...]
>
> Hi Klaus,
> 
> Literal copying will still be correct, and there will be no difference
> between the PF and VF parameters, since by "prohibit" we mean disabling
> the interaction between SR-IOV and CMB/PMR on the PF as well:
> 
> [...]
> 

Right. Understood.

Patch

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 6a571d18cf..ad79ff0c00 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -35,6 +35,7 @@ 
  *              mdts=<N[optional]>,vsl=<N[optional]>, \
  *              zoned.zasl=<N[optional]>, \
  *              zoned.auto_transition=<on|off[optional]>, \
+ *              sriov_max_vfs=<N[optional]> \
  *              subsys=<subsys_id>
  *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
  *              zoned=<true|false[optional]>, \
@@ -106,6 +107,12 @@ 
  *   transitioned to zone state closed for resource management purposes.
  *   Defaults to 'on'.
  *
+ * - `sriov_max_vfs`
+ *   Indicates the maximum number of PCIe virtual functions supported
+ *   by the controller. The default value is 0. Specifying a non-zero value
+ *   enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
+ *   Virtual function controllers will not report SR-IOV capability.
+ *
  * nvme namespace device parameters
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  * - `shared`
@@ -160,6 +167,7 @@ 
 #include "sysemu/block-backend.h"
 #include "sysemu/hostmem.h"
 #include "hw/pci/msix.h"
+#include "hw/pci/pcie_sriov.h"
 #include "migration/vmstate.h"
 
 #include "nvme.h"
@@ -175,6 +183,9 @@ 
 #define NVME_TEMPERATURE_CRITICAL 0x175
 #define NVME_NUM_FW_SLOTS 1
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+#define NVME_MAX_VFS 127
+#define NVME_VF_OFFSET 0x1
+#define NVME_VF_STRIDE 1
 
 #define NVME_GUEST_ERR(trace, fmt, ...) \
     do { \
@@ -5583,6 +5594,10 @@  static void nvme_ctrl_reset(NvmeCtrl *n)
         g_free(event);
     }
 
+    if (!pci_is_vf(&n->parent_obj) && n->params.sriov_max_vfs) {
+        pcie_sriov_pf_disable_vfs(&n->parent_obj);
+    }
+
     n->aer_queued = 0;
     n->outstanding_aers = 0;
     n->qs_created = false;
@@ -6264,6 +6279,19 @@  static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
         error_setg(errp, "vsl must be non-zero");
         return;
     }
+
+    if (params->sriov_max_vfs) {
+        if (!n->subsys) {
+            error_setg(errp, "subsystem is required for the use of SR-IOV");
+            return;
+        }
+
+        if (params->sriov_max_vfs > NVME_MAX_VFS) {
+            error_setg(errp, "sriov_max_vfs must be between 0 and %d",
+                       NVME_MAX_VFS);
+            return;
+        }
+    }
 }
 
 static void nvme_init_state(NvmeCtrl *n)
@@ -6321,6 +6349,20 @@  static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
     memory_region_set_enabled(&n->pmr.dev->mr, false);
 }
 
+static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
+                            uint64_t bar_size)
+{
+    uint16_t vf_dev_id = n->params.use_intel_id ?
+                         PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
+
+    pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
+                       n->params.sriov_max_vfs, n->params.sriov_max_vfs,
+                       NVME_VF_OFFSET, NVME_VF_STRIDE, NULL);
+
+    pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
+                              PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
+}
+
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
     uint8_t *pci_conf = pci_dev->config;
@@ -6335,7 +6377,7 @@  static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 
     if (n->params.use_intel_id) {
         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
-        pci_config_set_device_id(pci_conf, 0x5845);
+        pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
     } else {
         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
         pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
@@ -6343,6 +6385,9 @@  static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 
     pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
     pcie_endpoint_cap_init(pci_dev, 0x80);
+    if (n->params.sriov_max_vfs) {
+        pcie_ari_init(pci_dev, 0x100, 1);
+    }
 
     bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
     msix_table_offset = bar_size;
@@ -6361,8 +6406,12 @@  static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
                           n->reg_size);
     memory_region_add_subregion(&n->bar0, 0, &n->iomem);
 
-    pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
-                     PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
+    if (pci_is_vf(pci_dev)) {
+        pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
+    } else {
+        pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
+                         PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
+    }
     ret = msix_init(pci_dev, n->params.msix_qsize,
                     &n->bar0, 0, msix_table_offset,
                     &n->bar0, 0, msix_pba_offset, 0, &err);
@@ -6383,6 +6432,10 @@  static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
         nvme_init_pmr(n, pci_dev);
     }
 
+    if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
+        nvme_init_sriov(n, pci_dev, 0x120, bar_size);
+    }
+
     return 0;
 }
 
@@ -6532,6 +6585,15 @@  static void nvme_realize(PCIDevice *pci_dev, Error **errp)
     NvmeCtrl *n = NVME(pci_dev);
     NvmeNamespace *ns;
     Error *local_err = NULL;
+    NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
+
+    if (pci_is_vf(pci_dev)) {
+        /* VFs derive settings from the parent. PF's lifespan exceeds
+         * that of VF's, so it's safe to share params.serial.
+         */
+        memcpy(&n->params, &pn->params, sizeof(NvmeParams));
+        n->subsys = pn->subsys;
+    }
 
     nvme_check_constraints(n, &local_err);
     if (local_err) {
@@ -6596,6 +6658,11 @@  static void nvme_exit(PCIDevice *pci_dev)
     if (n->pmr.dev) {
         host_memory_backend_set_mapped(n->pmr.dev, false);
     }
+
+    if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
+        pcie_sriov_pf_exit(pci_dev);
+    }
+
     msix_uninit(pci_dev, &n->bar0, &n->bar0);
     memory_region_del_subregion(&n->bar0, &n->iomem);
 }
@@ -6620,6 +6687,7 @@  static Property nvme_props[] = {
     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                      params.auto_transition_zones, true),
+    DEFINE_PROP_UINT8("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 83ffabade4..4331f5da1f 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -391,6 +391,7 @@  typedef struct NvmeParams {
     uint8_t  zasl;
     bool     auto_transition_zones;
     bool     legacy_cmb;
+    uint8_t  sriov_max_vfs;
 } NvmeParams;
 
 typedef struct NvmeCtrl {
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
index 11abe22d46..992426768e 100644
--- a/include/hw/pci/pci_ids.h
+++ b/include/hw/pci/pci_ids.h
@@ -237,6 +237,7 @@ 
 #define PCI_DEVICE_ID_INTEL_82801BA_11   0x244e
 #define PCI_DEVICE_ID_INTEL_82801D       0x24CD
 #define PCI_DEVICE_ID_INTEL_ESB_9        0x25ab
+#define PCI_DEVICE_ID_INTEL_NVME         0x5845
 #define PCI_DEVICE_ID_INTEL_82371SB_0    0x7000
 #define PCI_DEVICE_ID_INTEL_82371SB_1    0x7010
 #define PCI_DEVICE_ID_INTEL_82371SB_2    0x7020