diff mbox series

[v6,14/42] nvme: add missing mandatory features

Message ID 20200316142928.153431-15-its@irrelevant.dk
State New
Headers show
Series nvme: support NVMe v1.3d, SGLs and multiple namespaces | expand

Commit Message

Klaus Jensen March 16, 2020, 2:29 p.m. UTC
From: Klaus Jensen <k.jensen@samsung.com>

Add support for returning a reasonable response to Get/Set Features of
mandatory features.

Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
Acked-by: Keith Busch <kbusch@kernel.org>
---
 hw/block/nvme.c       | 60 ++++++++++++++++++++++++++++++++++++++++++-
 hw/block/trace-events |  2 ++
 include/block/nvme.h  |  6 ++++-
 3 files changed, 66 insertions(+), 2 deletions(-)

Comments

Maxim Levitsky March 25, 2020, 10:41 a.m. UTC | #1
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
> 
> Add support for returning a resonable response to Get/Set Features of
> mandatory features.
> 
> Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> Acked-by: Keith Busch <kbusch@kernel.org>
> ---
>  hw/block/nvme.c       | 60 ++++++++++++++++++++++++++++++++++++++++++-
>  hw/block/trace-events |  2 ++
>  include/block/nvme.h  |  6 ++++-
>  3 files changed, 66 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index ff8975cd6667..eb9c722df968 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1025,7 +1025,15 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>      uint32_t dw11 = le32_to_cpu(cmd->cdw11);
>      uint32_t result;
>  
> +    trace_nvme_dev_getfeat(nvme_cid(req), dw10);
> +
>      switch (dw10) {
> +    case NVME_ARBITRATION:
> +        result = cpu_to_le32(n->features.arbitration);
> +        break;
> +    case NVME_POWER_MANAGEMENT:
> +        result = cpu_to_le32(n->features.power_mgmt);
> +        break;
>      case NVME_TEMPERATURE_THRESHOLD:
>          result = 0;
>  
> @@ -1046,9 +1054,12 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>              break;
>          }
>  
> +        break;
> +    case NVME_ERROR_RECOVERY:
> +        result = cpu_to_le32(n->features.err_rec);
>          break;
>      case NVME_VOLATILE_WRITE_CACHE:
> -        result = blk_enable_write_cache(n->conf.blk);
> +        result = cpu_to_le32(blk_enable_write_cache(n->conf.blk));
>          trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
>          break;
>      case NVME_NUMBER_OF_QUEUES:
> @@ -1058,6 +1069,19 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>          break;
>      case NVME_TIMESTAMP:
>          return nvme_get_feature_timestamp(n, cmd);
> +    case NVME_INTERRUPT_COALESCING:
> +        result = cpu_to_le32(n->features.int_coalescing);
> +        break;
> +    case NVME_INTERRUPT_VECTOR_CONF:
> +        if ((dw11 & 0xffff) > n->params.max_ioqpairs + 1) {
> +            return NVME_INVALID_FIELD | NVME_DNR;
> +        }
I still think that this should be >= since interrupt vectors are numbered from zero.
So if we have, for example, 3 IO queues, then we have 4 queues in total,
which translates to irq numbers 0..3.

BTW the user of the device doesn't have to have a 1:1 mapping between qid and msi interrupt index;
in fact, when MSI is not used, all the queues will map to the same vector, which will be interrupt 0
from the point of view of the device IMHO.
So it kind of makes sense IMHO to have num_irqs or something, even if it is technically equal to the number of queues.


> +
> +        result = cpu_to_le32(n->features.int_vector_config[dw11 & 0xffff]);
> +        break;
> +    case NVME_WRITE_ATOMICITY:
> +        result = cpu_to_le32(n->features.write_atomicity);
> +        break;
>      case NVME_ASYNCHRONOUS_EVENT_CONF:
>          result = cpu_to_le32(n->features.async_config);
>          break;
> @@ -1093,6 +1117,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>      uint32_t dw10 = le32_to_cpu(cmd->cdw10);
>      uint32_t dw11 = le32_to_cpu(cmd->cdw11);
>  
> +    trace_nvme_dev_setfeat(nvme_cid(req), dw10, dw11);
> +
>      switch (dw10) {
>      case NVME_TEMPERATURE_THRESHOLD:
>          if (NVME_TEMP_TMPSEL(dw11)) {
> @@ -1120,6 +1146,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>  
>          break;
>      case NVME_VOLATILE_WRITE_CACHE:
> +        if (blk_enable_write_cache(n->conf.blk)) {
> +            blk_flush(n->conf.blk);
> +        }

(not your fault) but the blk_enable_write_cache function name is highly misleading,
since it doesn't enable anything but just returns the flag indicating whether the write cache is enabled.
It really should be called blk_get_enable_write_cache.

> +
>          blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
>          break;
>      case NVME_NUMBER_OF_QUEUES:
> @@ -1135,6 +1165,13 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>      case NVME_ASYNCHRONOUS_EVENT_CONF:
>          n->features.async_config = dw11;
>          break;
> +    case NVME_ARBITRATION:
> +    case NVME_POWER_MANAGEMENT:
> +    case NVME_ERROR_RECOVERY:
> +    case NVME_INTERRUPT_COALESCING:
> +    case NVME_INTERRUPT_VECTOR_CONF:
> +    case NVME_WRITE_ATOMICITY:
> +        return NVME_FEAT_NOT_CHANGABLE | NVME_DNR;
>      default:
>          trace_nvme_dev_err_invalid_setfeat(dw10);
>          return NVME_INVALID_FIELD | NVME_DNR;
> @@ -1716,6 +1753,25 @@ static void nvme_init_state(NvmeCtrl *n)
>      n->temperature = NVME_TEMPERATURE;
>      n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
>      n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> +
> +    /*
> +     * There is no limit on the number of commands that the controller may
> +     * launch at one time from a particular Submission Queue.
> +     */
> +    n->features.arbitration = NVME_ARB_AB_NOLIMIT;
> +
> +    n->features.int_vector_config = g_malloc0_n(n->params.max_ioqpairs + 1,
> +        sizeof(*n->features.int_vector_config));
> +
> +    for (int i = 0; i < n->params.max_ioqpairs + 1; i++) {
> +        n->features.int_vector_config[i] = i;
> +
> +        /* interrupt coalescing is not supported for the admin queue */
> +        if (i == 0) {
> +            n->features.int_vector_config[i] |= NVME_INTVC_NOCOALESCING;
> +        }
> +    }
> +
>      n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
>  }
>  
> @@ -1804,6 +1860,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
>      id->cqes = (0x4 << 4) | 0x4;
>      id->nn = cpu_to_le32(n->num_namespaces);
>      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
> +
Unrelated whitespace change
>      id->psd[0].mp = cpu_to_le16(0x9c4);
>      id->psd[0].enlat = cpu_to_le32(0x10);
>      id->psd[0].exlat = cpu_to_le32(0x4);
> @@ -1879,6 +1936,7 @@ static void nvme_exit(PCIDevice *pci_dev)
>      g_free(n->cq);
>      g_free(n->sq);
>      g_free(n->aer_reqs);
> +    g_free(n->features.int_vector_config);
>  
>      if (n->params.cmb_size_mb) {
>          g_free(n->cmbuf);
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 3952c36774cf..4cf39961989d 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -41,6 +41,8 @@ nvme_dev_del_cq(uint16_t cqid) "deleted completion queue, sqid=%"PRIu16""
>  nvme_dev_identify_ctrl(void) "identify controller"
>  nvme_dev_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
>  nvme_dev_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
> +nvme_dev_getfeat(uint16_t cid, uint32_t fid) "cid %"PRIu16" fid 0x%"PRIx32""
> +nvme_dev_setfeat(uint16_t cid, uint32_t fid, uint32_t val) "cid %"PRIu16" fid 0x%"PRIx32" val 0x%"PRIx32""
>  nvme_dev_getfeat_vwcache(const char* result) "get feature volatile write cache, result=%s"
>  nvme_dev_getfeat_numq(int result) "get feature number of queues, result=%d"
>  nvme_dev_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index f2a8b07c0f2f..ecc02fbe8bb8 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -490,7 +490,8 @@ enum NvmeStatusCodes {
>      NVME_FW_REQ_RESET           = 0x010b,
>      NVME_INVALID_QUEUE_DEL      = 0x010c,
>      NVME_FID_NOT_SAVEABLE       = 0x010d,
> -    NVME_FID_NOT_NSID_SPEC      = 0x010f,
> +    NVME_FEAT_NOT_CHANGABLE     = 0x010e,
> +    NVME_FEAT_NOT_NS_SPEC       = 0x010f,
>      NVME_FW_REQ_SUSYSTEM_RESET  = 0x0110,
>      NVME_CONFLICTING_ATTRS      = 0x0180,
>      NVME_INVALID_PROT_INFO      = 0x0181,
> @@ -706,6 +707,7 @@ typedef struct NvmeFeatureVal {
>  } NvmeFeatureVal;
>  
>  #define NVME_ARB_AB(arb)    (arb & 0x7)
> +#define NVME_ARB_AB_NOLIMIT 0x7
>  #define NVME_ARB_LPW(arb)   ((arb >> 8) & 0xff)
>  #define NVME_ARB_MPW(arb)   ((arb >> 16) & 0xff)
>  #define NVME_ARB_HPW(arb)   ((arb >> 24) & 0xff)
> @@ -713,6 +715,8 @@ typedef struct NvmeFeatureVal {
>  #define NVME_INTC_THR(intc)     (intc & 0xff)
>  #define NVME_INTC_TIME(intc)    ((intc >> 8) & 0xff)
>  
> +#define NVME_INTVC_NOCOALESCING (0x1 << 16)
> +
>  #define NVME_TEMP_THSEL(temp)  ((temp >> 20) & 0x3)
>  #define NVME_TEMP_TMPSEL(temp) ((temp >> 16) & 0xf)
>  #define NVME_TEMP_TMPTH(temp)  ((temp >>  0) & 0xffff)


Best regards,
	Maxim Levitsky
Klaus Jensen March 31, 2020, 5:41 a.m. UTC | #2
On Mar 25 12:41, Maxim Levitsky wrote:
> On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > From: Klaus Jensen <k.jensen@samsung.com>
> > 
> > Add support for returning a resonable response to Get/Set Features of
> > mandatory features.
> > 
> > Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> > Acked-by: Keith Busch <kbusch@kernel.org>
> > ---
> >  hw/block/nvme.c       | 60 ++++++++++++++++++++++++++++++++++++++++++-
> >  hw/block/trace-events |  2 ++
> >  include/block/nvme.h  |  6 ++++-
> >  3 files changed, 66 insertions(+), 2 deletions(-)
> > 
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index ff8975cd6667..eb9c722df968 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -1058,6 +1069,19 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >          break;
> >      case NVME_TIMESTAMP:
> >          return nvme_get_feature_timestamp(n, cmd);
> > +    case NVME_INTERRUPT_COALESCING:
> > +        result = cpu_to_le32(n->features.int_coalescing);
> > +        break;
> > +    case NVME_INTERRUPT_VECTOR_CONF:
> > +        if ((dw11 & 0xffff) > n->params.max_ioqpairs + 1) {
> > +            return NVME_INVALID_FIELD | NVME_DNR;
> > +        }
> I still think that this should be >= since the interrupt vector is not zero based.
> So if we have for example 3 IO queues, then we have 4 queues in total
> which translates to irq numbers 0..3.
> 

Yes you are right. The device will support max_ioqpairs + 1 IVs, so
trying to access that would actually go 1 beyond the array.

Fixed.

> BTW the user of the device doesn't have to have 1:1 mapping between qid and msi interrupt index,
> in fact when MSI is not used, all the queues will map to the same vector, which will be interrupt 0
> from point of view of the device IMHO.
> So it kind of makes sense IMHO to have num_irqs or something, even if it technically equals to number of queues.
> 

Yeah, but the device will still *support* the N IVs, so they can still
be configured even though they will not be used. So I don't think we
need to introduce an additional parameter?

> > @@ -1120,6 +1146,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >  
> >          break;
> >      case NVME_VOLATILE_WRITE_CACHE:
> > +        if (blk_enable_write_cache(n->conf.blk)) {
> > +            blk_flush(n->conf.blk);
> > +        }
> 
> (not your fault) but the blk_enable_write_cache function name is highly misleading,
> since it doesn't enable anything but just gets the flag if the write cache is enabled.
> It really should be called blk_get_enable_write_cache.
> 

Agreed :)

> > @@ -1804,6 +1860,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
> >      id->cqes = (0x4 << 4) | 0x4;
> >      id->nn = cpu_to_le32(n->num_namespaces);
> >      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
> > +
> Unrelated whitespace change

Fixed.

> 
> Best regards,
> 	Maxim Levitsky
> 
> 
> 
>
Maxim Levitsky March 31, 2020, 9:39 a.m. UTC | #3
On Tue, 2020-03-31 at 07:41 +0200, Klaus Birkelund Jensen wrote:
> On Mar 25 12:41, Maxim Levitsky wrote:
> > On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > > From: Klaus Jensen <k.jensen@samsung.com>
> > > 
> > > Add support for returning a resonable response to Get/Set Features of
> > > mandatory features.
> > > 
> > > Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> > > Acked-by: Keith Busch <kbusch@kernel.org>
> > > ---
> > >  hw/block/nvme.c       | 60 ++++++++++++++++++++++++++++++++++++++++++-
> > >  hw/block/trace-events |  2 ++
> > >  include/block/nvme.h  |  6 ++++-
> > >  3 files changed, 66 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index ff8975cd6667..eb9c722df968 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -1058,6 +1069,19 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > >          break;
> > >      case NVME_TIMESTAMP:
> > >          return nvme_get_feature_timestamp(n, cmd);
> > > +    case NVME_INTERRUPT_COALESCING:
> > > +        result = cpu_to_le32(n->features.int_coalescing);
> > > +        break;
> > > +    case NVME_INTERRUPT_VECTOR_CONF:
> > > +        if ((dw11 & 0xffff) > n->params.max_ioqpairs + 1) {
> > > +            return NVME_INVALID_FIELD | NVME_DNR;
> > > +        }
> > 
> > I still think that this should be >= since the interrupt vector is not zero based.
> > So if we have for example 3 IO queues, then we have 4 queues in total
> > which translates to irq numbers 0..3.
> > 
> 
> Yes you are right. The device will support max_ioqpairs + 1 IVs, so
> trying to access that would actually go 1 beyond the array.
> 
> Fixed.
> 
> > BTW the user of the device doesn't have to have 1:1 mapping between qid and msi interrupt index,
> > in fact when MSI is not used, all the queues will map to the same vector, which will be interrupt 0
> > from point of view of the device IMHO.
> > So it kind of makes sense IMHO to have num_irqs or something, even if it technically equals to number of queues.
> > 
> 
> Yeah, but the device will still *support* the N IVs, so they can still
> be configured even though they will not be used. So I don't think we
> need to introduce an additional parameter?

Yes and no.
I wasn't thinking of adding a new parameter for the number of supported interrupt vectors,
but just of having an internal variable to represent it, so that we could in the future support
the case where these are not equal.

Also, from the point of view of validating the users of this virtual nvme drive, I think it kind
of makes sense to allow having fewer supported IRQ vectors than IO queues, so as to check
how userspace copes with it. It is valid, after all, to have the same interrupt vector shared between
multiple queues.

In fact, in theory (but that would complicate the implementation greatly) we should even support
the case when the number of submission queues is not equal to the number of completion queues. Yes, nobody does that in real hardware,
and at least the Linux nvme driver hard assumes a 1:1 SQ/CQ mapping, but still.

My nvme-mdev doesn't make this assumption (nor any assumptions about interrupt vector counts)
and allows the user to have any SQ/CQ mapping as far as the spec allows
(but it does hardcode the maximum number of SQs/CQs supported).

BTW, I haven't looked at that, but we should check that the virtual nvme drive can cope with using legacy
interrupts (that is, MSI disabled) - nvme-mdev does support this and was tested with it.


> 
> > > @@ -1120,6 +1146,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > >  
> > >          break;
> > >      case NVME_VOLATILE_WRITE_CACHE:
> > > +        if (blk_enable_write_cache(n->conf.blk)) {
> > > +            blk_flush(n->conf.blk);
> > > +        }
> > 
> > (not your fault) but the blk_enable_write_cache function name is highly misleading,
> > since it doesn't enable anything but just gets the flag if the write cache is enabled.
> > It really should be called blk_get_enable_write_cache.
> > 
> 
> Agreed :)
> 
> > > @@ -1804,6 +1860,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
> > >      id->cqes = (0x4 << 4) | 0x4;
> > >      id->nn = cpu_to_le32(n->num_namespaces);
> > >      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
> > > +
> > 
> > Unrelated whitespace change
> 
> Fixed.
> 
> > 
> > Best regards,
> > 	Maxim Levitsky
> > 
> > 
> > 
> > 

Best regards,
	Maxim Levitsky

> 
>
Klaus Jensen April 8, 2020, 11:28 a.m. UTC | #4
On Mar 31 12:39, Maxim Levitsky wrote:
> On Tue, 2020-03-31 at 07:41 +0200, Klaus Birkelund Jensen wrote:
> > On Mar 25 12:41, Maxim Levitsky wrote:
> > > BTW the user of the device doesn't have to have 1:1 mapping between qid and msi interrupt index,
> > > in fact when MSI is not used, all the queues will map to the same vector, which will be interrupt 0
> > > from point of view of the device IMHO.
> > > So it kind of makes sense IMHO to have num_irqs or something, even if it technically equals to number of queues.
> > > 
> > 
> > Yeah, but the device will still *support* the N IVs, so they can still
> > be configured even though they will not be used. So I don't think we
> > need to introduce an additional parameter?
> 
> Yes and no.
> I wasn't thinking to add a new parameter for number of supporter interrupt vectors,
> but just to have an internal variable to represent it so that we could support in future
> case where these are not equal.
> 
> Also from point of view of validating the users of this virtual nvme drive, I think it kind
> of makes sense to allow having less supported IRQ vectors than IO queues, so to check
> how userspace copes with it. It is valid after all to have same interrupt vector shared between
> multiple queues.
> 

I see that this could be useful for testing, but I think we can defer
that to a later patch. Would you be okay with that for now?

> In fact in theory (but that would complicate the implementation greatly) we should even support
> case when number of submission queues is not equal to number of completion queues. Yes nobody does in real hardware,
> and at least Linux nvme driver hard assumes 1:1 SQ/CQ mapping but still.
> 

It is not the hardware that decides this, and I believe that there
definitely are applications that choose to associate multiple SQs with
a single CQ. The CQ is an attribute of the SQ, and the IV of the CQ is
also specified in the create command. I believe this is already
supported.

> My nvme-mdev doesn't make this assumpiton (and neither any assumptions on interrupt vector counts) 
> and allows the user to have any SQ/CQ mapping as far as the spec allows
> (but it does hardcode maximum number of SQ/CQ supported)
> 
> BTW, I haven't looked at that but we should check that the virtual nvme drive can cope with using legacy
> interrupt (that is MSI disabled) - nvme-mdev does support this and was tested with it.
> 

Yes, this is definitely not very well tested.

If you insist on all of the above being implemented, then I will do it,
but I would rather defer this to later patches as this series is already
pretty large ;)
diff mbox series

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index ff8975cd6667..eb9c722df968 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1025,7 +1025,15 @@  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
     uint32_t result;
 
+    trace_nvme_dev_getfeat(nvme_cid(req), dw10);
+
     switch (dw10) {
+    case NVME_ARBITRATION:
+        result = cpu_to_le32(n->features.arbitration);
+        break;
+    case NVME_POWER_MANAGEMENT:
+        result = cpu_to_le32(n->features.power_mgmt);
+        break;
     case NVME_TEMPERATURE_THRESHOLD:
         result = 0;
 
@@ -1046,9 +1054,12 @@  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
             break;
         }
 
+        break;
+    case NVME_ERROR_RECOVERY:
+        result = cpu_to_le32(n->features.err_rec);
         break;
     case NVME_VOLATILE_WRITE_CACHE:
-        result = blk_enable_write_cache(n->conf.blk);
+        result = cpu_to_le32(blk_enable_write_cache(n->conf.blk));
         trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
         break;
     case NVME_NUMBER_OF_QUEUES:
@@ -1058,6 +1069,19 @@  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         break;
     case NVME_TIMESTAMP:
         return nvme_get_feature_timestamp(n, cmd);
+    case NVME_INTERRUPT_COALESCING:
+        result = cpu_to_le32(n->features.int_coalescing);
+        break;
+    case NVME_INTERRUPT_VECTOR_CONF:
+        if ((dw11 & 0xffff) > n->params.max_ioqpairs + 1) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        result = cpu_to_le32(n->features.int_vector_config[dw11 & 0xffff]);
+        break;
+    case NVME_WRITE_ATOMICITY:
+        result = cpu_to_le32(n->features.write_atomicity);
+        break;
     case NVME_ASYNCHRONOUS_EVENT_CONF:
         result = cpu_to_le32(n->features.async_config);
         break;
@@ -1093,6 +1117,8 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
 
+    trace_nvme_dev_setfeat(nvme_cid(req), dw10, dw11);
+
     switch (dw10) {
     case NVME_TEMPERATURE_THRESHOLD:
         if (NVME_TEMP_TMPSEL(dw11)) {
@@ -1120,6 +1146,10 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 
         break;
     case NVME_VOLATILE_WRITE_CACHE:
+        if (blk_enable_write_cache(n->conf.blk)) {
+            blk_flush(n->conf.blk);
+        }
+
         blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
         break;
     case NVME_NUMBER_OF_QUEUES:
@@ -1135,6 +1165,13 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     case NVME_ASYNCHRONOUS_EVENT_CONF:
         n->features.async_config = dw11;
         break;
+    case NVME_ARBITRATION:
+    case NVME_POWER_MANAGEMENT:
+    case NVME_ERROR_RECOVERY:
+    case NVME_INTERRUPT_COALESCING:
+    case NVME_INTERRUPT_VECTOR_CONF:
+    case NVME_WRITE_ATOMICITY:
+        return NVME_FEAT_NOT_CHANGABLE | NVME_DNR;
     default:
         trace_nvme_dev_err_invalid_setfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -1716,6 +1753,25 @@  static void nvme_init_state(NvmeCtrl *n)
     n->temperature = NVME_TEMPERATURE;
     n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
+
+    /*
+     * There is no limit on the number of commands that the controller may
+     * launch at one time from a particular Submission Queue.
+     */
+    n->features.arbitration = NVME_ARB_AB_NOLIMIT;
+
+    n->features.int_vector_config = g_malloc0_n(n->params.max_ioqpairs + 1,
+        sizeof(*n->features.int_vector_config));
+
+    for (int i = 0; i < n->params.max_ioqpairs + 1; i++) {
+        n->features.int_vector_config[i] = i;
+
+        /* interrupt coalescing is not supported for the admin queue */
+        if (i == 0) {
+            n->features.int_vector_config[i] |= NVME_INTVC_NOCOALESCING;
+        }
+    }
+
     n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 }
 
@@ -1804,6 +1860,7 @@  static void nvme_init_ctrl(NvmeCtrl *n)
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
+
     id->psd[0].mp = cpu_to_le16(0x9c4);
     id->psd[0].enlat = cpu_to_le32(0x10);
     id->psd[0].exlat = cpu_to_le32(0x4);
@@ -1879,6 +1936,7 @@  static void nvme_exit(PCIDevice *pci_dev)
     g_free(n->cq);
     g_free(n->sq);
     g_free(n->aer_reqs);
+    g_free(n->features.int_vector_config);
 
     if (n->params.cmb_size_mb) {
         g_free(n->cmbuf);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 3952c36774cf..4cf39961989d 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -41,6 +41,8 @@  nvme_dev_del_cq(uint16_t cqid) "deleted completion queue, sqid=%"PRIu16""
 nvme_dev_identify_ctrl(void) "identify controller"
 nvme_dev_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
 nvme_dev_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
+nvme_dev_getfeat(uint16_t cid, uint32_t fid) "cid %"PRIu16" fid 0x%"PRIx32""
+nvme_dev_setfeat(uint16_t cid, uint32_t fid, uint32_t val) "cid %"PRIu16" fid 0x%"PRIx32" val 0x%"PRIx32""
 nvme_dev_getfeat_vwcache(const char* result) "get feature volatile write cache, result=%s"
 nvme_dev_getfeat_numq(int result) "get feature number of queues, result=%d"
 nvme_dev_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
diff --git a/include/block/nvme.h b/include/block/nvme.h
index f2a8b07c0f2f..ecc02fbe8bb8 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -490,7 +490,8 @@  enum NvmeStatusCodes {
     NVME_FW_REQ_RESET           = 0x010b,
     NVME_INVALID_QUEUE_DEL      = 0x010c,
     NVME_FID_NOT_SAVEABLE       = 0x010d,
-    NVME_FID_NOT_NSID_SPEC      = 0x010f,
+    NVME_FEAT_NOT_CHANGABLE     = 0x010e,
+    NVME_FEAT_NOT_NS_SPEC       = 0x010f,
     NVME_FW_REQ_SUSYSTEM_RESET  = 0x0110,
     NVME_CONFLICTING_ATTRS      = 0x0180,
     NVME_INVALID_PROT_INFO      = 0x0181,
@@ -706,6 +707,7 @@  typedef struct NvmeFeatureVal {
 } NvmeFeatureVal;
 
 #define NVME_ARB_AB(arb)    (arb & 0x7)
+#define NVME_ARB_AB_NOLIMIT 0x7
 #define NVME_ARB_LPW(arb)   ((arb >> 8) & 0xff)
 #define NVME_ARB_MPW(arb)   ((arb >> 16) & 0xff)
 #define NVME_ARB_HPW(arb)   ((arb >> 24) & 0xff)
@@ -713,6 +715,8 @@  typedef struct NvmeFeatureVal {
 #define NVME_INTC_THR(intc)     (intc & 0xff)
 #define NVME_INTC_TIME(intc)    ((intc >> 8) & 0xff)
 
+#define NVME_INTVC_NOCOALESCING (0x1 << 16)
+
 #define NVME_TEMP_THSEL(temp)  ((temp >> 20) & 0x3)
 #define NVME_TEMP_TMPSEL(temp) ((temp >> 16) & 0xf)
 #define NVME_TEMP_TMPTH(temp)  ((temp >>  0) & 0xffff)