| Message ID | 20180305194906.GA3630@gmail.com |
| --- | --- |
| State | New |
| Series | hw/block/nvme: Add doorbell buffer config support |
Ping... Keith, can you review this?

Thanks,

Paolo

On 05/03/2018 20:49, Huaicheng Li wrote:
> This patch adds Doorbell Buffer Config support (NVMe 1.3) to QEMU NVMe,
> based on Mihai Rusu / Lin Ming's Google vendor extension patch [1]. The
> basic idea of this optimization is to use a shared buffer between the
> guest OS and QEMU to reduce the number of MMIO operations (doorbell
> writes). This patch ports the original code to work under current QEMU
> and makes it also work with SPDK.
>
> Unlike the Linux kernel NVMe driver, which builds the shadow buffer first
> and then creates the SQ/CQ, SPDK first creates the SQ/CQ and then issues
> this command to create the shadow buffer. Thus, in this implementation, we
> also associate the shadow buffer entries with each SQ/CQ during queue
> initialization.
>
> [1] http://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg04127.html
>
> Performance results using a **ramdisk**-backed virtual NVMe device in a
> guest running Linux 4.14 are as below:
>
> Note: "QEMU" represents stock QEMU and "+dbbuf" is QEMU with this patch.
> For psync, QD represents the number of threads being used.
>
> IOPS (Linux kernel NVMe driver)
>
>          psync           libaio
> QD   QEMU  +dbbuf    QEMU  +dbbuf
> 1     47k     50k     45k     47k
> 4     86k    107k     59k    143k
> 16    95k    198k     58k    185k
> 64    97k    259k     59k    216k
>
> IOPS (SPDK)
>
> QD   QEMU  +dbbuf
> 1     62k     71k
> 4     61k    191k
> 16    60k    319k
> 64    62k    364k
>
> We can see that this patch greatly increases IOPS (and lowers latency,
> not shown): 2.7x for psync, 3.7x for libaio, and 5.9x for SPDK.
>
> ==Setup==:
>
> (1) VM script:
>
> x86_64-softmmu/qemu-system-x86_64 \
>     -name "nvme-FEMU-test" \
>     -enable-kvm \
>     -cpu host \
>     -smp 4 \
>     -m 8G \
>     -drive file=$IMGDIR/u14s.qcow2,if=ide,aio=native,cache=none,format=qcow2,id=hd0 \
>     -drive file=/mnt/tmpfs/test1.raw,if=none,aio=threads,format=raw,id=id0 \
>     -device nvme,drive=id0,serial=serial0,id=nvme0 \
>     -net user,hostfwd=tcp::8080-:22 \
>     -net nic,model=virtio \
>     -nographic \
>
> (2) FIO configuration:
>
> [global]
> ioengine=libaio
> filename=/dev/nvme0n1
> thread=1
> group_reporting=1
> direct=1
> verify=0
> time_based=1
> ramp_time=0
> runtime=30
> ;size=1G
> iodepth=16
> rw=randread
> bs=4k
>
> [test]
> numjobs=1
>
>
> Signed-off-by: Huaicheng Li <huaicheng@cs.uchicago.edu>
> ---
>  hw/block/nvme.c      | 97 +++++++++++++++++++++++++++++++++++++++++++++++++---
>  hw/block/nvme.h      |  7 ++++
>  include/block/nvme.h |  2 ++
>  3 files changed, 102 insertions(+), 4 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 85d2406400..3882037e36 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -9,7 +9,7 @@
>   */
>  
>  /**
> - * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
> + * Reference Specs: http://www.nvmexpress.org, 1.3, 1.2, 1.1, 1.0e
>   *
>   * http://www.nvmexpress.org/resources/
>   */
> @@ -33,6 +33,7 @@
>  #include "qapi/error.h"
>  #include "qapi/visitor.h"
>  #include "sysemu/block-backend.h"
> +#include "exec/memory.h"
>  
>  #include "qemu/log.h"
>  #include "trace.h"
> @@ -244,6 +245,14 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
>      return status;
>  }
>  
> +static void nvme_update_cq_head(NvmeCQueue *cq)
> +{
> +    if (cq->db_addr) {
> +        pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
> +                     sizeof(cq->head));
> +    }
> +}
> +
>  static void nvme_post_cqes(void *opaque)
>  {
>      NvmeCQueue *cq = opaque;
> @@ -254,6 +263,8 @@ static void nvme_post_cqes(void *opaque)
>          NvmeSQueue *sq;
>          hwaddr addr;
>  
> +        nvme_update_cq_head(cq);
> +
>          if (nvme_cq_full(cq)) {
>              break;
>          }
> @@ -461,6 +472,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
>  static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>      uint16_t sqid, uint16_t cqid, uint16_t size)
>  {
> +    uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
>      int i;
>      NvmeCQueue *cq;
>  
> @@ -480,6 +492,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>      }
>      sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
>  
> +    if (sqid && n->dbbuf_dbs && n->dbbuf_eis) {
> +        sq->db_addr = n->dbbuf_dbs + 2 * sqid * stride;
> +        sq->ei_addr = n->dbbuf_eis + 2 * sqid * stride;
> +    }
> +
>      assert(n->cq[cqid]);
>      cq = n->cq[cqid];
>      QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
> @@ -559,6 +576,8 @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
>  static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>      uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled)
>  {
> +    uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
> +
>      cq->ctrl = n;
>      cq->cqid = cqid;
>      cq->size = size;
> @@ -569,11 +588,51 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>      cq->head = cq->tail = 0;
>      QTAILQ_INIT(&cq->req_list);
>      QTAILQ_INIT(&cq->sq_list);
> +    if (cqid && n->dbbuf_dbs && n->dbbuf_eis) {
> +        cq->db_addr = n->dbbuf_dbs + (2 * cqid + 1) * stride;
> +        cq->ei_addr = n->dbbuf_eis + (2 * cqid + 1) * stride;
> +    }
>      msix_vector_use(&n->parent_obj, cq->vector);
>      n->cq[cqid] = cq;
>      cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
>  }
>  
> +static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeCmd *cmd)
> +{
> +    uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
> +    uint64_t dbs_addr = le64_to_cpu(cmd->prp1);
> +    uint64_t eis_addr = le64_to_cpu(cmd->prp2);
> +    int i;
> +
> +    /* Address should not be NULL and should be page aligned */
> +    if (dbs_addr == 0 || dbs_addr & (n->page_size - 1) ||
> +        eis_addr == 0 || eis_addr & (n->page_size - 1)) {
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +    }
> +
> +    /* Save shadow buffer base addr for use during queue creation */
> +    n->dbbuf_dbs = dbs_addr;
> +    n->dbbuf_eis = eis_addr;
> +
> +    for (i = 1; i < n->num_queues; i++) {
> +        NvmeSQueue *sq = n->sq[i];
> +        NvmeCQueue *cq = n->cq[i];
> +
> +        if (sq) {
> +            /* Submission queue tail pointer location, 2 * QID * stride */
> +            sq->db_addr = dbs_addr + 2 * i * stride;
> +            sq->ei_addr = eis_addr + 2 * i * stride;
> +        }
> +
> +        if (cq) {
> +            /* Completion queue head pointer location, (2 * QID + 1) * stride */
> +            cq->db_addr = dbs_addr + (2 * i + 1) * stride;
> +            cq->ei_addr = eis_addr + (2 * i + 1) * stride;
> +        }
> +    }
> +    return NVME_SUCCESS;
> +}
> +
>  static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
>  {
>      NvmeCQueue *cq;
> @@ -753,12 +812,30 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>          return nvme_set_feature(n, cmd, req);
>      case NVME_ADM_CMD_GET_FEATURES:
>          return nvme_get_feature(n, cmd, req);
> +    case NVME_ADM_CMD_DBBUF_CONFIG:
> +        return nvme_dbbuf_config(n, cmd);
>      default:
>          trace_nvme_err_invalid_admin_opc(cmd->opcode);
>          return NVME_INVALID_OPCODE | NVME_DNR;
>      }
>  }
>  
> +static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
> +{
> +    if (sq->ei_addr) {
> +        pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
> +                      sizeof(sq->tail));
> +    }
> +}
> +
> +static void nvme_update_sq_tail(NvmeSQueue *sq)
> +{
> +    if (sq->db_addr) {
> +        pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
> +                     sizeof(sq->tail));
> +    }
> +}
> +
>  static void nvme_process_sq(void *opaque)
>  {
>      NvmeSQueue *sq = opaque;
> @@ -770,6 +847,8 @@ static void nvme_process_sq(void *opaque)
>      NvmeCmd cmd;
>      NvmeRequest *req;
>  
> +    nvme_update_sq_tail(sq);
> +
>      while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
>          addr = sq->dma_addr + sq->head * n->sqe_size;
>          nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
> @@ -787,6 +866,9 @@ static void nvme_process_sq(void *opaque)
>              req->status = status;
>              nvme_enqueue_req_completion(cq, req);
>          }
> +
> +        nvme_update_sq_eventidx(sq);
> +        nvme_update_sq_tail(sq);
>      }
>  }
>  
> @@ -1105,7 +1187,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>          }
>  
>          start_sqs = nvme_cq_full(cq) ? 1 : 0;
> -        cq->head = new_head;
> +        if (!cq->db_addr) {
> +            cq->head = new_head;
> +        }
>          if (start_sqs) {
>              NvmeSQueue *sq;
>              QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
> @@ -1142,7 +1226,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>              return;
>          }
>  
> -        sq->tail = new_tail;
> +        if (!sq->db_addr) {
> +            sq->tail = new_tail;
> +        }
>          timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>      }
>  }
> @@ -1256,7 +1342,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>      id->ieee[0] = 0x00;
>      id->ieee[1] = 0x02;
>      id->ieee[2] = 0xb3;
> -    id->oacs = cpu_to_le16(0);
> +    id->oacs = cpu_to_le16(NVME_OACS_DBBUF);
>      id->frmw = 7 << 1;
>      id->lpa = 1 << 0;
>      id->sqes = (0x6 << 4) | 0x6;
> @@ -1320,6 +1406,9 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>              cpu_to_le64(n->ns_size >>
>                  id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)].ds);
>      }
> +
> +    n->dbbuf_dbs = 0;
> +    n->dbbuf_eis = 0;
>  }
>  
>  static void nvme_exit(PCIDevice *pci_dev)
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 8f3981121d..b532dbe160 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -33,6 +33,8 @@ typedef struct NvmeSQueue {
>      QTAILQ_HEAD(sq_req_list, NvmeRequest) req_list;
>      QTAILQ_HEAD(out_req_list, NvmeRequest) out_req_list;
>      QTAILQ_ENTRY(NvmeSQueue) entry;
> +    uint64_t db_addr;
> +    uint64_t ei_addr;
>  } NvmeSQueue;
>  
>  typedef struct NvmeCQueue {
> @@ -48,6 +50,8 @@ typedef struct NvmeCQueue {
>      QEMUTimer *timer;
>      QTAILQ_HEAD(sq_list, NvmeSQueue) sq_list;
>      QTAILQ_HEAD(cq_req_list, NvmeRequest) req_list;
> +    uint64_t db_addr;
> +    uint64_t ei_addr;
>  } NvmeCQueue;
>  
>  typedef struct NvmeNamespace {
> @@ -88,6 +92,9 @@ typedef struct NvmeCtrl {
>      NvmeSQueue admin_sq;
>      NvmeCQueue admin_cq;
>      NvmeIdCtrl id_ctrl;
> +
> +    uint64_t dbbuf_dbs;
> +    uint64_t dbbuf_eis;
>  } NvmeCtrl;
>  
>  #endif /* HW_NVME_H */
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 849a6f3fa3..4890aaf491 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -235,6 +235,7 @@ enum NvmeAdminCommands {
>      NVME_ADM_CMD_ASYNC_EV_REQ  = 0x0c,
>      NVME_ADM_CMD_ACTIVATE_FW   = 0x10,
>      NVME_ADM_CMD_DOWNLOAD_FW   = 0x11,
> +    NVME_ADM_CMD_DBBUF_CONFIG  = 0x7c,
>      NVME_ADM_CMD_FORMAT_NVM    = 0x80,
>      NVME_ADM_CMD_SECURITY_SEND = 0x81,
>      NVME_ADM_CMD_SECURITY_RECV = 0x82,
> @@ -572,6 +573,7 @@ enum NvmeIdCtrlOacs {
>      NVME_OACS_SECURITY  = 1 << 0,
>      NVME_OACS_FORMAT    = 1 << 1,
>      NVME_OACS_FW        = 1 << 2,
> +    NVME_OACS_DBBUF     = 1 << 8,
>  };
>  
>  enum NvmeIdCtrlOncs {