diff --git a/default-configs/pci.mak b/default-configs/pci.mak
index ae9d1eb..eb67914 100644
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -11,6 +11,7 @@ CONFIG_PCNET_PCI=y
 CONFIG_PCNET_COMMON=y
 CONFIG_LSI_SCSI_PCI=y
 CONFIG_MEGASAS_SCSI_PCI=y
+CONFIG_NVME_PCI=y
 CONFIG_RTL8139_PCI=y
 CONFIG_E1000_PCI=y
 CONFIG_IDE_CORE=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index d581d8d..fa5a5b3 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -41,6 +41,8 @@ common-obj-$(CONFIG_I8259) += i8259_common.o i8259.o
 common-obj-y += fifo.o
 common-obj-y += pam.o
 
+common-obj-$(CONFIG_NVME_PCI) += nvme.o
+
 # PPC devices
 common-obj-$(CONFIG_PREP_PCI) += prep_pci.o
 common-obj-$(CONFIG_I82378) += i82378.o
diff --git a/hw/nvme.c b/hw/nvme.c
new file mode 100644
index 0000000..94b448b
--- /dev/null
+++ b/hw/nvme.c
@@ -0,0 +1,2876 @@
+/*
+ * QEMU NVM Express Controller
+ *
+ * Copyright (c) 2012, Intel Corporation
+ *
+ * Written by Keith Busch <keith.busch@intel.com>
+ *
+ * This code is licensed under the GNU GPL v2.
+ */
+
+/**
+ * Reference Specs: http://www.nvmexpress.org, 1.1, 1.0d
+ *
+ *  http://www.nvmexpress.org/index.php/download_file/view/102/1/
+ *  http://www.nvmexpress.org/index.php/download_file/view/100/1/
+ *
+ * Usage:
+ *
+ * Add "-device nvme[,options]" parameters when starting QEMU.
+ * Repeat "-device nvme" for multiple nvme controller pci devices.
+ *
+ * Options:
+ *
+ * namespaces=<1-256> : Number of namespaces. Default: 1
+ * size=<1-131072>    : Size of each namespace in MB. Default: 512
+ * queues=<1-2047>    : Number of controller IO queues. Default:64
+ * entries=<1-65535>  : Maximum number of queue entries. Default:2047
+ * aerl=<0-255>       : Number of async event request to accept. Default:3
+ * acl=<0-255>        : The abort command limit. Default:3
+ * mdts=<0-255>       : Maximum data transfer size, see NVMe spec. Default: 5
+ * cqr=<0,1>          : Contiguous Queues Required. Default:1
+ * stride=<0-12>      : Doorbell stride. Default:0
+ * path=<"path">      : Directory path to create and store the persistent
+ *                      namespace files. Default:./
+ */
+
+/**
+ * TODO:
+ * - Single and Multiple Message MSI
+ * - Scatter Gather List support
+ * - NVMe Subsystem Reset
+ * - Fused commands
+ * - Check/regen protection information
+ * - Reservations
+ * - Arbitration
+ * - Coalescing
+ * - Security and firmware (vendor specific)
+ * - Persistent features
+ * - Test page sizes > 4k
+ */
+
+#include "bitmap.h"
+#include "bitops.h"
+#include "hw.h"
+#include "pci.h"
+#include "msix.h"
+#include "msi.h"
+#include "qemu-thread.h"
+#include "qemu-common.h"
+
+#include <sys/mman.h>
+
+#define NVME_DEBUG
+#ifdef NVME_DEBUG
+/* Log levels; each level selects one bit in debug_flags. */
+enum {
+    IO_DBG, DBG, INFO, ERR
+};
+
+/* Bit mask for log level x.  The argument is parenthesized so that an
+ * arbitrary expression can be passed without precedence surprises. */
+#define DBGBIT(x) (1 << (x))
+int debug_flags = DBGBIT(INFO) | DBGBIT(ERR);
+/* Print a message to stderr, prefixed with the name of the calling function
+ * and followed by a newline, when `level` is enabled in debug_flags. */
+#define NVME_LOG(level, fmt, ...) \
+    do {\
+        if (debug_flags & DBGBIT(level)) { \
+            fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__);\
+        } \
+    } while (0)
+#else
+/* Logging compiled out: both macros expand to nothing useful. */
+#define DBGBIT(x) 0
+#define NVME_LOG(level, fmt, ...) do {} while (0)
+#endif
+
+#define NVME_MAX_QS             PCI_MSIX_FLAGS_QSIZE
+#define NVME_MAX_QUEUE_ENTRIES  0xffff
+#define NVME_MAX_STRIDE         12
+#define NVME_MAX_NAMESPACE_SIZE 131072
+#define NVME_MAX_NUM_NAMESPACES 256
+#define BYTES_PER_MB            (1024ULL * 1024ULL)
+#define NVME_OP_ABORTED         0xff
+#define NVME_SPARE_THRESHOLD    20
+#define NVME_TEMPERATURE        0x143
+#define min(x, y) ((x) < (y) ? (x) : (y))
+
+static int instance;
+static void *nvme_sq_thread(void *arg);
+
+/*
+ * Binary semaphore built from a condition variable and a mutex.
+ * bsem_get() waits until flag is non-zero and then clears it; bsem_put()
+ * sets it again and signals the condition variable.
+ */
+typedef struct bsem {
+    QemuCond    cv;     /* signalled by bsem_put() */
+    QemuMutex   mutex;  /* held while flag is examined or changed */
+    int flag;           /* 1: available, 0: taken */
+} bsem;
+
+typedef struct NvmeBar {
+    uint64_t    cap;
+    uint32_t    vs;
+    uint32_t    intms;
+    uint32_t    intmc;
+    uint32_t    cc;
+    uint32_t    rsvd1;
+    uint32_t    csts;
+    uint32_t    nssrc;
+    uint32_t    aqa;
+    uint64_t    asq;
+    uint64_t    acq;
+} NvmeBar;
+
+/*
+ * Bit offsets of the fields inside the 64-bit NvmeBar.cap register value.
+ * Each offset is combined with the matching CAP_*_MASK by the NVME_CAP_*()
+ * accessor macros.
+ */
+enum NvmeCapShift {
+    CAP_MQES_SHIFT     = 0,
+    CAP_CQR_SHIFT      = 16,
+    CAP_AMS_SHIFT      = 17,
+    CAP_TO_SHIFT       = 24,
+    CAP_DSTRD_SHIFT    = 32,
+    /* NSSRS is bit 36; 33 would fall inside the 4-bit DSTRD field (35:32) */
+    CAP_NSSRS_SHIFT    = 36,
+    CAP_CSS_SHIFT      = 37,
+    CAP_MPSMIN_SHIFT   = 48,
+    CAP_MPSMAX_SHIFT   = 52,
+};
+
+enum NvmeCapMask {
+    CAP_MQES_MASK      = 0xffff,
+    CAP_CQR_MASK       = 0x1,
+    CAP_AMS_MASK       = 0x3,
+    CAP_TO_MASK        = 0xff,
+    CAP_DSTRD_MASK     = 0xf,
+    CAP_NSSRS_MASK     = 0x1,
+    CAP_CSS_MASK       = 0xff,
+    CAP_MPSMIN_MASK    = 0xf,
+    CAP_MPSMAX_MASK    = 0xf,
+};
+
+#define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
+#define NVME_CAP_CQR(cap)   (((cap) >> CAP_CQR_SHIFT)    & CAP_CQR_MASK)
+#define NVME_CAP_AMS(cap)   (((cap) >> CAP_AMS_SHIFT)    & CAP_AMS_MASK)
+#define NVME_CAP_TO(cap)    (((cap) >> CAP_TO_SHIFT)     & CAP_TO_MASK)
+#define NVME_CAP_DSTRD(cap) (((cap) >> CAP_DSTRD_SHIFT)  & CAP_DSTRD_MASK)
+#define NVME_CAP_NSSRS(cap) (((cap) >> CAP_NSSRS_SHIFT)  & CAP_NSSRS_MASK)
+#define NVME_CAP_CSS(cap)   (((cap) >> CAP_CSS_SHIFT)    & CAP_CSS_MASK)
+#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
+#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
+
+enum NvmeCcShift {
+    CC_EN_SHIFT     = 0,
+    CC_CSS_SHIFT    = 4,
+    CC_MPS_SHIFT    = 7,
+    CC_AMS_SHIFT    = 11,
+    CC_SHN_SHIFT    = 14,
+    CC_IOSQES_SHIFT = 16,
+    CC_IOCQES_SHIFT = 20,
+};
+
+enum NvmeCcMask {
+    CC_EN_MASK      = 0x1,
+    CC_CSS_MASK     = 0x7,
+    CC_MPS_MASK     = 0xf,
+    CC_AMS_MASK     = 0x7,
+    CC_SHN_MASK     = 0x3,
+    CC_IOSQES_MASK  = 0xf,
+    CC_IOCQES_MASK  = 0xf,
+};
+
+/* Field accessors for an NvmeBar.cc register value.  The argument is
+ * parenthesized so that any expression may be passed. */
+#define NVME_CC_EN(cc)     (((cc) >> CC_EN_SHIFT)     & CC_EN_MASK)
+#define NVME_CC_CSS(cc)    (((cc) >> CC_CSS_SHIFT)    & CC_CSS_MASK)
+#define NVME_CC_MPS(cc)    (((cc) >> CC_MPS_SHIFT)    & CC_MPS_MASK)
+#define NVME_CC_AMS(cc)    (((cc) >> CC_AMS_SHIFT)    & CC_AMS_MASK)
+#define NVME_CC_SHN(cc)    (((cc) >> CC_SHN_SHIFT)    & CC_SHN_MASK)
+#define NVME_CC_IOSQES(cc) (((cc) >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK)
+#define NVME_CC_IOCQES(cc) (((cc) >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK)
+
+enum NvmeCstsShift {
+    CSTS_RDY_SHIFT      = 0,
+    CSTS_CFS_SHIFT      = 1,
+    CSTS_SHST_SHIFT     = 2,
+    CSTS_NSSRO_SHIFT    = 4,
+};
+
+enum NvmeCstsMask {
+    CSTS_RDY_MASK   = 0x1,
+    CSTS_CFS_MASK   = 0x1,
+    CSTS_SHST_MASK  = 0x3,
+    CSTS_NSSRO_MASK = 0x1,
+};
+
+enum NvmeCsts {
+    NVME_CSTS_READY         = 1 << CSTS_RDY_SHIFT,
+    NVME_CSTS_FAILED        = 1 << CSTS_CFS_SHIFT,
+    NVME_CSTS_SHST_NORMAL   = 0 << CSTS_SHST_SHIFT,
+    NVME_CSTS_SHST_PROGRESS = 1 << CSTS_SHST_SHIFT,
+    NVME_CSTS_SHST_COMPLETE = 2 << CSTS_SHST_SHIFT,
+    NVME_CSTS_NSSRO         = 1 << CSTS_NSSRO_SHIFT,
+};
+
+/* Field accessors for an NvmeBar.csts register value.  The argument is
+ * parenthesized so that any expression may be passed. */
+#define NVME_CSTS_RDY(csts)     (((csts) >> CSTS_RDY_SHIFT)   & CSTS_RDY_MASK)
+#define NVME_CSTS_CFS(csts)     (((csts) >> CSTS_CFS_SHIFT)   & CSTS_CFS_MASK)
+#define NVME_CSTS_SHST(csts)    (((csts) >> CSTS_SHST_SHIFT)  & CSTS_SHST_MASK)
+#define NVME_CSTS_NSSRO(csts)   (((csts) >> CSTS_NSSRO_SHIFT) & CSTS_NSSRO_MASK)
+
+enum NvmeAqaShift {
+    AQA_ASQS_SHIFT  = 0,
+    AQA_ACQS_SHIFT  = 16,
+};
+
+enum NvmeAqaMask {
+    AQA_ASQS_MASK   = 0xfff,
+    AQA_ACQS_MASK   = 0xfff,
+};
+
+/* Field accessors for an NvmeBar.aqa register value.  The argument is
+ * parenthesized so that any expression may be passed. */
+#define NVME_AQA_ASQS(aqa) (((aqa) >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
+#define NVME_AQA_ACQS(aqa) (((aqa) >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
+
+typedef struct NvmeCmd {
+    uint8_t     opcode;
+    uint8_t     fuse;
+    uint16_t    cid;
+    uint32_t    nsid;
+    uint64_t    res1;
+    uint64_t    mptr;
+    uint64_t    prp1;
+    uint64_t    prp2;
+    uint32_t    cdw10;
+    uint32_t    cdw11;
+    uint32_t    cdw12;
+    uint32_t    cdw13;
+    uint32_t    cdw14;
+    uint32_t    cdw15;
+} NvmeCmd;
+
+typedef struct NvmeDeleteQ {
+    uint8_t     opcode;
+    uint8_t     flags;
+    uint16_t    command_id;
+    uint32_t    rsvd1[9];
+    uint16_t    qid;
+    uint16_t    rsvd10;
+    uint32_t    rsvd11[5];
+} NvmeDeleteQ;
+
+typedef struct NvmeCreateCq {
+    uint8_t     opcode;
+    uint8_t     flags;
+    uint16_t    command_id;
+    uint32_t    rsvd1[5];
+    uint64_t    prp1;
+    uint64_t    rsvd8;
+    uint16_t    cqid;
+    uint16_t    qsize;
+    uint16_t    cq_flags;
+    uint16_t    irq_vector;
+    uint32_t    rsvd12[4];
+} NvmeCreateCq;
+
+typedef struct NvmeCreateSq {
+    uint8_t     opcode;
+    uint8_t     flags;
+    uint16_t    command_id;
+    uint32_t    rsvd1[5];
+    uint64_t    prp1;
+    uint64_t    rsvd8;
+    uint16_t    sqid;
+    uint16_t    qsize;
+    uint16_t    sq_flags;
+    uint16_t    cqid;
+    uint32_t    rsvd12[4];
+} NvmeCreateSq;
+
+enum QueueFlags {
+    NVME_Q_PC           = 1 << 0,
+    NVME_Q_PRIO_URGENT  = 0 << 1,
+    NVME_Q_PRIO_HIGH    = 1 << 1,
+    NVME_Q_PRIO_NORMAL  = 2 << 1,
+    NVME_Q_PRIO_LOW     = 3 << 1,
+};
+
+typedef struct NvmeIdentify {
+    uint8_t     opcode;
+    uint8_t     flags;
+    uint16_t    command_id;
+    uint32_t    nsid;
+    uint64_t    rsvd2[2];
+    uint64_t    prp1;
+    uint64_t    prp2;
+    uint32_t    cns;
+    uint32_t    rsvd11[5];
+} NvmeIdentify;
+
+typedef struct NvmeRwCmd {
+    uint8_t     opcode;
+    uint8_t     flags;
+    uint16_t    command_id;
+    uint32_t    nsid;
+    uint64_t    rsvd2;
+    uint64_t    mptr;
+    uint64_t    prp1;
+    uint64_t    prp2;
+    uint64_t    slba;
+    uint16_t    nlb;
+    uint16_t    control;
+    uint32_t    dsmgmt;
+    uint32_t    reftag;
+    uint16_t    apptag;
+    uint16_t    appmask;
+} NvmeRwCmd;
+
+typedef struct NvmeDsmCmd {
+    uint8_t     opcode;
+    uint8_t     flags;
+    uint16_t    command_id;
+    uint32_t    nsid;
+    uint64_t    rsvd2[2];
+    uint64_t    prp1;
+    uint64_t    prp2;
+    uint32_t    nr;
+    uint32_t    attributes;
+    uint32_t    rsvd12[4];
+} NvmeDsmCmd;
+
+enum {
+    NVME_DSMGMT_IDR = 1 << 0,
+    NVME_DSMGMT_IDW = 1 << 1,
+    NVME_DSMGMT_AD  = 1 << 2,
+};
+
+typedef struct NvmeDsmRange {
+    uint32_t    cattr;
+    uint32_t    nlb;
+    uint64_t    slba;
+} NvmeDsmRange;
+
+enum AsyncEventRequest {
+    NVME_AER_TYPE_ERROR                     = 0,
+    NVME_AER_TYPE_SMART                     = 1,
+    NVME_AER_TYPE_IO_SPECIFIC               = 6,
+    NVME_AER_TYPE_VENDOR_SPECIFIC           = 7,
+    NVME_AER_INFO_ERR_INVALID_SQ            = 0,
+    NVME_AER_INFO_ERR_INVALID_DB            = 1,
+    NVME_AER_INFO_ERR_DIAG_FAIL             = 2,
+    NVME_AER_INFO_ERR_PERS_INTERNAL_ERR     = 3,
+    NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR    = 4,
+    NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR       = 5,
+    NVME_AER_INFO_SMART_RELIABILITY         = 0,
+    NVME_AER_INFO_SMART_TEMP_THRESH         = 1,
+    NVME_AER_INFO_SMART_SPARE_THRESH        = 2,
+};
+
+typedef struct AerResult {
+    uint8_t event_type;
+    uint8_t event_info;
+    uint8_t log_page;
+    uint8_t resv;
+} AerResult;
+
+typedef struct AsyncEvent {
+    QSIMPLEQ_ENTRY(AsyncEvent) entry;
+    AerResult result;
+} AsyncEvent;
+
+typedef struct NvmeCqe {
+    uint32_t    result;
+    uint32_t    rsvd;
+    uint16_t    sq_head;
+    uint16_t    sq_id;
+    uint16_t    command_id;
+    uint16_t    status;
+} NvmeCqe;
+
+typedef struct NvmeFwSlotInfoLog {
+    uint8_t     afi;
+    uint8_t     reserved1[7];
+    uint8_t     frs1[8];
+    uint8_t     frs2[8];
+    uint8_t     frs3[8];
+    uint8_t     frs4[8];
+    uint8_t     frs5[8];
+    uint8_t     frs6[8];
+    uint8_t     frs7[8];
+    uint8_t     reserved2[448];
+} NvmeFwSlotInfoLog;
+
+typedef struct NvmeErrorLog {
+    uint64_t    error_count;
+    uint16_t    sqid;
+    uint16_t    cid;
+    uint16_t    status_field;
+    uint16_t    param_error_location;
+    uint64_t    lba;
+    uint32_t    nsid;
+    uint8_t     vs;
+    uint8_t     resv[35];
+} NvmeErrorLog;
+
+typedef struct NvmeSmartLog {
+    uint8_t     critical_warning;
+    uint8_t     temperature[2];
+    uint8_t     available_spare;
+    uint8_t     available_spare_threshold;
+    uint8_t     percentage_used;
+    uint8_t     reserved1[26];
+    uint64_t    data_units_read[2];
+    uint64_t    data_units_written[2];
+    uint64_t    host_read_commands[2];
+    uint64_t    host_write_commands[2];
+    uint64_t    controller_busy_time[2];
+    uint64_t    power_cycles[2];
+    uint64_t    power_on_hours[2];
+    uint64_t    unsafe_shutdowns[2];
+    uint64_t    media_errors[2];
+    uint64_t    number_of_error_log_entries[2];
+    uint8_t     reserved2[320];
+} NvmeSmartLog;
+
+enum NvmeSmartWarn {
+    NVME_SMART_SPARE                  = 1 << 0,
+    NVME_SMART_TEMPERATURE            = 1 << 1,
+    NVME_SMART_RELIABILITY            = 1 << 2,
+    NVME_SMART_MEDIA_READ_ONLY        = 1 << 3,
+    NVME_SMART_FAILED_VOLATILE_MEDIA  = 1 << 4,
+};
+
+/* Per-queue state of one submission queue. */
+typedef struct NvmeSQueue {
+    struct NvmeCtrl *ctrl;      /* controller this queue belongs to */
+    uint8_t     is_active;      /* when 0, nvme_wait_sq() exits the thread */
+    uint8_t     phys_contig;
+    uint16_t    id;
+    uint16_t    cqid;
+    uint32_t    head;
+    uint32_t    tail;
+    uint32_t    size;
+    uint64_t    dma_addr;
+    uint64_t    completed;      /* logged as the number of completed requests */
+    uint64_t    *prp_list;      /* presumably the page list returned by
+                                 * nvme_setup_discontig() - TODO confirm */
+
+    QemuMutex   queue_lock;
+    QemuThread  process_thread;
+    bsem        event_lock;     /* nvme_wait_sq() blocks on this semaphore */
+
+    QTAILQ_ENTRY(NvmeSQueue) entry;
+} NvmeSQueue;
+
+/* Per-queue state of one completion queue. */
+typedef struct NvmeCQueue {
+    struct NvmeCtrl *ctrl;
+    uint8_t     phys_contig;
+    uint8_t     phase;          /* inverted each time tail wraps to 0 */
+    uint16_t    id;
+    uint16_t    irq_enabled;    /* nvme_isr_notify() does nothing when 0 */
+    uint32_t    head;
+    uint32_t    tail;           /* advanced by nvme_inc_cq_tail() */
+    uint32_t    vector;         /* vector passed to msix_notify/msi_notify */
+    uint32_t    size;           /* tail wraps once it reaches this value */
+    uint64_t    dma_addr;
+    uint64_t    *prp_list;      /* presumably the page list returned by
+                                 * nvme_setup_discontig() - TODO confirm */
+
+    QemuMutex   queue_lock;
+    QTAILQ_HEAD(sq_list, NvmeSQueue) sq_list;
+} NvmeCQueue;
+
+typedef struct NvmePSD {
+    uint16_t    mp;
+    uint16_t    reserved;
+    uint32_t    enlat;
+    uint32_t    exlat;
+    uint8_t     rrt;
+    uint8_t     rrl;
+    uint8_t     rwt;
+    uint8_t     rwl;
+    uint8_t     resv[16];
+} NvmePSD;
+
+typedef struct NvmeIdCtrl {
+    uint16_t    vid;
+    uint16_t    ssvid;
+    uint8_t     sn[20];
+    uint8_t     mn[40];
+    uint8_t     fr[8];
+    uint8_t     rab;
+    uint8_t     ieee[3];
+    uint8_t     cmic;
+    uint8_t     mdts;
+    uint8_t     rsvd255[178];
+    uint16_t    oacs;
+    uint8_t     acl;
+    uint8_t     aerl;
+    uint8_t     frmw;
+    uint8_t     lpa;
+    uint8_t     elpe;
+    uint8_t     npss;
+    uint8_t     rsvd511[248];
+    uint8_t     sqes;
+    uint8_t     cqes;
+    uint16_t    rsvd515;
+    uint32_t    nn;
+    uint16_t    oncs;
+    uint16_t    fuses;
+    uint8_t     fna;
+    uint8_t     vwc;
+    uint16_t    awun;
+    uint16_t    awupf;
+    uint8_t     rsvd703[174];
+    uint8_t     rsvd2047[1344];
+    NvmePSD     psd[32];
+    uint8_t     vs[1024];
+} NvmeIdCtrl;
+
+enum NvmeIdCtrlOacs {
+    NVME_OACS_SECURITY  = 1 << 0,
+    NVME_OACS_FORMAT    = 1 << 1,
+    NVME_OACS_FW        = 1 << 2,
+};
+
+enum NvmeIdCtrlOncs {
+    NVME_ONCS_COMPARE       = 1 << 0,
+    NVME_ONCS_WRITE_UNCORR  = 1 << 1,
+    NVME_ONCS_DSM           = 1 << 2,
+    NVME_ONCS_WRITE_ZEROS   = 1 << 3,
+    NVME_ONCS_FEATURES      = 1 << 4,
+    NVME_ONCS_RESRVATIONS   = 1 << 5,
+};
+
+#define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf)
+#define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf)
+#define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf)
+#define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf)
+
+typedef struct NvmeFeatureVal {
+    uint32_t    arbitration;
+    uint32_t    power_mgmt;
+    uint32_t    temp_thresh;
+    uint32_t    err_rec;
+    uint32_t    volatile_wc;
+    uint32_t    num_queues;
+    uint32_t    int_coalescing;
+    uint32_t    *int_vector_config;
+    uint32_t    write_atomicity;
+    uint32_t    async_config;
+    uint32_t    sw_prog_marker;
+} NvmeFeatureVal;
+
+typedef struct NvmeRangeType {
+    uint8_t     type;
+    uint8_t     attributes;
+    uint8_t     rsvd2[14];
+    uint64_t    slba;
+    uint64_t    nlb;
+    uint8_t     guid[16];
+    uint8_t     rsvd48[16];
+} NvmeRangeType;
+
+typedef struct NvmeLBAF {
+    uint16_t    ms;
+    uint8_t     ds;
+    uint8_t     rp;
+} NvmeLBAF;
+
+typedef struct NvmeIdNs {
+    uint64_t    nsze;
+    uint64_t    ncap;
+    uint64_t    nuse;
+    uint8_t     nsfeat;
+    uint8_t     nlbaf;
+    uint8_t     flbas;
+    uint8_t     mc;
+    uint8_t     dpc;
+    uint8_t     dps;
+    uint8_t     res30[98];
+    NvmeLBAF    lbaf[16];
+    uint8_t     res192[192];
+    uint8_t     vs[3712];
+} NvmeIdNs;
+
+/* Bit-field accessors for NvmeIdNs members.  The argument is parenthesized
+ * so that any expression may be passed. */
+#define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat) & 0x1)
+#define NVME_ID_NS_FLBAS_EXTENDED(flbas)    (((flbas) >> 4) & 0x1)
+#define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas) & 0xf)
+#define NVME_ID_NS_MC_SEPARATE(mc)          (((mc) >> 1) & 0x1)
+#define NVME_ID_NS_MC_EXTENDED(mc)          ((mc) & 0x1)
+#define NVME_ID_NS_DPC_LAST_EIGHT(dpc)      (((dpc) >> 4) & 0x1)
+#define NVME_ID_NS_DPC_FIRST_EIGHT(dpc)     (((dpc) >> 3) & 0x1)
+#define NVME_ID_NS_DPC_TYPE_3(dpc)          (((dpc) >> 2) & 0x1)
+#define NVME_ID_NS_DPC_TYPE_2(dpc)          (((dpc) >> 1) & 0x1)
+#define NVME_ID_NS_DPC_TYPE_1(dpc)          ((dpc) & 0x1)
+
+/*
+ * Protection information types, presumably for NvmeIdNs.dps.
+ * NOTE(review): the NVMe specification encodes the DPS type field as
+ * 0 = not enabled, 1 = Type 1, 2 = Type 2, 3 = Type 3.  The values below
+ * appear to be shifted by one relative to that encoding - confirm against
+ * the code that reads or writes dps before relying on them.
+ */
+enum NvmeIdNsDps {
+    DPS_TYPE_NONE   = 0,
+    DPS_TYPE_0      = 1,
+    DPS_TYPE_1      = 2,
+    DPS_TYPE_2      = 3,
+    DPS_TYPE_3      = 4,
+};
+
+/* Run-time state of one namespace and its file-backed storage. */
+typedef struct NvmeNamespace {
+    struct NvmeCtrl *ctrl;              /* set by nvme_open_namespace() */
+    NvmeIdNs        id_ns;              /* size and LBA format come from here */
+    NvmeRangeType   lba_range[64];
+    uint32_t        id;
+
+    /* Data backing file, opened and mapped by nvme_init_file() */
+    int             fd;
+    size_t          mapping_size;
+    uint8_t         *mapping_addr;
+
+    /* Separate metadata backing file; only opened when the LBA format has
+     * metadata and the extended-LBA flag in flbas is clear */
+    int             mfd;
+    size_t          meta_mapping_size;
+    uint8_t         *meta_mapping_addr;
+
+    pthread_rwlock_t format_lock;
+
+    uint32_t        write_data_counter;
+    uint32_t        read_data_counter;
+    uint64_t        data_units_read[2];
+    uint64_t        data_units_written[2];
+    uint64_t        host_read_commands[2];
+    uint64_t        host_write_commands[2];
+
+    /* nsze scaled by (1 - NVME_SPARE_THRESHOLD / 100) */
+    uint64_t        nuse_thresh;
+    uint8_t         thresh_warn_issued; /* cleared when the namespace opens */
+    unsigned long   *util;              /* bitmap with nsze bits */
+    unsigned long   *uncorrectable;     /* bitmap with nsze bits */
+} NvmeNamespace;
+
+typedef struct NvmeCtrl {
+    PCIDevice       dev;
+    MemoryRegion    iomem;
+    NvmeBar         bar;
+
+    time_t      start_time;
+    int         instance;
+    uint16_t    temperature;
+    uint16_t    page_size;
+    uint16_t    page_bits;
+    uint16_t    max_prp_ents;
+    uint16_t    cqe_size;
+    uint16_t    sqe_size;
+    uint8_t     percentage_used;
+    uint8_t     outstanding_aers;
+    uint8_t     elp_index;
+    uint8_t     error_count;
+    uint16_t    *aer_cid;
+    uint32_t    reg_size;
+
+    /* parameters */
+    char        *disk_path;
+    uint32_t    num_namespaces;
+    uint32_t    ns_size;
+    uint32_t    num_queues;
+    uint32_t    max_q_ents;
+    uint8_t     db_stride;
+    uint8_t     acl;
+    uint8_t     aerl;
+    uint8_t     mdts;
+    uint8_t     elpe;
+    uint8_t     cqr;
+
+    QemuMutex       log_lock;
+    NvmeErrorLog    *elpes;
+    NvmeNamespace   *namespaces;
+    NvmeFeatureVal  features;
+    NvmeIdCtrl      id_ctrl;
+    NvmeSQueue      admin_sq;
+    NvmeCQueue      admin_cq;
+    NvmeSQueue      **sq;
+    NvmeCQueue      **cq;
+
+    QSIMPLEQ_HEAD(aer_queue, AsyncEvent) aer_queue;
+    QEMUTimer   *aer_timer;
+    uint8_t     aer_mask;
+    uint8_t     temp_warn_issued;
+} NvmeCtrl;
+
+enum NvmeStatusCodes {
+    NVME_SUCCESS                = 0x0000,
+    NVME_INVALID_OPCODE         = 0x0001,
+    NVME_INVALID_FIELD          = 0x0002,
+    NVME_CID_CONFLICT           = 0x0003,
+    NVME_DATA_TRAS_ERROR        = 0x0004,
+    NVME_POWER_LOSS_ABORT       = 0x0005,
+    NVME_INTERNAL_DEV_ERROR     = 0x0006,
+    NVME_CMD_ABORT_REQ          = 0x0007,
+    NVME_CMD_ABORT_SQ_DEL       = 0x0008,
+    NVME_CMD_ABORT_FAILED_FUSE  = 0x0009,
+    NVME_CMD_ABORT_MISSING_FUSE = 0x000a,
+    NVME_INVALID_NSID           = 0x000b,
+    NVME_CMD_SEQ_ERROR          = 0x000c,
+    NVME_LBA_RANGE              = 0x0080,
+    NVME_CAP_EXCEEDED           = 0x0081,
+    NVME_NS_NOT_READY           = 0x0082,
+    NVME_NS_RESV_CONFLICT       = 0x0083,
+    NVME_INVALID_CQID           = 0x0100,
+    NVME_INVALID_QID            = 0x0101,
+    NVME_MAX_QSIZE_EXCEEDED     = 0x0102,
+    NVME_ACL_EXCEEDED           = 0x0103,
+    NVME_RESERVED               = 0x0104,
+    NVME_AER_LIMIT_EXCEEDED     = 0x0105,
+    NVME_INVALID_FW_SLOT        = 0x0106,
+    NVME_INVALID_FW_IMAGE       = 0x0107,
+    NVME_INVALID_IRQ_VECTOR     = 0x0108,
+    NVME_INVALID_LOG_ID         = 0x0109,
+    NVME_INVALID_FORMAT         = 0x010a,
+    NVME_FW_REQ_RESET           = 0x010b,
+    NVME_INVALID_QUEUE_DEL      = 0x010c,
+    NVME_FID_NOT_SAVEABLE       = 0x010d,
+    NVME_FID_NOT_NSID_SPEC      = 0x010f,
+    NVME_FW_REQ_SUSYSTEM_RESET  = 0x0110,
+    NVME_CONFLICTING_ATTRS      = 0x0180,
+    NVME_INVALID_PROT_INFO      = 0x0181,
+    NVME_WRITE_TO_RO            = 0x0182,
+    NVME_WRITE_FAULT            = 0x0280,
+    NVME_UNRECOVERED_READ       = 0x0281,
+    NVME_E2E_GUARD_ERROR        = 0x0282,
+    NVME_E2E_APP_ERROR          = 0x0283,
+    NVME_E2E_REF_ERROR          = 0x0284,
+    NVME_CMP_FAILURE            = 0x0285,
+    NVME_ACCESS_DENIED          = 0x0286,
+    NVME_MORE                   = 0x2000,
+    NVME_DNR                    = 0x4000,
+    NVME_NO_COMPLETE            = 0xffff,
+};
+
+enum NvmeAdminCommands {
+    NVME_ADM_CMD_DELETE_SQ      = 0x00,
+    NVME_ADM_CMD_CREATE_SQ      = 0x01,
+    NVME_ADM_CMD_GET_LOG_PAGE   = 0x02,
+    NVME_ADM_CMD_DELETE_CQ      = 0x04,
+    NVME_ADM_CMD_CREATE_CQ      = 0x05,
+    NVME_ADM_CMD_IDENTIFY       = 0x06,
+    NVME_ADM_CMD_ABORT          = 0x08,
+    NVME_ADM_CMD_SET_FEATURES   = 0x09,
+    NVME_ADM_CMD_GET_FEATURES   = 0x0a,
+    NVME_ADM_CMD_ASYNC_EV_REQ   = 0x0c,
+    NVME_ADM_CMD_ACTIVATE_FW    = 0x10,
+    NVME_ADM_CMD_DOWNLOAD_FW    = 0x11,
+    NVME_ADM_CMD_FORMAT_NVM     = 0x80,
+    NVME_ADM_CMD_SECURITY_SEND  = 0x81,
+    NVME_ADM_CMD_SECURITY_RECV  = 0x82,
+};
+
+enum NvmeIoCommands {
+    NVME_CMD_FLUSH              = 0x00,
+    NVME_CMD_WRITE              = 0x01,
+    NVME_CMD_READ               = 0x02,
+    NVME_CMD_WRITE_UNCOR        = 0x04,
+    NVME_CMD_COMPARE            = 0x05,
+    NVME_CMD_DSM                = 0x09,
+};
+
+enum LogIdentifier {
+    NVME_LOG_ERROR_INFO     = 0x01,
+    NVME_LOG_SMART_INFO     = 0x02,
+    NVME_LOG_FW_SLOT_INFO   = 0x03,
+};
+
+enum NvmeFeatureIds {
+    NVME_ARBITRATION                = 1,
+    NVME_POWER_MANAGEMENT           = 2,
+    NVME_LBA_RANGE_TYPE             = 3,
+    NVME_TEMPERATURE_THRESHOLD      = 4,
+    NVME_ERROR_RECOVERY             = 5,
+    NVME_VOLATILE_WRITE_CACHE       = 6,
+    NVME_NUMBER_OF_QUEUES           = 7,
+    NVME_INTERRUPT_COALESCING       = 8,
+    NVME_INTERRUPT_VECTOR_CONF      = 9,
+    NVME_WRITE_ATOMICITY            = 0x0a,
+    NVME_ASYNCHRONOUS_EVENT_CONF    = 0x0b,
+    NVME_SOFTWARE_PROGRESS_MARKER   = 0x80
+};
+
+/*
+ * Compile-time layout checks: each QEMU_BUILD_BUG_ON() below is meant to
+ * break the build when the named structure does not have the listed size.
+ * The function contains no other statements.
+ */
+static inline void _nvme_check_size(void)
+{
+    /* 64-byte submission queue entries */
+    QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeCreateSq) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
+    /* 16-byte completion queue entry */
+    QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
+    /* log pages */
+    QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
+    /* 4 KiB identify structures */
+    QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
+}
+
+/* Prepare a binary semaphore for use; it starts out available (flag = 1). */
+static void bsem_init(bsem *s)
+{
+    s->flag = 1;
+    qemu_cond_init(&s->cv);
+    qemu_mutex_init(&s->mutex);
+}
+
+/* Tear down the condition variable and mutex of a binary semaphore. */
+static void bsem_destroy(bsem *s)
+{
+    qemu_cond_destroy(&s->cv);
+    qemu_mutex_destroy(&s->mutex);
+}
+
+/* Take the semaphore, sleeping on the condition variable until it is
+ * available, and mark it as taken before returning. */
+static void bsem_get(bsem *s)
+{
+    qemu_mutex_lock(&s->mutex);
+    while (!s->flag) {
+        /* releases the mutex while asleep, re-acquires it on wake-up */
+        qemu_cond_wait(&s->cv, &s->mutex);
+    }
+    s->flag = 0;
+    qemu_mutex_unlock(&s->mutex);
+}
+
+/* Make the semaphore available again and wake one waiter, if any.  Both
+ * steps happen with the mutex held, so their relative order is not
+ * observable by bsem_get(). */
+static void bsem_put(bsem *s)
+{
+    qemu_mutex_lock(&s->mutex);
+    s->flag = 1;
+    qemu_cond_signal(&s->cv);
+    qemu_mutex_unlock(&s->mutex);
+}
+
+/*
+ * Open (creating it if needed) one namespace backing file, make sure it is
+ * `size` bytes long and map it shared and read/write.
+ *
+ * The file name is built from name_fmt using the controller instance and
+ * nsid, and is placed below n->disk_path when that is set.
+ *
+ * On success *fd, *mapping_addr and *mapping_size describe the open file and
+ * its mapping.  Every failure is logged and leaves *mapping_size untouched;
+ * a failed mmap() additionally stores NULL in *mapping_addr so that callers
+ * never see MAP_FAILED as if it were a usable address.
+ */
+static void nvme_init_file(int *fd, off_t size, uint8_t **mapping_addr,
+    size_t *mapping_size, int nsid, NvmeCtrl *n, const char *name_fmt)
+{
+    off_t f_size;
+    void *map;
+    char str[64];
+    char path[256];
+
+    snprintf(str, sizeof(str), name_fmt, n->instance, nsid);
+    if (n->disk_path) {
+        snprintf(path, sizeof(path), "%s/%s", n->disk_path, str);
+    } else {
+        snprintf(path, sizeof(path), "%s", str);
+    }
+
+    /* off_t is not necessarily unsigned long, so cast to match %lu */
+    NVME_LOG(DBG, "open file:%s size:%lu", path, (unsigned long)size);
+    *fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+    /* descriptor 0 doubles as "not open" in nvme_close_namespace() */
+    if (*fd <= 0) {
+        NVME_LOG(ERR, "failed to open namespace backing storage");
+        return;
+    }
+
+    f_size = lseek(*fd, 0, SEEK_END);
+    lseek(*fd, 0 , SEEK_SET);
+    if (f_size != size) {
+        if (posix_fallocate(*fd, 0, size) != 0) {
+            NVME_LOG(ERR, "failed to allocate %lu bytes", (unsigned long)size);
+            return;
+        }
+    }
+
+    map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0);
+    /* mmap() reports failure with MAP_FAILED, never with NULL */
+    if (map == MAP_FAILED) {
+        NVME_LOG(ERR, "failed to map the namespace backing storage");
+        *mapping_addr = NULL;
+        return;
+    }
+    *mapping_addr = map;
+    *mapping_size = size;
+}
+
+/*
+ * Bind namespace `ns` to controller `n` and set up its storage.
+ *
+ * The data size is nsze blocks of (1 << ds) bytes for the LBA format
+ * selected by flbas.  When that format carries metadata it is either folded
+ * into the data file (extended LBA flag set) or kept in a separate
+ * "nvmem" file.  The utilization and uncorrectable bitmaps are allocated
+ * with one bit per block, and the usage warning threshold is derived from
+ * NVME_SPARE_THRESHOLD.
+ */
+static void nvme_open_namespace(NvmeCtrl *n, NvmeNamespace *ns, int id)
+{
+    NvmeIdNs *id_ns = &ns->id_ns;
+    const uint16_t lba_idx = NVME_ID_NS_FLBAS_INDEX(id_ns->flbas);
+    NvmeLBAF *fmt = &id_ns->lbaf[lba_idx];
+    const uint16_t lba_size = 1 << fmt->ds;
+    off_t ns_size = id_ns->nsze * lba_size;
+    const off_t meta_size = id_ns->nsze * fmt->ms;
+
+    ns->ctrl = n;
+
+    NVME_LOG(DBG, "controller:%u nsid:%u flbas:%x nsze:%lu ds:%u ms:%u",
+        n->instance, id, id_ns->flbas, id_ns->nsze, fmt->ds, fmt->ms);
+
+    if (meta_size && NVME_ID_NS_FLBAS_EXTENDED(id_ns->flbas)) {
+        /* metadata is stored with the data: grow the data file */
+        ns_size += meta_size;
+    } else if (meta_size) {
+        /* metadata lives in a file of its own */
+        nvme_init_file(&ns->mfd, meta_size, &ns->meta_mapping_addr,
+            &ns->meta_mapping_size, id, n, "nvmem%dn%d.img");
+    }
+
+    nvme_init_file(&ns->fd, ns_size, &ns->mapping_addr, &ns->mapping_size,
+        id, n, "nvme%dn%d.img");
+
+    ns->util = bitmap_new(id_ns->nsze);
+    ns->uncorrectable = bitmap_new(id_ns->nsze);
+
+    ns->nuse_thresh = ((double)id_ns->nsze) *
+        (1 - ((double)NVME_SPARE_THRESHOLD) / 100.0);
+    ns->thresh_warn_issued = 0;
+}
+
+/*
+ * Release the resources held by a namespace: both file mappings, both file
+ * descriptors and the two bitmaps.  Every field is reset afterwards, so
+ * calling this again on the same namespace is harmless.
+ */
+static void nvme_close_namespace(NvmeNamespace *ns)
+{
+    /* size_t is not necessarily unsigned long, so cast to match %lu */
+    NVME_LOG(DBG, "nsid:%u fd:%d mfd:%d size:%lu meta size:%lu", ns->id, ns->fd,
+        ns->mfd, (unsigned long)ns->mapping_size,
+        (unsigned long)ns->meta_mapping_size);
+
+    /* only unmap real mappings: skip NULL and a recorded MAP_FAILED */
+    if (ns->mapping_addr && (void *)ns->mapping_addr != MAP_FAILED) {
+        munmap(ns->mapping_addr, ns->mapping_size);
+    }
+    ns->mapping_addr = NULL;
+    ns->mapping_size = 0;
+
+    /* a failed open() leaves -1 behind; never hand that to close() */
+    if (ns->fd > 0) {
+        close(ns->fd);
+    }
+    ns->fd = 0;
+
+    if (ns->meta_mapping_addr &&
+        (void *)ns->meta_mapping_addr != MAP_FAILED) {
+        munmap(ns->meta_mapping_addr, ns->meta_mapping_size);
+    }
+    ns->meta_mapping_addr = NULL;
+    ns->meta_mapping_size = 0;
+
+    if (ns->mfd > 0) {
+        close(ns->mfd);
+    }
+    ns->mfd = 0;
+
+    /* g_free() accepts NULL, so no guard is needed */
+    g_free(ns->util);
+    ns->util = NULL;
+    g_free(ns->uncorrectable);
+    ns->uncorrectable = NULL;
+}
+
+/*
+ * Queue an asynchronous event on controller `n` and re-arm the controller's
+ * aer_timer to fire 10000 ns of vm_clock time from now.
+ *
+ * The event is allocated zeroed so that the reserved byte of the result is
+ * never left holding stale heap contents.
+ */
+static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
+    uint8_t event_info, uint8_t log_page)
+{
+    AsyncEvent *event = g_malloc0(sizeof(*event));
+
+    event->result.event_type = event_type;
+    event->result.event_info = event_info;
+    event->result.log_page   = log_page;
+    QSIMPLEQ_INSERT_TAIL(&(n->aer_queue), event, entry);
+
+    NVME_LOG(INFO, "controller:%u type:%x info:%x page:%x", n->instance,
+        event_type, event_info, log_page);
+    qemu_mod_timer(n->aer_timer, qemu_get_clock_ns(vm_clock) + 10000);
+}
+
+/*
+ * Build the table of page addresses for a physically discontiguous queue.
+ *
+ * prp_addr is the guest address of a PRP list page.  The queue needs
+ * DIV_ROUND_UP(queue_depth * entry_size, page_size) page pointers.  When a
+ * list page is used up to its last slot and more pointers are still needed,
+ * that last slot is followed as the address of the next list page, which is
+ * the same convention nvme_do_prp() applies to data PRP lists.
+ *
+ * Returns a g_malloc0()-allocated array owned by the caller, or NULL when a
+ * list page address or a page pointer is zero or not page aligned.
+ */
+static uint64_t *nvme_setup_discontig(uint64_t prp_addr, uint16_t queue_depth,
+    uint16_t page_size, uint16_t entry_size)
+{
+    int i;
+    uint16_t slot = 0;
+    uint16_t prps_per_page = page_size / sizeof(uint64_t);
+    uint64_t prp[prps_per_page];
+    uint16_t total_prps = DIV_ROUND_UP(queue_depth * entry_size, page_size);
+    uint64_t *prp_list;
+
+    NVME_LOG(DBG,
+        "queue depth:%u page size:%u entry size:%u prps per page:%u total:%u\n",
+        queue_depth, page_size, entry_size, prps_per_page, total_prps);
+
+    if (!prp_addr || prp_addr & (page_size - 1)) {
+        NVME_LOG(ERR, "invalid prp list address for discontig queue:%lx",
+            prp_addr);
+        return NULL;
+    }
+
+    prp_list = g_malloc0(total_prps * sizeof(*prp_list));
+    /* is_write = 0: fetch the list page from guest memory */
+    cpu_physical_memory_rw(prp_addr, (uint8_t *)prp, sizeof(prp), 0);
+
+    for (i = 0; i < total_prps; i++) {
+        if (slot == prps_per_page - 1 && i < total_prps - 1) {
+            /* last slot of a full list page chains to the next list page */
+            prp_addr = prp[slot];
+            if (!prp_addr || prp_addr & (page_size - 1)) {
+                NVME_LOG(ERR,
+                    "invalid prp list address for discontig queue:%lx",
+                    prp_addr);
+                g_free(prp_list);
+                return NULL;
+            }
+            cpu_physical_memory_rw(prp_addr, (uint8_t *)prp, sizeof(prp), 0);
+            slot = 0;
+        }
+        prp_list[i] = prp[slot++];
+        NVME_LOG(DBG, "map prp:%lx\n", prp_list[i]);
+        if (!prp_list[i] || prp_list[i] & (page_size - 1)) {
+            NVME_LOG(ERR, "invalid prp for discontig queue:%lx", prp_list[i]);
+            g_free(prp_list);
+            return NULL;
+        }
+    }
+
+    return prp_list;
+}
+
+/*
+ * Translate a queue index into a guest address for a queue that is spread
+ * over the pages listed in dma_addr: pick the page that holds the entry and
+ * add the entry's byte offset inside that page.
+ */
+static hwaddr nvme_discontig(uint64_t *dma_addr, uint16_t page_size,
+    uint16_t queue_idx, uint16_t entry_size)
+{
+    const uint16_t per_page = page_size / entry_size;
+    const uint16_t page_no = queue_idx / per_page;
+    const uint16_t slot = queue_idx % per_page;
+    uint64_t page_base;
+
+    NVME_LOG(IO_DBG, "dma_addr:%lx page size:%u queue index:%u entry size:%u",
+        *dma_addr, page_size, queue_idx, entry_size);
+
+    page_base = dma_addr[page_no];
+    return page_base + slot * entry_size;
+}
+
+/*
+ * Copy `len` bytes between `buf` and the guest memory described by the
+ * prp1/prp2 pair of a command.  data_dir is handed unchanged to
+ * cpu_physical_memory_rw() as its last argument.
+ *
+ * prp1 covers the bytes up to the end of its page.  If at most one more
+ * page is needed, prp2 points straight at it and must be page aligned.
+ * Otherwise prp2 is the address of a list of page pointers, read in chunks
+ * of at most n->max_prp_ents entries.
+ *
+ * Returns NVME_SUCCESS, or NVME_INVALID_FIELD | NVME_DNR when a required
+ * pointer is zero or misaligned.  Data copied before the error was detected
+ * is not undone.
+ */
+static uint32_t nvme_do_prp(uint64_t prp1, uint64_t prp2, uint8_t *buf,
+    uint32_t len, int data_dir, NvmeCtrl *n)
+{
+    /* bytes from prp1 to the end of its page, capped at len */
+    uint32_t trans_len = n->page_size - (prp1 % n->page_size);
+    trans_len = min(len, trans_len);
+
+    if (!prp1) {
+        NVME_LOG(ERR, "null prp1");
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    NVME_LOG(IO_DBG,
+        "controller:%u page size:%u prp1:%lx prp2:%lx buf:%p len:%u dir:%d",
+        n->instance, n->page_size, prp1, prp2, buf, len, data_dir);
+
+    cpu_physical_memory_rw(prp1, buf, trans_len, data_dir);
+    len -= trans_len;
+    buf += trans_len;
+
+    if (len) {
+        if (!prp2) {
+            NVME_LOG(ERR, "null prp2");
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+        if (len > n->page_size) {
+            /* more than one page left: prp2 addresses a list of pointers */
+            uint64_t prp_list[n->max_prp_ents], nents, prp_trans;
+            int i = 0;
+
+            /* pages still to move, and how much of the list to fetch */
+            nents = (uint64_t)((len + n->page_size - 1) >> n->page_bits);
+            prp_trans = min(n->max_prp_ents, nents) * sizeof(uint64_t);
+            cpu_physical_memory_rw(prp2, (uint8_t *)prp_list, prp_trans, 0);
+
+            while (len != 0) {
+                /* last slot of a full list with more than a page to go:
+                 * the slot holds the address of the next list */
+                if (i == n->max_prp_ents - 1 && len > n->page_size) {
+                    if (!prp_list[i] || prp_list[i] & (n->page_size - 1)) {
+                        NVME_LOG(ERR,
+                            "null or unaligned prp chain:%u entry %lx", i,
+                            prp_list[i]);
+                        return NVME_INVALID_FIELD | NVME_DNR;
+                    }
+                    nents = (uint64_t)((len + n->page_size - 1) >>
+                        n->page_bits);
+                    prp_trans = min(n->max_prp_ents, nents) * sizeof(uint64_t);
+                    cpu_physical_memory_rw(prp_list[i], (uint8_t *)prp_list,
+                        prp_trans, 0);
+                    i = 0;
+                }
+                if (!prp_list[i] || prp_list[i] & (n->page_size - 1)) {
+                    NVME_LOG(ERR, "null or unaligned prp list:%u entry %lx", i,
+                        prp_list[i]);
+                    return NVME_INVALID_FIELD | NVME_DNR;
+                }
+
+                NVME_LOG(IO_DBG, "  prp[%u]:%lx", i, prp_list[i]);
+                /* move one page, or whatever is left of the transfer */
+                trans_len = min(len, n->page_size);
+                cpu_physical_memory_rw(prp_list[i], buf, trans_len, data_dir);
+
+                len -= trans_len;
+                buf += trans_len;
+                i++;
+            }
+        } else {
+            /* at most one page left: prp2 is the data pointer itself */
+            if (prp2 & (n->page_size - 1)) {
+                NVME_LOG(ERR, "prp2 alignment");
+                return NVME_INVALID_FIELD | NVME_DNR;
+            }
+            cpu_physical_memory_rw(prp2, buf, len, data_dir);
+        }
+    }
+
+    return NVME_SUCCESS;
+}
+
+/*
+ * Report whether sqid names a live submission queue: 0 when it is below
+ * n->num_queues and its slot in n->sq is populated, -1 otherwise.
+ */
+static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
+{
+    if (sqid < n->num_queues && n->sq[sqid] != NULL) {
+        return 0;
+    }
+    return -1;
+}
+
+/*
+ * Report whether cqid names a live completion queue: 0 when it is below
+ * n->num_queues and its slot in n->cq is populated, -1 otherwise.
+ */
+static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
+{
+    if (cqid < n->num_queues && n->cq[cqid] != NULL) {
+        return 0;
+    }
+    return -1;
+}
+
+/*
+ * Block on the submission queue's event semaphore until it is signalled.
+ *
+ * If the queue is no longer marked active once the wait ends, the calling
+ * thread is terminated with qemu_thread_exit() instead of returning.
+ */
+static void nvme_wait_sq(NvmeSQueue *sq)
+{
+    NVME_LOG(IO_DBG, "controller:%u sqid:%u wait for work",
+        sq->ctrl->instance, sq->id);
+    bsem_get(&sq->event_lock);
+
+    if (!sq->is_active) {
+        NVME_LOG(INFO,
+            "submission queue:%u completed:%lu requests, thread exiting",
+            sq->id, sq->completed);
+        qemu_thread_exit(NULL);
+    } else {
+        NVME_LOG(IO_DBG, "controller:%u sqid:%u wake up",
+            sq->ctrl->instance, sq->id);
+    }
+}
+
+/*
+ * Signal the interrupt of completion queue cq, if the queue has interrupts
+ * enabled.  MSI-X is preferred, then MSI, and finally a pulse on the first
+ * pin interrupt of the device.
+ */
+static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
+{
+    if (!cq->irq_enabled) {
+        return;
+    }
+    if (msix_enabled(&(n->dev))) {
+        msix_notify(&(n->dev), cq->vector);
+        return;
+    }
+    if (msi_enabled(&(n->dev))) {
+        msi_notify(&(n->dev), cq->vector);
+        return;
+    }
+    qemu_irq_pulse(n->dev.irq[0]);
+}
+
+/*
+ * Advance the completion queue tail by one entry.  When the tail reaches the
+ * queue size it wraps to zero and the phase flag is inverted.
+ */
+static void nvme_inc_cq_tail(NvmeCQueue *cq)
+{
+    if (++cq->tail < cq->size) {
+        return;
+    }
+    cq->tail = 0;
+    cq->phase = !cq->phase;
+}
+
+/* Advance the submission queue head by one entry, modulo the queue size. */
+static void nvme_inc_sq_head(NvmeSQueue *sq)
+{
+    sq->head = (1 + sq->head) % sq->size;
+}
+
+/*
+ * Return 1 when the slot after the tail (modulo the queue size) is the head,
+ * i.e. the completion queue cannot take another entry, and 0 otherwise.
+ */
+static uint8_t nvme_cq_full(NvmeCQueue *cq)
+{
+    if ((cq->tail + 1) % cq->size == cq->head) {
+        return 1;
+    }
+    return 0;
+}
+
+/*
+ * Post the completion entry cqe for submission queue sq on completion
+ * queue cq.
+ *
+ * While the completion queue is full, both cq->queue_lock and
+ * sq->queue_lock are dropped, the queue's interrupt is raised and the caller
+ * blocks in nvme_wait_sq() (which may terminate the thread if the submission
+ * queue was deactivated) before the locks are taken again.  Because
+ * sq->queue_lock is released here, the caller presumably holds it on entry;
+ * NOTE(review): confirm against every caller.
+ *
+ * The phase bit, submission queue id and submission queue head are filled
+ * into cqe, the tail is advanced under cq->queue_lock, and the entry is then
+ * copied to the guest address of the slot that was reserved.
+ */
+static void nvme_post_cqe(NvmeCtrl *n, NvmeCQueue *cq, NvmeSQueue *sq,
+    NvmeCqe *cqe)
+{
+    hwaddr addr;
+    uint32_t tail;
+
+    NVME_LOG(IO_DBG, "controller:%u cqid:%u sqid:%u cq head:%u cq tail:%u",
+        n->instance, cq->id, sq->id, cq->head, cq->tail);
+    qemu_mutex_lock(&cq->queue_lock);
+    while (nvme_cq_full(cq)) {
+        qemu_mutex_unlock(&cq->queue_lock);
+        qemu_mutex_unlock(&sq->queue_lock);
+
+        NVME_LOG(INFO, "controller:%u cq:%u full, wait things to clear it",
+            n->instance, cq->id);
+        nvme_isr_notify(n, cq);
+        nvme_wait_sq(sq);
+
+        qemu_mutex_lock(&sq->queue_lock);
+        qemu_mutex_lock(&cq->queue_lock);
+    }
+
+    /* Reserve the current slot; the write itself happens after unlocking. */
+    tail = cq->tail;
+    cqe->status |= cq->phase;
+    cqe->sq_id = sq->id;
+    cqe->sq_head = sq->head;
+    nvme_inc_cq_tail(cq);
+
+    qemu_mutex_unlock(&cq->queue_lock);
+
+    if (cq->phys_contig) {
+        addr = cq->dma_addr + tail * n->cqe_size;
+    } else {
+        addr = nvme_discontig(cq->prp_list, tail, n->page_size, n->cqe_size);
+    }
+
+    /*
+     * Log the slot that was reserved rather than cq->tail: the latter has
+     * already been advanced and may be changed by other threads now that the
+     * lock is released, so it would not match addr.
+     */
+    NVME_LOG(IO_DBG, "controller:%u cqid:%u cq tail:%u addr:%lx", n->instance,
+        cq->id, tail, addr);
+    cpu_physical_memory_rw(addr, (uint8_t *)cqe, sizeof(*cqe), 1);
+}
+
+/*
+ * Record an error in the controller's error log under n->log_lock.
+ *
+ * The entry at n->elp_index is overwritten with the given details and the
+ * running error count (which is then incremented); the index afterwards
+ * moves on by one, modulo n->elpe.
+ */
+static void nvme_set_error_page(NvmeCtrl *n, uint16_t sqid, uint16_t cid,
+    uint16_t status, uint16_t location, uint64_t lba, uint32_t nsid)
+{
+    NvmeErrorLog *entry;
+
+    qemu_mutex_lock(&n->log_lock);
+
+    entry = &n->elpes[n->elp_index];
+    entry->error_count = n->error_count++;
+    entry->nsid = nsid;
+    entry->lba = lba;
+    entry->param_error_location = location;
+    entry->status_field = status;
+    entry->cid = cid;
+    entry->sqid = sqid;
+
+    n->elp_index = (n->elp_index + 1) % n->elpe;
+
+    qemu_mutex_unlock(&n->log_lock);
+}
+
+/*
+ * Deliver queued asynchronous events to the host.
+ *
+ * param is the NvmeCtrl.  The event queue is walked in order; events whose
+ * type is currently masked in n->aer_mask are left queued, every other event
+ * is unlinked, its type is masked, and it is completed on the admin
+ * completion queue using the command id of one outstanding asynchronous
+ * event request.  The walk stops once no such request is outstanding.
+ */
+static void nvme_aer_process_cb(void *param)
+{
+    NvmeCqe cqe;
+    AerResult *result;
+    AsyncEvent *event, *next;
+    NvmeCtrl *n = param;
+
+    NVME_LOG(DBG, "controller:%u outstanding aers:%u mask:%x queue empty:%u",
+        n->instance, n->outstanding_aers, n->aer_mask,
+        QSIMPLEQ_EMPTY(&n->aer_queue));
+
+    QSIMPLEQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
+        if (n->outstanding_aers <= 0) {
+            break;
+        }
+        if (n->aer_mask & (1 << event->result.event_type)) {
+            continue;
+        }
+
+        memset(&cqe, 0x0, sizeof(cqe));
+        /*
+         * Unlink this very event.  Removing the queue head instead would
+         * drop a different (masked, skipped) element whenever an earlier
+         * event was passed over, and the g_free() below would then release
+         * an element that is still linked into the queue.
+         * NOTE(review): QSIMPLEQ_REMOVE needs the struct tag; this assumes
+         * the tag is AsyncEvent like the typedef - confirm in the header.
+         */
+        QSIMPLEQ_REMOVE(&n->aer_queue, event, AsyncEvent, entry);
+        n->aer_mask |= 1 << event->result.event_type;
+        n->outstanding_aers--;
+
+        result = (AerResult *)&cqe.result;
+        result->event_type = event->result.event_type;
+        result->event_info = event->result.event_info;
+        result->log_page   = event->result.log_page;
+        g_free(event);
+
+        cqe.sq_head = n->sq[0]->head;
+        cqe.command_id = n->aer_cid[n->outstanding_aers];
+        cqe.status = NVME_SUCCESS << 1;
+
+        nvme_post_cqe(n, n->cq[0], n->sq[0], &cqe);
+        nvme_isr_notify(n, n->cq[0]);
+    }
+}
+
+/*
+ * Account for a write of blocks slba .. slba + nlb (nlb is zero based, so
+ * nlb + 1 blocks are covered).
+ *
+ * Every block not yet set in the utilisation bitmap is set and counted in
+ * id_ns.nuse, and the same blocks are cleared in the uncorrectable bitmap.
+ */
+static void nvme_update_ns_util(NvmeNamespace *ns, uint64_t slba, uint16_t nlb)
+{
+    uint64_t nr;
+    uint64_t elba = slba + nlb;
+    unsigned long *addr = ns->util;
+
+    for (nr = slba; nr <= elba; nr++) {
+        if (!test_and_set_bit(nr, addr)) {
+            assert(ns->id_ns.nuse < ns->id_ns.nsze);
+            ++ns->id_ns.nuse;
+        }
+    }
+    /*
+     * bitmap_clear() takes a bit count; nlb + 1 matches the inclusive loop
+     * above.  Passing nlb alone left the last block marked, and cleared
+     * nothing at all for a single-block write.
+     */
+    bitmap_clear(ns->uncorrectable, slba, nlb + 1);
+}
+
+/*
+ * Return 1 if any block in slba .. slba + nlb (inclusive) is set in the
+ * namespace's uncorrectable bitmap, 0 otherwise.
+ */
+static int nvme_is_unrecovered(NvmeNamespace *ns, uint64_t slba, uint16_t nlb)
+{
+    unsigned long *bad_blocks = ns->uncorrectable;
+    uint64_t last = slba + nlb;
+    uint64_t lba = slba;
+
+    while (lba <= last) {
+        if (test_bit(lba, bad_blocks)) {
+            return 1;
+        }
+        lba++;
+    }
+    return 0;
+}
+
+/*
+ * Deallocate the nlb blocks starting at slba: each block that was set in the
+ * utilisation bitmap is cleared and id_ns.nuse is decremented for it.
+ */
+static void nvme_dsm_dealloc(NvmeNamespace *ns, uint64_t slba, uint64_t nlb)
+{
+    unsigned long *used = ns->util;
+    uint64_t end = nlb + slba;
+    uint64_t lba = slba;
+
+    while (lba < end) {
+        if (test_and_clear_bit(lba, used)) {
+            assert(ns->id_ns.nuse > 0);
+            --ns->id_ns.nuse;
+        }
+        lba++;
+    }
+}
+
+/*
+ * Update the per-namespace command and data-unit counters after a transfer
+ * of nlb + 1 blocks.  A non-zero rw updates the read counters, zero updates
+ * the write counters.
+ *
+ * Each counter is a pair: element [1] is bumped when element [0] wraps.
+ * Blocks are collected in a remainder counter and converted to data units
+ * in steps of 1000.
+ */
+static void nvme_update_stats(NvmeNamespace *ns, uint16_t nlb, int rw)
+{
+    uint64_t before;
+
+    if (rw) {
+        ++ns->host_read_commands[0];
+        if (ns->host_read_commands[0] == 0) {
+            ++ns->host_read_commands[1];
+        }
+
+        before = ns->data_units_read[0];
+        ns->read_data_counter += nlb + 1;
+        ns->data_units_read[0] += (ns->read_data_counter / 1000);
+        ns->read_data_counter %= 1000;
+        if (ns->data_units_read[0] < before) {
+            ++ns->data_units_read[1];
+        }
+        return;
+    }
+
+    ++ns->host_write_commands[0];
+    if (ns->host_write_commands[0] == 0) {
+        ++ns->host_write_commands[1];
+    }
+
+    before = ns->data_units_written[0];
+    ns->write_data_counter += nlb + 1;
+    ns->data_units_written[0] += (ns->write_data_counter / 1000);
+    ns->write_data_counter %= 1000;
+    if (ns->data_units_written[0] < before) {
+        ++ns->data_units_written[1];
+    }
+}
+
+/*
+ * Handle the Dataset Management command.
+ *
+ * The low byte of cdw10 is the zero-based number of ranges.  The range
+ * descriptors are fetched from guest memory through the command's PRPs and,
+ * only when the deallocate attribute (NVME_DSMGMT_AD) is set in cdw11, each
+ * range is bounds-checked against id_ns.ncap and deallocated.
+ *
+ * Returns NVME_SUCCESS, NVME_INVALID_FIELD | NVME_DNR when the descriptors
+ * cannot be fetched, or NVME_LBA_RANGE | NVME_DNR for a range beyond the
+ * namespace capacity.  Ranges preceding a bad one stay deallocated.
+ */
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
+    uint16_t sqid)
+{
+    uint16_t nr = (cmd->cdw10 & 0xff) + 1;
+    NvmeDsmRange range[nr];
+
+    /*
+     * Direction 0 copies guest memory into the buffer (nvme_do_prp() fetches
+     * its PRP lists the same way).  Direction 1 would write the
+     * uninitialised stack array out to the guest and leave garbage to parse.
+     */
+    if (nvme_do_prp(cmd->prp1, cmd->prp2, (uint8_t *) range, sizeof(range),
+            0, n)) {
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_INVALID_FIELD,
+            offsetof(NvmeCmd, prp1), 0, ns->id);
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    if (cmd->cdw11 & NVME_DSMGMT_AD) {
+        int i;
+        uint64_t slba, nlb;
+        for (i = 0; i < nr; i++) {
+            slba = range[i].slba;
+            nlb = range[i].nlb;
+            /* Written so that slba + nlb cannot wrap past the check. */
+            if (slba > ns->id_ns.ncap || nlb > ns->id_ns.ncap - slba) {
+                NVME_LOG(ERR, "range error, slba:%ld nlb:%ld size:%ld", slba,
+                    nlb, ns->id_ns.nsze);
+                nvme_set_error_page(n, sqid, cmd->cid, NVME_LBA_RANGE,
+                    offsetof(NvmeCmd, cdw10), slba + nlb, ns->id);
+                return NVME_LBA_RANGE | NVME_DNR;
+            }
+            nvme_dsm_dealloc(ns, slba, nlb);
+        }
+    }
+    return NVME_SUCCESS;
+}
+
+/*
+ * Handle the Read and Write I/O commands.
+ *
+ * nlb is zero based: the command covers blocks slba .. slba + nlb.  The
+ * transfer direction handed to nvme_do_prp() is 0 for NVME_CMD_WRITE and 1
+ * otherwise.  The request is rejected if it leaves the namespace, exceeds
+ * the controller's maximum transfer size (when mdts is non-zero), hits an
+ * unmapped namespace, or reads a range marked uncorrectable.
+ *
+ * When the LBA format carries metadata, it is either appended to the data
+ * transfer (extended LBA) or moved separately through the command's
+ * metadata pointer.
+ *
+ * On a successful transfer the statistics are updated; after a write the
+ * utilisation bitmap is updated too and a SMART spare-threshold event is
+ * queued once when usage exceeds ns->nuse_thresh.
+ *
+ * Returns the NVMe status for the command; failures are also recorded in
+ * the error log.
+ */
+static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
+    uint16_t sqid)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+    uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    uint32_t blk_sze = (1 << ns->id_ns.lbaf[lba_index].ds);
+    /* Widen before multiplying so large block sizes cannot wrap 32 bits. */
+    uint64_t data_size = (uint64_t)(rw->nlb + 1) * blk_sze;
+    uint64_t meta_size = (uint64_t)(rw->nlb + 1) *
+        ns->id_ns.lbaf[lba_index].ms;
+    uint8_t *buf = ns->mapping_addr;
+    int data_dir = rw->opcode == NVME_CMD_WRITE ? 0 : 1;
+    uint16_t ret;
+
+    NVME_LOG(IO_DBG, "nsid:%u slba:%lu nlb:%u data size:%lu meta:%lu", ns->id,
+        rw->slba, rw->nlb, data_size, meta_size);
+    /*
+     * The last block touched is slba + nlb and must be below nsze.  The
+     * test is phrased as a subtraction so that a huge slba cannot wrap the
+     * sum, and uses >= so that slba + nlb == nsze is rejected as well.
+     */
+    if (rw->slba >= ns->id_ns.nsze ||
+            rw->nlb >= ns->id_ns.nsze - rw->slba) {
+        NVME_LOG(ERR, "range error, slba:%ld nlb:%u size:%ld", rw->slba,
+            rw->nlb, ns->id_ns.nsze);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_LBA_RANGE,
+            offsetof(NvmeRwCmd, slba), rw->slba + rw->nlb, ns->id);
+        return NVME_LBA_RANGE | NVME_DNR;
+    }
+    if (n->id_ctrl.mdts && data_size > n->page_size * (1 << n->id_ctrl.mdts)) {
+        NVME_LOG(ERR, "transfer size error, mdts:%u data size:%lu",
+            n->id_ctrl.mdts, data_size);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_INVALID_FIELD,
+            offsetof(NvmeRwCmd, nlb), rw->slba + rw->nlb, ns->id);
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    if (!buf) {
+        NVME_LOG(ERR, "namespace:%u is not ready", ns->id);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_NS_NOT_READY,
+            offsetof(NvmeRwCmd, nsid), 0, ns->id);
+        return NVME_NS_NOT_READY;
+    }
+    if (data_dir && nvme_is_unrecovered(ns, rw->slba, rw->nlb)) {
+        NVME_LOG(ERR, "nsid:%u slba:%lx nlb:%u access uncorrectable range",
+            ns->id, rw->slba, rw->nlb);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_UNRECOVERED_READ,
+            offsetof(NvmeRwCmd, slba), rw->slba + rw->nlb, ns->id);
+        return NVME_UNRECOVERED_READ;
+    }
+    if (meta_size) {
+        if (NVME_ID_NS_FLBAS_EXTENDED(ns->id_ns.flbas)) {
+            /* Metadata travels inline with the data. */
+            data_size += meta_size;
+        } else {
+            uint8_t *meta_buf = ns->meta_mapping_addr;
+            if (!rw->mptr) {
+                NVME_LOG(ERR, "no meta pointer provided for command");
+                nvme_set_error_page(n, sqid, cmd->cid, NVME_INVALID_FIELD,
+                    offsetof(NvmeRwCmd, mptr), 0, ns->id);
+                return NVME_INVALID_FIELD | NVME_DNR;
+            }
+            if (!meta_buf) {
+                NVME_LOG(ERR, "namespace:%u meta data is not ready", ns->id);
+                nvme_set_error_page(n, sqid, cmd->cid, NVME_NS_NOT_READY,
+                    offsetof(NvmeRwCmd, nsid), 0, ns->id);
+                return NVME_NS_NOT_READY;
+            }
+
+            meta_buf += (rw->slba * ns->id_ns.lbaf[lba_index].ms);
+            cpu_physical_memory_rw(rw->mptr, meta_buf, meta_size, data_dir);
+        }
+    }
+
+    buf += (rw->slba * blk_sze);
+    ret = nvme_do_prp(rw->prp1, rw->prp2, buf, data_size, data_dir, n);
+    if (ret == NVME_SUCCESS) {
+        nvme_update_stats(ns, rw->nlb, data_dir);
+        if (!data_dir) {
+            nvme_update_ns_util(ns, rw->slba, rw->nlb);
+            if (!ns->thresh_warn_issued && ns->id_ns.nuse > ns->nuse_thresh) {
+                nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
+                    NVME_AER_INFO_SMART_SPARE_THRESH,
+                    NVME_LOG_SMART_INFO);
+                ns->thresh_warn_issued = 1;
+            }
+        }
+    }
+    return ret;
+}
+
+/*
+ * Handle the Write Uncorrectable command: mark blocks slba .. slba + nlb
+ * (nlb is zero based) in the namespace's uncorrectable bitmap, after
+ * accounting for them as written.
+ *
+ * Returns NVME_SUCCESS, or NVME_LBA_RANGE | NVME_DNR when the range leaves
+ * the namespace.
+ */
+static uint16_t nvme_write_uncor(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
+    uint16_t sqid)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+    unsigned long *addr = ns->uncorrectable;
+
+    NVME_LOG(IO_DBG, "nsid:%u slba:%lu nlb:%u", ns->id, rw->slba, rw->nlb);
+    /*
+     * The last block, slba + nlb, must be below nsze; the subtraction form
+     * keeps a huge slba from wrapping the sum.
+     */
+    if (rw->slba >= ns->id_ns.nsze ||
+            rw->nlb >= ns->id_ns.nsze - rw->slba) {
+        NVME_LOG(ERR, "range error, slba:%ld nlb:%u nsze:%ld", rw->slba,
+            rw->nlb, ns->id_ns.nsze);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_LBA_RANGE,
+            offsetof(NvmeRwCmd, slba), rw->slba + rw->nlb, ns->id);
+        return NVME_LBA_RANGE | NVME_DNR;
+    }
+
+    nvme_update_ns_util(ns, rw->slba, rw->nlb);
+    /* bitmap_set() takes a bit count, and the command covers nlb + 1. */
+    bitmap_set(addr, rw->slba, rw->nlb + 1);
+    return NVME_SUCCESS;
+}
+
+/*
+ * Handle the Flush command by synchronising the namespace mapping with
+ * msync(MS_SYNC).
+ *
+ * Returns NVME_SUCCESS, NVME_NS_NOT_READY when the namespace has no
+ * mapping, or NVME_WRITE_FAULT when msync() fails.  Both failures are
+ * recorded in the error log.
+ */
+static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
+    uint16_t sqid)
+{
+    uint16_t status = NVME_SUCCESS;
+
+    NVME_LOG(IO_DBG, "controller:%u nsid:%u", n->instance, ns->id);
+
+    if (!ns->mapping_addr) {
+        NVME_LOG(ERR, "namespace:%u is not ready", ns->id);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_NS_NOT_READY,
+            offsetof(NvmeCmd, nsid), 0, ns->id);
+        status = NVME_NS_NOT_READY;
+    } else if (msync(ns->mapping_addr, ns->mapping_size, MS_SYNC) != 0) {
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_WRITE_FAULT, 0, 0, ns->id);
+        status = NVME_WRITE_FAULT;
+    }
+
+    return status;
+}
+
+/*
+ * Handle the Compare command: fetch nlb + 1 blocks (nlb is zero based) from
+ * the guest buffers described by the command's PRPs and compare them with
+ * the namespace contents starting at slba.
+ *
+ * Returns NVME_SUCCESS when the data matches, NVME_CMP_FAILURE when it does
+ * not, or the error status of a failed validation or PRP transfer.
+ * Validation failures are also recorded in the error log.
+ */
+static uint16_t nvme_compare(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
+    uint16_t sqid)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+    uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    uint32_t blk_sze = (1 << ns->id_ns.lbaf[lba_index].ds);
+    uint64_t data_size = (uint64_t)(rw->nlb + 1) * blk_sze;
+    uint8_t *buf = ns->mapping_addr;
+    uint8_t *cmp_buf;
+    uint16_t ret;
+
+    NVME_LOG(IO_DBG, "nsid:%u slba:%lu nlb:%u data size:%lu", ns->id, rw->slba,
+        rw->nlb, data_size);
+    /*
+     * The last block, slba + nlb, must be below nsze; the subtraction form
+     * keeps a huge slba from wrapping the sum.
+     */
+    if (rw->slba >= ns->id_ns.nsze ||
+            rw->nlb >= ns->id_ns.nsze - rw->slba) {
+        NVME_LOG(ERR, "range error, nsid:%u slba:%ld nlb:%u nsze:%ld", ns->id,
+            rw->slba, rw->nlb, ns->id_ns.nsze);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_LBA_RANGE,
+            offsetof(NvmeRwCmd, slba), rw->slba + rw->nlb, ns->id);
+        return NVME_LBA_RANGE | NVME_DNR;
+    }
+    if (n->id_ctrl.mdts && data_size > n->page_size * (1 << n->id_ctrl.mdts)) {
+        NVME_LOG(ERR, "transfer size error, nsid:%u mdts:%u data size:%lu",
+            ns->id, n->id_ctrl.mdts, data_size);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_INVALID_FIELD,
+            offsetof(NvmeRwCmd, nlb), rw->slba + rw->nlb, ns->id);
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    if (!buf) {
+        NVME_LOG(ERR, "nsid:%u is not ready", ns->id);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_NS_NOT_READY,
+            offsetof(NvmeRwCmd, nsid), 0, ns->id);
+        return NVME_NS_NOT_READY;
+    }
+    if (nvme_is_unrecovered(ns, rw->slba, rw->nlb)) {
+        NVME_LOG(ERR, "nsid:%u slba:%lx nlb:%u access uncorrectable range",
+            ns->id, rw->slba, rw->nlb);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_UNRECOVERED_READ,
+            offsetof(NvmeRwCmd, slba), rw->slba + rw->nlb, ns->id);
+        return NVME_UNRECOVERED_READ;
+    }
+
+    /*
+     * Allocate only once the command is known to be valid, and release the
+     * buffer on every path below; it used to be leaked on each call.
+     */
+    cmp_buf = g_malloc(data_size);
+    buf += (rw->slba * blk_sze);
+    /*
+     * Direction 0 copies the guest's data into cmp_buf, as for a write.
+     * Direction 1 would overwrite the guest's buffer with the uninitialised
+     * allocation and compare against garbage.
+     */
+    ret = nvme_do_prp(rw->prp1, rw->prp2, cmp_buf, data_size, 0, n);
+    if (ret == NVME_SUCCESS && memcmp(buf, cmp_buf, data_size)) {
+        ret = NVME_CMP_FAILURE;
+    }
+    g_free(cmp_buf);
+    return ret;
+}
+
+/*
+ * Dispatch one I/O command to its handler.
+ *
+ * The namespace id is validated first (1 .. n->num_namespaces); an invalid
+ * id is logged in the error page and fails with
+ * NVME_INVALID_NSID | NVME_DNR.  The handler runs with the namespace's
+ * format_lock held for reading.  Unknown opcodes yield
+ * NVME_INVALID_OPCODE | NVME_DNR.  result is not used here.
+ */
+static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, uint32_t *result,
+    uint16_t sqid)
+{
+    NvmeNamespace *ns;
+    uint16_t status;
+
+    if (cmd->nsid == 0 || cmd->nsid > n->num_namespaces) {
+        NVME_LOG(ERR, "invalid nsid:%u", cmd->nsid);
+        nvme_set_error_page(n, sqid, cmd->cid, NVME_INVALID_NSID,
+                offsetof(NvmeCmd, nsid), 0, cmd->nsid);
+        return NVME_INVALID_NSID | NVME_DNR;
+    }
+
+    /* Namespace ids are 1-based, the array is 0-based. */
+    ns = &n->namespaces[cmd->nsid - 1];
+    NVME_LOG(IO_DBG, "controller:%u nsid:%u opcode:%x", n->instance, ns->id,
+        cmd->opcode);
+
+    pthread_rwlock_rdlock(&ns->format_lock);
+    if (cmd->opcode == NVME_CMD_FLUSH) {
+        status = nvme_flush(n, ns, cmd, sqid);
+    } else if (cmd->opcode == NVME_CMD_WRITE ||
+            cmd->opcode == NVME_CMD_READ) {
+        status = nvme_rw(n, ns, cmd, sqid);
+    } else if (cmd->opcode == NVME_CMD_DSM) {
+        status = nvme_dsm(n, ns, cmd, sqid);
+    } else if (cmd->opcode == NVME_CMD_WRITE_UNCOR) {
+        status = nvme_write_uncor(n, ns, cmd, sqid);
+    } else if (cmd->opcode == NVME_CMD_COMPARE) {
+        status = nvme_compare(n, ns, cmd, sqid);
+    } else {
+        status = NVME_INVALID_OPCODE | NVME_DNR;
+    }
+    pthread_rwlock_unlock(&ns->format_lock);
+
+    return status;
+}
+
+/*
+ * Handle the Delete I/O Submission Queue admin command.
+ *
+ * Queue id 0 and ids that do not name a live queue are rejected with
+ * NVME_INVALID_QID | NVME_DNR.  Otherwise the queue is unlinked from its
+ * completion queue (if that one is still valid), removed from the
+ * controller's table, marked inactive and signalled; its worker thread is
+ * joined before the semaphore, mutex and memory are released.
+ *
+ * NOTE(review): the worker is joined while sq->queue_lock is held; whether
+ * the worker can block on that lock is not visible here - confirm.
+ */
+static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    NvmeDeleteQ *del = (NvmeDeleteQ *)cmd;
+    NvmeSQueue *sq;
+
+    if (!del->qid || nvme_check_sqid(n, del->qid)) {
+        NVME_LOG(ERR, "invalid sqid:%u", del->qid);
+        return NVME_INVALID_QID | NVME_DNR;
+    }
+
+    sq = n->sq[del->qid];
+    qemu_mutex_lock(&sq->queue_lock);
+
+    if (nvme_check_cqid(n, sq->cqid)) {
+        NVME_LOG(ERR, "sq:%u does not contain valid cq:%u", sq->id,
+            sq->cqid);
+    } else {
+        NvmeCQueue *owner = n->cq[sq->cqid];
+        QTAILQ_REMOVE(&owner->sq_list, sq, entry);
+    }
+    n->sq[del->qid] = NULL;
+
+    NVME_LOG(INFO, "controller:%u sqid:%u cqid:%u", n->instance, del->qid,
+        sq->cqid);
+
+    /* Tell the worker thread to stop, wake it up and wait for it. */
+    sq->is_active = 0;
+    bsem_put(&sq->event_lock);
+    qemu_thread_join(&sq->process_thread);
+    bsem_destroy(&sq->event_lock);
+    qemu_mutex_unlock(&sq->queue_lock);
+
+    qemu_mutex_destroy(&sq->queue_lock);
+    g_free(sq->prp_list);   /* g_free() accepts NULL */
+    g_free(sq);
+
+    return NVME_SUCCESS;
+}
+
+/*
+ * Handle the Create I/O Submission Queue admin command.
+ *
+ * The command is validated (existing completion queue, free and in-range
+ * submission queue id, queue size, page-aligned prp1, contiguity when the
+ * controller requires it).  A new queue is then allocated, its worker
+ * thread started, and the queue registered in n->sq and on the completion
+ * queue's list.
+ *
+ * Returns NVME_SUCCESS or the status of the first failed check.
+ */
+static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    NvmeCreateSq *c = (NvmeCreateSq *)cmd;
+
+    NvmeSQueue *sq;
+    NvmeCQueue *cq;
+
+    if (!c->cqid || nvme_check_cqid(n, c->cqid)) {
+        NVME_LOG(ERR, "invalid cqid:%u", c->cqid);
+        return NVME_INVALID_CQID | NVME_DNR;
+    }
+    /*
+     * The id must be non-zero, inside the queue table and not in use.
+     * nvme_check_sqid() also fails for out-of-range ids, so "check failed"
+     * alone cannot be read as "slot is free": without the explicit bound an
+     * id >= num_queues reached the n->sq[] store below.
+     */
+    if (!c->sqid || c->sqid >= n->num_queues || n->sq[c->sqid] != NULL) {
+        NVME_LOG(ERR, "invalid sqid:%u", c->sqid);
+        return NVME_INVALID_QID | NVME_DNR;
+    }
+    if (!c->qsize || c->qsize > NVME_CAP_MQES(n->bar.cap)) {
+        NVME_LOG(ERR, "invalid size");
+        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
+    }
+    if (!c->prp1 || c->prp1 & (n->page_size - 1)) {
+        NVME_LOG(ERR, "invalid prp:%lx", c->prp1);
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    if (!(c->sq_flags & 1) && NVME_CAP_CQR(n->bar.cap)) {
+        NVME_LOG(ERR, "invalid prp discontinuity");
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    sq = g_malloc0(sizeof(*sq));
+    if (!(c->sq_flags & 1)) {
+        /* Bit 0 clear: prp1 points at a list of pages backing the queue. */
+        sq->prp_list = nvme_setup_discontig(c->prp1, c->qsize + 1,
+            n->page_size, n->sqe_size);
+        if (!sq->prp_list) {
+            g_free(sq);
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+    } else {
+        sq->dma_addr = c->prp1;
+    }
+    sq->id = c->sqid;
+    sq->size = c->qsize + 1;    /* qsize is zero based */
+    sq->phys_contig = c->sq_flags & 1;
+    sq->cqid = c->cqid;
+    sq->is_active = 1;
+    sq->ctrl = n;
+
+    NVME_LOG(INFO, "controller:%u sq:%u cq:%u size:%u", n->instance, c->sqid,
+        c->cqid, c->qsize);
+
+    qemu_mutex_init(&sq->queue_lock);
+    bsem_init(&sq->event_lock);
+
+    qemu_thread_create(&sq->process_thread, nvme_sq_thread, sq,
+        QEMU_THREAD_JOINABLE);
+
+    n->sq[c->sqid] = sq;
+    cq = n->cq[c->cqid];
+    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
+
+    return NVME_SUCCESS;
+}
+
+/*
+ * Handle the Delete I/O Completion Queue admin command.
+ *
+ * Queue id 0 and ids that do not name a live queue fail with
+ * NVME_INVALID_CQID | NVME_DNR; a queue that still has submission queues
+ * attached fails with NVME_INVALID_QUEUE_DEL.  Otherwise the MSI-X vector
+ * is released, the queue is removed from the controller's table and its
+ * resources are freed.
+ */
+static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    NvmeDeleteQ *del = (NvmeDeleteQ *)cmd;
+    NvmeCQueue *cq;
+
+    if (!del->qid || nvme_check_cqid(n, del->qid)) {
+        NVME_LOG(ERR, "invalid cqid:%u", del->qid);
+        return NVME_INVALID_CQID | NVME_DNR;
+    }
+
+    cq = n->cq[del->qid];
+    qemu_mutex_lock(&cq->queue_lock);
+
+    if (!QTAILQ_EMPTY(&cq->sq_list)) {
+        NVME_LOG(ERR, "error, cq:%u sq(s) are still connected to CQ",
+            del->qid);
+        qemu_mutex_unlock(&cq->queue_lock);
+        return NVME_INVALID_QUEUE_DEL;
+    }
+
+    NVME_LOG(INFO, "controller:%u cqid:%u vector:%u", n->instance, del->qid,
+        cq->vector);
+
+    msix_vector_unuse(&n->dev, cq->vector);
+    n->cq[del->qid] = NULL;
+    qemu_mutex_unlock(&cq->queue_lock);
+
+    qemu_mutex_destroy(&cq->queue_lock);
+    g_free(cq->prp_list);   /* g_free() accepts NULL */
+    g_free(cq);
+
+    return NVME_SUCCESS;
+}
+
+/*
+ * Handle the Create I/O Completion Queue admin command.
+ *
+ * The command is validated (free and in-range queue id, queue size,
+ * page-aligned prp1, interrupt vector, contiguity when the controller
+ * requires it).  A new queue is then allocated, registered in n->cq and its
+ * MSI-X vector marked in use.
+ *
+ * Returns NVME_SUCCESS or the status of the first failed check.
+ */
+static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    NvmeCreateCq *c = (NvmeCreateCq *)cmd;
+    NvmeCQueue *cq;
+
+    /*
+     * The id must be non-zero, inside the queue table and not in use.
+     * nvme_check_cqid() also fails for out-of-range ids, so "check failed"
+     * alone cannot be read as "slot is free": without the explicit bound an
+     * id >= num_queues reached the n->cq[] store below.
+     */
+    if (!c->cqid || c->cqid >= n->num_queues || n->cq[c->cqid] != NULL) {
+        NVME_LOG(ERR, "invalid cqid:%u", c->cqid);
+        return NVME_INVALID_CQID | NVME_DNR;
+    }
+    if (!c->qsize || c->qsize > NVME_CAP_MQES(n->bar.cap)) {
+        NVME_LOG(ERR, "invalid size:%u", c->qsize);
+        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
+    }
+    /* Same rule as for submission queues: non-null and page aligned. */
+    if (!c->prp1 || c->prp1 & (n->page_size - 1)) {
+        NVME_LOG(ERR, "invalid prp:%lx", c->prp1);
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    if (c->irq_vector > n->num_queues) {
+        NVME_LOG(ERR, "invalid vector:%u", c->irq_vector);
+        return NVME_INVALID_IRQ_VECTOR;
+    }
+    if (!(c->cq_flags & 1) && NVME_CAP_CQR(n->bar.cap)) {
+        NVME_LOG(ERR, "invalid prp discontinuity");
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    cq = g_malloc0(sizeof(*cq));
+    if (!(c->cq_flags & 1)) {
+        /* Bit 0 clear: prp1 points at a list of pages backing the queue. */
+        cq->prp_list = nvme_setup_discontig(c->prp1, c->qsize + 1,
+            n->page_size, n->cqe_size);
+        if (!cq->prp_list) {
+            g_free(cq);
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+    }
+    cq->id = c->cqid;
+    cq->dma_addr = c->prp1;     /* recorded for both layouts */
+    cq->irq_enabled = (c->cq_flags & 2) >> 1;
+    cq->vector = c->irq_vector;
+    cq->phase = 1;
+    cq->size = c->qsize + 1;    /* qsize is zero based */
+    cq->phys_contig = c->cq_flags & 1;
+
+    QTAILQ_INIT(&cq->sq_list);
+    qemu_mutex_init(&cq->queue_lock);
+
+    n->cq[c->cqid] = cq;
+    msix_vector_use(&n->dev, cq->vector);
+
+    NVME_LOG(INFO,
+        "controller:%u cq:%u size:%u vector:%u", n->instance, c->cqid,
+        c->qsize, cq->vector);
+    return NVME_SUCCESS;
+}
+
+/*
+ * Handle the Identify admin command.
+ *
+ * A non-zero cns returns the controller data structure; otherwise the
+ * data structure of namespace nsid (1 .. n->num_namespaces) is returned.
+ * The data is written to the guest through the command's PRPs.
+ *
+ * Returns the status of the PRP transfer, or NVME_INVALID_NSID | NVME_DNR
+ * for an unknown namespace.
+ */
+static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    NvmeIdentify *c = (NvmeIdentify *)cmd;
+    NvmeNamespace *ns;
+
+    NVME_LOG(DBG, "controller:%u cns:%u nsid:%u", n->instance, c->cns, c->nsid);
+    if (c->cns) {
+        return nvme_do_prp(cmd->prp1, cmd->prp2, (uint8_t *)&n->id_ctrl,
+            sizeof(n->id_ctrl), 1, n);
+    }
+
+    if (c->nsid == 0 || c->nsid > n->num_namespaces) {
+        NVME_LOG(ERR, "controller:%u invalid nsid:%u, namespaces:%u",
+            n->instance, c->nsid, n->num_namespaces);
+        return NVME_INVALID_NSID | NVME_DNR;
+    }
+
+    /* Namespace ids are 1-based, the array is 0-based. */
+    ns = &n->namespaces[c->nsid - 1];
+    /* Size the transfer by the structure actually sent, not by id_ctrl. */
+    return nvme_do_prp(cmd->prp1, cmd->prp2, (uint8_t *)&ns->id_ns,
+        sizeof(ns->id_ns), 1, n);
+}
+
+/*
+ * Handle the Get Features admin command.
+ *
+ * cdw10 selects the feature.  Most features return their stored value in
+ * *result.  The LBA range type feature instead writes the namespace's range
+ * table to the guest through the command's PRPs, limited to the number of
+ * entries given in the low six bits of cdw11.
+ *
+ * Returns NVME_SUCCESS, the status of the PRP transfer,
+ * NVME_INVALID_NSID | NVME_DNR for an unknown namespace, or
+ * NVME_INVALID_FIELD | NVME_DNR for an unknown feature or a vector number
+ * above n->num_queues.
+ */
+static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, uint32_t *result)
+{
+    uint32_t dw11 = cmd->cdw11;
+    NvmeNamespace *ns;
+
+    NVME_LOG(DBG, "controller:%u feature id:%x", n->instance, cmd->cdw10);
+    switch (cmd->cdw10) {
+    case NVME_ARBITRATION:
+        *result = n->features.arbitration;
+        break;
+    case NVME_POWER_MANAGEMENT:
+        *result = n->features.power_mgmt;
+        break;
+    case NVME_LBA_RANGE_TYPE:
+        /*
+         * Validate the namespace id and convert it to a 0-based index, as
+         * nvme_io_cmd() and nvme_identify() do; using it unchecked and
+         * unconverted indexed past the intended (or last) namespace.
+         */
+        if (cmd->nsid == 0 || cmd->nsid > n->num_namespaces) {
+            NVME_LOG(ERR, "controller:%u invalid nsid:%u, namespaces:%u",
+                n->instance, cmd->nsid, n->num_namespaces);
+            return NVME_INVALID_NSID | NVME_DNR;
+        }
+        ns = &n->namespaces[cmd->nsid - 1];
+        return nvme_do_prp(cmd->prp1, cmd->prp2,
+                (uint8_t *)ns->lba_range,
+                min(sizeof(ns->lba_range),
+                    (dw11 & 0x3f) * sizeof(NvmeRangeType)), 1, n);
+    case NVME_TEMPERATURE_THRESHOLD:
+        *result = n->features.temp_thresh;
+        break;
+    case NVME_ERROR_RECOVERY:
+        *result = n->features.err_rec;
+        break;
+    case NVME_VOLATILE_WRITE_CACHE:
+        *result = n->features.volatile_wc;
+        break;
+    case NVME_NUMBER_OF_QUEUES:
+        *result = n->features.num_queues;
+        break;
+    case NVME_INTERRUPT_COALESCING:
+        *result = n->features.int_coalescing;
+        break;
+    case NVME_INTERRUPT_VECTOR_CONF:
+        if ((dw11 & 0xffff) > n->num_queues) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+        *result = n->features.int_vector_config[dw11 & 0xffff];
+        break;
+    case NVME_WRITE_ATOMICITY:
+        *result = n->features.write_atomicity;
+        break;
+    case NVME_ASYNCHRONOUS_EVENT_CONF:
+        *result = n->features.async_config;
+        break;
+    case NVME_SOFTWARE_PROGRESS_MARKER:
+        *result = n->features.sw_prog_marker;
+        break;
+    default:
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    return NVME_SUCCESS;
+}
+
+/*
+ * Handle the Set Features admin command.
+ *
+ * cdw10 selects the feature and cdw11 carries the new value.  The LBA range
+ * type feature instead reads the namespace's range table from the guest
+ * through the command's PRPs.  Setting the temperature threshold at or
+ * below the current temperature queues a SMART event once.  The volatile
+ * write cache and interrupt coalescing features are accepted but not
+ * stored; the number-of-queues feature only reports the current value.
+ *
+ * Returns NVME_SUCCESS, the status of the PRP transfer,
+ * NVME_INVALID_NSID | NVME_DNR for an unknown namespace, or
+ * NVME_INVALID_FIELD | NVME_DNR for an unknown feature or a vector number
+ * above n->num_queues.
+ */
+static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, uint32_t *result)
+{
+    uint32_t dw11 = cmd->cdw11;
+    NvmeNamespace *ns;
+
+    NVME_LOG(DBG, "controller:%u feature id:%x", n->instance, cmd->cdw10);
+    switch (cmd->cdw10) {
+    case NVME_ARBITRATION:
+        n->features.arbitration = dw11;
+        break;
+    case NVME_POWER_MANAGEMENT:
+        n->features.power_mgmt = dw11;
+        break;
+    case NVME_LBA_RANGE_TYPE:
+        /*
+         * Validate the namespace id and convert it to a 0-based index, as
+         * nvme_io_cmd() and nvme_identify() do; using it unchecked and
+         * unconverted let the guest overwrite memory past the intended (or
+         * last) namespace.
+         */
+        if (cmd->nsid == 0 || cmd->nsid > n->num_namespaces) {
+            NVME_LOG(ERR, "controller:%u invalid nsid:%u, namespaces:%u",
+                n->instance, cmd->nsid, n->num_namespaces);
+            return NVME_INVALID_NSID | NVME_DNR;
+        }
+        ns = &n->namespaces[cmd->nsid - 1];
+        return nvme_do_prp(cmd->prp1, cmd->prp2,
+                (uint8_t *)ns->lba_range,
+                min(sizeof(ns->lba_range),
+                    (dw11 & 0x3f) * sizeof(NvmeRangeType)), 0, n);
+    case NVME_TEMPERATURE_THRESHOLD:
+        n->features.temp_thresh = dw11;
+        if (n->features.temp_thresh <= n->temperature && !n->temp_warn_issued) {
+            n->temp_warn_issued = 1;
+            nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
+                    NVME_AER_INFO_SMART_TEMP_THRESH,
+                    NVME_LOG_SMART_INFO);
+        } else if (n->features.temp_thresh > n->temperature &&
+                !(n->aer_mask & 1 << NVME_AER_TYPE_SMART)) {
+            /* Threshold is above the temperature again: re-arm the warning. */
+            n->temp_warn_issued = 0;
+        }
+        break;
+    case NVME_ERROR_RECOVERY:
+        n->features.err_rec = dw11;
+        break;
+    case NVME_VOLATILE_WRITE_CACHE:
+        break;
+    case NVME_NUMBER_OF_QUEUES:
+        *result = n->features.num_queues;
+        break;
+    case NVME_INTERRUPT_COALESCING:
+        break;
+    case NVME_INTERRUPT_VECTOR_CONF:
+        if ((dw11 & 0xffff) > n->num_queues) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+        n->features.int_vector_config[dw11 & 0xffff] = dw11 & 0x1ffff;
+        break;
+    case NVME_WRITE_ATOMICITY:
+        n->features.write_atomicity = dw11;
+        break;
+    case NVME_ASYNCHRONOUS_EVENT_CONF:
+        n->features.async_config = dw11;
+        break;
+    case NVME_SOFTWARE_PROGRESS_MARKER:
+        n->features.sw_prog_marker = dw11;
+        break;
+    default:
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    return NVME_SUCCESS;
+}
+
+/*
+ * Return the firmware slot information log page.
+ *
+ * At most buf_len bytes of the log are written to the guest through the
+ * command's PRPs; a shorter buffer is logged as an error but still served.
+ * No firmware slot data is filled in here, so the page is all zeroes.
+ *
+ * Returns the status of the PRP transfer.
+ */
+static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len)
+{
+    uint32_t trans_len;
+    NvmeFwSlotInfoLog fw_log;
+
+    trans_len = min(sizeof(fw_log), buf_len);
+    if (buf_len < sizeof(fw_log)) {
+        NVME_LOG(ERR, "not enough memory, needs %ld, has %u bytes",
+            sizeof(fw_log), buf_len);
+    }
+
+    /* Never hand uninitialised host stack contents to the guest. */
+    memset(&fw_log, 0x0, sizeof(fw_log));
+
+    return nvme_do_prp(cmd->prp1, cmd->prp2, (uint8_t *)&fw_log, trans_len,
+        1, n);
+}
+
+/*
+ * Return the error information log page.
+ *
+ * Up to buf_len bytes of the n->elpe error log entries are written to the
+ * guest through the command's PRPs; a buffer that is not a multiple of the
+ * entry size is logged as an error but still served.  Reading the page
+ * clears the error bit in the event mask and, if events are queued, arms
+ * the event timer.
+ *
+ * Returns the status of the PRP transfer.
+ */
+static uint16_t nvme_error_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len)
+{
+    uint32_t trans_len = min(sizeof(*n->elpes) * n->elpe, buf_len);
+
+    if (buf_len % sizeof(*n->elpes) != 0) {
+        NVME_LOG(ERR, "buffer not multiple of log page size:%lu",
+            sizeof(*n->elpes));
+    }
+
+    /* Error events may be reported again now that the host read the log. */
+    n->aer_mask &= ~(1 << NVME_AER_TYPE_ERROR);
+    if (!QSIMPLEQ_EMPTY(&n->aer_queue)) {
+        qemu_mod_timer(n->aer_timer, qemu_get_clock_ns(vm_clock) + 10000);
+    }
+
+    return nvme_do_prp(cmd->prp1, cmd->prp2, (uint8_t *)n->elpes, trans_len,
+        1, n);
+}
+
+/*
+ * Add the two-word value hi:lo to acc, where element [0] is the low word
+ * and a wrap of it carries into element [1].
+ */
+static void nvme_smart_add128(uint64_t acc[2], uint64_t lo, uint64_t hi)
+{
+    uint64_t before = acc[0];
+
+    acc[0] += lo;
+    acc[1] += hi;
+    if (before > acc[0]) {
+        ++acc[1];
+    }
+}
+
+/*
+ * Percentage of spare capacity for nuse blocks in use out of nsze.  A size
+ * of zero reports 100 instead of dividing by zero.
+ */
+static uint32_t nvme_smart_spare(uint64_t nuse, uint64_t nsze)
+{
+    if (nsze == 0) {
+        return 100;
+    }
+    return 100 - (uint32_t)((((double)nuse) / nsze) * 100);
+}
+
+/*
+ * Return the SMART / health information log page.
+ *
+ * When nsid is 0xffffffff, or the controller does not advertise
+ * per-namespace pages (bit 0 of id_ctrl.lpa clear), the counters of all
+ * namespaces are summed; otherwise the counters of namespace nsid are
+ * reported.  At most buf_len bytes are written to the guest through the
+ * command's PRPs; a shorter buffer is logged as an error but still served.
+ * Reading the page clears the SMART bit in the event mask and, if events
+ * are queued, arms the event timer.
+ *
+ * Returns the status of the PRP transfer, or NVME_INVALID_NSID | NVME_DNR
+ * for an unknown namespace.
+ */
+static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len)
+{
+    uint32_t trans_len;
+    time_t current_seconds;
+    NvmeSmartLog smart_log;
+
+    trans_len = min(sizeof(smart_log), buf_len);
+    if (buf_len < sizeof(smart_log)) {
+        NVME_LOG(ERR, "not enough memory for log, needs %ld, has %u bytes",
+            sizeof(smart_log), buf_len);
+    }
+
+    memset(&smart_log, 0x0, sizeof(smart_log));
+    if (cmd->nsid == 0xffffffff || !(n->id_ctrl.lpa & 0x1)) {
+        int i;
+        uint64_t dur[2] = {0, 0};
+        uint64_t duw[2] = {0, 0};
+        uint64_t hrc[2] = {0, 0};
+        uint64_t hwc[2] = {0, 0};
+        uint64_t total_use = 0;
+        uint64_t total_size = 0;
+
+        for (i = 0; i < n->num_namespaces; ++i) {
+            /* The address of an array element is never NULL: no check. */
+            NvmeNamespace *ns = &n->namespaces[i];
+
+            nvme_smart_add128(dur, ns->data_units_read[0],
+                ns->data_units_read[1]);
+            nvme_smart_add128(duw, ns->data_units_written[0],
+                ns->data_units_written[1]);
+            nvme_smart_add128(hrc, ns->host_read_commands[0],
+                ns->host_read_commands[1]);
+            nvme_smart_add128(hwc, ns->host_write_commands[0],
+                ns->host_write_commands[1]);
+
+            total_size += ns->id_ns.nsze;
+            total_use += ns->id_ns.nuse;
+        }
+
+        smart_log.data_units_read[0] = dur[0];
+        smart_log.data_units_read[1] = dur[1];
+        smart_log.data_units_written[0] = duw[0];
+        smart_log.data_units_written[1] = duw[1];
+        smart_log.host_read_commands[0] = hrc[0];
+        smart_log.host_read_commands[1] = hrc[1];
+        smart_log.host_write_commands[0] = hwc[0];
+        smart_log.host_write_commands[1] = hwc[1];
+        smart_log.available_spare = nvme_smart_spare(total_use, total_size);
+    } else if (cmd->nsid > 0 && cmd->nsid <= n->num_namespaces) {
+        /* Per-namespace page; lpa bit 0 is known to be set on this path. */
+        NvmeNamespace *ns = &n->namespaces[cmd->nsid - 1];
+        smart_log.data_units_read[0] = ns->data_units_read[0];
+        smart_log.data_units_read[1] = ns->data_units_read[1];
+        smart_log.data_units_written[0] = ns->data_units_written[0];
+        smart_log.data_units_written[1] = ns->data_units_written[1];
+        smart_log.host_read_commands[0] = ns->host_read_commands[0];
+        smart_log.host_read_commands[1] = ns->host_read_commands[1];
+        smart_log.host_write_commands[0] = ns->host_write_commands[0];
+        smart_log.host_write_commands[1] = ns->host_write_commands[1];
+        smart_log.available_spare = nvme_smart_spare(ns->id_ns.nuse,
+            ns->id_ns.nsze);
+    } else {
+        return NVME_INVALID_NSID | NVME_DNR;
+    }
+
+    smart_log.temperature[0] = n->temperature & 0xff;
+    smart_log.temperature[1] = (n->temperature >> 8) & 0xff;
+    smart_log.percentage_used = n->percentage_used;
+
+    current_seconds = time(NULL);
+    smart_log.power_on_hours[0] = ((current_seconds - n->start_time) / 60) / 60;
+
+    smart_log.available_spare_threshold = NVME_SPARE_THRESHOLD;
+    if (smart_log.available_spare <= NVME_SPARE_THRESHOLD) {
+        smart_log.critical_warning |= NVME_SMART_SPARE;
+    }
+    if (n->features.temp_thresh <= n->temperature) {
+        smart_log.critical_warning |= NVME_SMART_TEMPERATURE;
+    }
+
+    /* SMART events may be reported again now that the host read the log. */
+    n->aer_mask &= ~(1 << NVME_AER_TYPE_SMART);
+    if (!QSIMPLEQ_EMPTY(&n->aer_queue)) {
+        qemu_mod_timer(n->aer_timer, qemu_get_clock_ns(vm_clock) + 10000);
+    }
+    return nvme_do_prp(cmd->prp1, cmd->prp2, (uint8_t *)&smart_log, trans_len,
+        1, n);
+}
+
+/* Route a Get Log Page command to the handler for the requested log id. */
+static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    const uint32_t dw10 = cmd->cdw10;
+    uint16_t lid = dw10 & 0xffff;
+    uint32_t numd = ((dw10 >> 16) & 0xff) << 2;
+    /* NOTE(review): NVMe defines NUMD as CDW10[27:16], 0's based - confirm. */
+    NVME_LOG(DBG, "controller:%u lod id:%u dwords:%u", n->instance, lid, numd);
+    if (lid == NVME_LOG_ERROR_INFO) {
+        return nvme_error_log_info(n, cmd, numd);
+    } else if (lid == NVME_LOG_SMART_INFO) {
+        return nvme_smart_info(n, cmd, numd);
+    } else if (lid == NVME_LOG_FW_SLOT_INFO) {
+        return nvme_fw_log_info(n, cmd, numd);
+    }
+    return NVME_INVALID_LOG_ID | NVME_DNR;
+}
+
+/*
+ * Reformat one namespace to LBA format lba_idx with the requested metadata
+ * placement and protection settings. The request is first checked against
+ * the namespace's identify data; then, holding format_lock for writing, the
+ * mapping is zeroed if sec_erase is set and the namespace is closed, resized
+ * and reopened. Returns NVME_SUCCESS, or NVME_INVALID_FORMAT with NVME_DNR
+ * when a check fails.
+ */
+static uint16_t nvme_format_namespace(NvmeNamespace *ns, uint8_t lba_idx,
+                uint8_t meta_loc, uint8_t pil, uint8_t pi, uint8_t sec_erase)
+{
+    NvmeIdNs *id_ns = &ns->id_ns;
+    uint8_t cur_idx = NVME_ID_NS_FLBAS_INDEX(id_ns->flbas);
+    uint64_t bytes;
+
+    /* Validation only; nothing is modified until every check has passed. */
+    if (lba_idx > id_ns->nlbaf) {
+        NVME_LOG(ERR, "invalid lba index:%u", lba_idx);
+        return NVME_INVALID_FORMAT | NVME_DNR;
+    }
+    if (pi && pil && !NVME_ID_NS_DPC_LAST_EIGHT(id_ns->dpc)) {
+        NVME_LOG(ERR, "pi requested as last 8 bytes, dpc:%x", id_ns->dpc);
+        return NVME_INVALID_FORMAT | NVME_DNR;
+    }
+    if (pi && !pil && !NVME_ID_NS_DPC_FIRST_EIGHT(id_ns->dpc)) {
+        NVME_LOG(ERR, "pi requested as first 8 bytes, dpc:%x", id_ns->dpc);
+        return NVME_INVALID_FORMAT | NVME_DNR;
+    }
+    if (pi && !((id_ns->dpc & 0x7) & (1 << (pi - 1)))) {
+        NVME_LOG(ERR, "invalid pi type:%u, dpc:%x", pi, id_ns->dpc);
+        return NVME_INVALID_FORMAT | NVME_DNR;
+    }
+    if (id_ns->lbaf[lba_idx].ms && (meta_loc ?
+            !NVME_ID_NS_MC_EXTENDED(id_ns->mc) :
+            !NVME_ID_NS_MC_SEPARATE(id_ns->mc))) {
+        NVME_LOG(ERR, "invalid meta location:%x, mc:%x", meta_loc, id_ns->mc);
+        return NVME_INVALID_FORMAT | NVME_DNR;
+    }
+
+    NVME_LOG(DBG,
+        "controller:%u nsid:%x lba index:%u meta loc:%u pil:%u pi:%u erase:%u",
+        ns->ctrl->instance, ns->id, lba_idx, meta_loc, pil, pi, sec_erase);
+
+    pthread_rwlock_wrlock(&ns->format_lock);
+    if (sec_erase) {
+        memset(ns->mapping_addr, 0x0, ns->mapping_size);
+    }
+    nvme_close_namespace(ns);
+    /* Size in bytes under the old format, re-expressed in new-format blocks. */
+    bytes = id_ns->nsze * (1 << id_ns->lbaf[cur_idx].ds);
+    id_ns->nsze = bytes >> id_ns->lbaf[lba_idx].ds;
+    id_ns->ncap = id_ns->nsze;
+    id_ns->nuse = 0;
+    id_ns->flbas = lba_idx | meta_loc;
+    id_ns->dps = pil | pi;
+    nvme_open_namespace(ns->ctrl, ns, ns->id);
+    pthread_rwlock_unlock(&ns->format_lock);
+
+    return NVME_SUCCESS;
+}
+
+/*
+ * Format NVM admin command: reformat one namespace, or every namespace when
+ * nsid is 0xffffffff (stopping at the first failure).
+ *
+ * CDW10 fields decoded here: LBAF [3:0], MS [4], PI [7:5], PIL [8] and
+ * SES [11:9]. Returns the status from nvme_format_namespace(), or
+ * NVME_INVALID_NSID with NVME_DNR for an out-of-range nsid.
+ */
+static uint16_t nvme_format(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    uint32_t dw10 = cmd->cdw10;
+    uint32_t nsid = cmd->nsid;
+    uint8_t lba_idx = dw10 & 0xf;
+    uint8_t meta_loc = dw10 & 0x10;        /* left at bit 4, as in flbas */
+    uint8_t pil = (dw10 >> 5) & 0x8;       /* bit 8 -> bit 3, as in dps */
+    uint8_t pi = (dw10 >> 5) & 0x7;
+    uint8_t sec_erase = (dw10 >> 9) & 0x7; /* SES starts at bit 9, not bit 8 */
+    uint16_t ret = NVME_SUCCESS;           /* defined even for an empty loop */
+    uint32_t i;
+
+    if (nsid == 0xffffffff) {
+        for (i = 0; i < n->num_namespaces && ret == NVME_SUCCESS; ++i) {
+            ret = nvme_format_namespace(&n->namespaces[i], lba_idx, meta_loc,
+                pil, pi, sec_erase);
+        }
+        return ret;
+    }
+
+    if (nsid == 0 || nsid > n->num_namespaces) {
+        NVME_LOG(ERR, "invalid nsid:%u", nsid);
+        return NVME_INVALID_NSID | NVME_DNR;
+    }
+    return nvme_format_namespace(&n->namespaces[nsid - 1], lba_idx, meta_loc,
+        pil, pi, sec_erase);
+}
+
+/* Record an Asynchronous Event Request; it is not completed here. aer_cid[]
+ * has aerl + 1 slots, so refuse once that many requests are outstanding. */
+static uint16_t nvme_async_req(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    if (n->outstanding_aers > n->aerl) {
+        NVME_LOG(ERR, "exceeded async requests:%u", n->aerl);
+        return NVME_AER_LIMIT_EXCEEDED;
+    }
+    NVME_LOG(DBG, "controller:%u cid:%u outstanding aers:%u", n->instance,
+        cmd->cid, n->outstanding_aers);
+    n->aer_cid[n->outstanding_aers] = cmd->cid;
+    qemu_mod_timer(n->aer_timer, qemu_get_clock_ns(vm_clock) + 10000);
+    n->outstanding_aers++;
+    return NVME_NO_COMPLETE;
+}
+
+/*
+ * Abort: scan the unfetched entries of SQ sqid (CDW10[15:0]) for command id
+ * cid (CDW10[31:16]); a match has its opcode rewritten to NVME_OP_ABORTED in
+ * guest memory. *result is 0 if found, else 1. Always returns NVME_SUCCESS.
+ */
+static uint16_t nvme_abort_req(NvmeCtrl *n, NvmeCmd *cmd, uint32_t *result)
+{
+    uint32_t index;
+    uint16_t sqid = cmd->cdw10 & 0xffff;
+    uint16_t cid = (cmd->cdw10 >> 16) & 0xffff;
+    NvmeSQueue *sq;
+
+    *result = 1;
+    NVME_LOG(ERR, "sqid:%u cid:%u", sqid, cid);
+    if (nvme_check_sqid(n, sqid)) {
+        return NVME_SUCCESS;
+    }
+
+    sq = n->sq[sqid];
+    /* For sqid 0 the calling SQ thread already holds the admin queue_lock. */
+    if (sqid != 0) {
+        qemu_mutex_lock(&sq->queue_lock);
+    }
+    for (index = 0; (sq->head + index) % sq->size != sq->tail; ++index) {
+        uint32_t slot = (sq->head + index) % sq->size;
+        NvmeCmd abort_cmd;
+        hwaddr addr;
+        if (sq->phys_contig) {
+            addr = sq->dma_addr + slot * n->sqe_size;
+        } else {
+            addr = nvme_discontig(sq->prp_list, slot, n->page_size,
+                n->sqe_size);
+        }
+        cpu_physical_memory_rw(addr, (uint8_t *)&abort_cmd,
+            sizeof(abort_cmd), 0);
+        if (abort_cmd.cid == cid) {
+            NVME_LOG(ERR, "found sqid:%u cid:%u to abort", sqid, cid);
+            abort_cmd.opcode = NVME_OP_ABORTED;
+            cpu_physical_memory_rw(addr, (uint8_t *)&abort_cmd,
+                sizeof(abort_cmd), 1);
+            *result = 0;
+            break;
+        }
+    }
+    if (sqid != 0) {
+        qemu_mutex_unlock(&sq->queue_lock);
+    }
+    if (*result) {
+        NVME_LOG(ERR, "failed to find req to abort sqid:%u cid:%u", sqid, cid);
+    }
+
+    return NVME_SUCCESS;
+}
+
+/* Dispatch one admin command by opcode and return the handler's status.
+ * result is handed only to the abort and feature handlers. */
+static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, uint32_t *result)
+{
+    NVME_LOG(DBG, "opcode:%x", cmd->opcode);
+    switch (cmd->opcode) {
+    case NVME_ADM_CMD_CREATE_SQ:
+        return nvme_create_sq(n, cmd);
+    case NVME_ADM_CMD_DELETE_SQ:
+        return nvme_del_sq(n, cmd);
+    case NVME_ADM_CMD_CREATE_CQ:
+        return nvme_create_cq(n, cmd);
+    case NVME_ADM_CMD_DELETE_CQ:
+        return nvme_del_cq(n, cmd);
+    case NVME_ADM_CMD_IDENTIFY:
+        return nvme_identify(n, cmd);
+    case NVME_ADM_CMD_GET_LOG_PAGE:
+        return nvme_get_log(n, cmd);
+    case NVME_ADM_CMD_GET_FEATURES:
+        return nvme_get_feature(n, cmd, result);
+    case NVME_ADM_CMD_SET_FEATURES:
+        return nvme_set_feature(n, cmd, result);
+    case NVME_ADM_CMD_ABORT:
+        return nvme_abort_req(n, cmd, result);
+    case NVME_ADM_CMD_ASYNC_EV_REQ:
+        return nvme_async_req(n, cmd);
+    case NVME_ADM_CMD_FORMAT_NVM:
+        return nvme_format(n, cmd);
+    default:
+        /* Firmware activate/download and security send/receive land here. */
+        break;
+    }
+    NVME_LOG(ERR, "invalid opcode:%x", cmd->opcode);
+    return NVME_INVALID_OPCODE | NVME_DNR;
+}
+
+/* Run the command at sq's head and post its completion to cq, unless it was
+ * marked NVME_OP_ABORTED or the handler returned NVME_NO_COMPLETE. Returns
+ * -1 if sq is empty, otherwise 0. */
+static int nvme_process_sq(NvmeSQueue *sq, NvmeCQueue *cq, NvmeCtrl *n)
+{
+    hwaddr addr;
+    NvmeCmd cmd;
+    NvmeCqe cqe;
+    uint16_t status;
+
+    NVME_LOG(IO_DBG, "controller:%u sqid:%u head:%u tail:%u", n->instance,
+        sq->id, sq->head, sq->tail);
+    if (sq->head == sq->tail) {
+        return -1;
+    }
+    if (sq->phys_contig) { /* the SQ's own layout decides, not the CQ's */
+        addr = sq->dma_addr + sq->head * n->sqe_size;
+    } else {
+        addr = nvme_discontig(sq->prp_list, sq->head, n->page_size,
+            n->sqe_size);
+    }
+    cpu_physical_memory_rw(addr, (uint8_t *)&cmd, sizeof(cmd), 0);
+    memset(&cqe, 0, sizeof(cqe));
+    nvme_inc_sq_head(sq);
+
+    if (cmd.opcode == NVME_OP_ABORTED) {
+        return 0;
+    }
+    if (sq->id == 0) {
+        status = nvme_admin_cmd(n, &cmd, &cqe.result);
+    } else {
+        status = nvme_io_cmd(n, &cmd, &cqe.result, sq->id);
+    }
+    if (status == NVME_NO_COMPLETE) {
+        return 0;
+    }
+    cqe.status = status << 1;
+    cqe.command_id = cmd.cid;
+    nvme_post_cqe(n, cq, sq, &cqe);
+    return 0;
+}
+
+/* SQ worker thread: call nvme_wait_sq(), drain the queue while holding
+ * queue_lock, then notify the paired CQ if any entry was consumed. The loop
+ * itself never terminates. */
+static void *nvme_sq_thread(void *arg)
+{
+    NvmeSQueue *sq = arg;
+    NvmeCtrl *n = sq->ctrl;
+    NvmeCQueue *cq = n->cq[sq->cqid];
+    int done;
+
+    NVME_LOG(INFO, "started submission queue thread for sq:%u cq:%u",
+        sq->id, cq->id);
+    while (1) {
+        nvme_wait_sq(sq);
+        qemu_mutex_lock(&sq->queue_lock);
+        for (done = 0; nvme_process_sq(sq, cq, n) == 0; done++) {
+        }
+        qemu_mutex_unlock(&sq->queue_lock);
+        if (done > 0) {
+            sq->completed += done;
+            nvme_isr_notify(n, cq);
+        }
+    }
+    return NULL;
+}
+
+/* Disable path: stop and free the I/O queues, stop the admin SQ thread and
+ * reset the admin queue indices, CC, CSTS.RDY and the pending AER state. */
+static void nvme_clear_ctrl(NvmeCtrl *n)
+{
+    int i;
+    AsyncEvent *event;
+
+    for (i = 1; i < n->num_queues; i++) {
+        if (n->sq[i] != NULL) {
+            NvmeSQueue *sq = n->sq[i];
+            qemu_mutex_lock(&sq->queue_lock);
+            sq->is_active = 0;
+            bsem_put(&sq->event_lock);
+            qemu_thread_join(&sq->process_thread);
+            bsem_destroy(&sq->event_lock);
+            qemu_mutex_unlock(&sq->queue_lock);
+            qemu_mutex_destroy(&sq->queue_lock);
+            g_free(sq->prp_list); /* g_free(NULL) is a no-op */
+            g_free(sq);
+            n->sq[i] = NULL;
+        }
+    }
+    for (i = 1; i < n->num_queues; i++) {
+        if (n->cq[i] != NULL) {
+            NvmeCQueue *cq = n->cq[i];
+            qemu_mutex_destroy(&cq->queue_lock);
+            msix_vector_unuse(&n->dev, cq->vector);
+            g_free(cq->prp_list);
+            g_free(cq);
+            n->cq[i] = NULL;
+        }
+    }
+    if (n->sq[0]->is_active) {
+        n->sq[0]->is_active = 0;
+        bsem_put(&n->sq[0]->event_lock);
+        qemu_thread_join(&n->sq[0]->process_thread);
+    }
+
+    n->bar.csts &= ~NVME_CSTS_READY;
+    n->bar.cc = 0;
+    n->admin_cq.vector = 0;
+    n->admin_cq.head = n->admin_cq.tail = 0;
+    n->admin_sq.head = n->admin_sq.tail = 0;
+
+    if (n->aer_timer) { /* still NULL if nvme_start_ctrl() never succeeded */
+        qemu_del_timer(n->aer_timer);
+    }
+    while ((event = QSIMPLEQ_FIRST(&n->aer_queue)) != NULL) {
+        QSIMPLEQ_REMOVE_HEAD(&n->aer_queue, entry);
+        g_free(event);
+    }
+    n->outstanding_aers = 0;
+    n->aer_mask = 0;
+    n->temp_warn_issued = 0;
+}
+
+/* Enable path (CC.EN 0 -> 1): check ASQ/ACQ and the CC page/entry sizes
+ * against CAP and id_ctrl, latch the derived sizes, then start the admin SQ
+ * thread. Returns 0 on success, -1 (nothing started) when a check fails. */
+static int nvme_start_ctrl(NvmeCtrl *n)
+{
+    uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
+    uint32_t page_size = 1 << page_bits;
+
+    if (!n->bar.asq || !n->bar.acq) {
+        NVME_LOG(ERR, "initialize attempted before admin queues created");
+        return -1;
+    }
+    if (NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
+            NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap)) {
+        NVME_LOG(ERR, "bad page size:%u min:%u max:%u",
+            NVME_CC_MPS(n->bar.cc), (uint32_t)NVME_CAP_MPSMIN(n->bar.cap),
+            (uint32_t)NVME_CAP_MPSMAX(n->bar.cap));
+        return -1;
+    }
+    if (n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1)) {
+        NVME_LOG(ERR, "asq:%lx acq:%lx not page aligned:%x", n->bar.asq,
+            n->bar.acq, page_size);
+        return -1;
+    }
+    if (NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
+            NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes)) {
+        NVME_LOG(ERR, "bad cq entry size:%u min:%x max:%u",
+            NVME_CC_IOCQES(n->bar.cc), NVME_CTRL_CQES_MIN(n->id_ctrl.cqes),
+            NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
+        return -1;
+    }
+    /* SQ entry size is bounded by id_ctrl.sqes (cqes bounds only the CQ). */
+    if (NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
+            NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes)) {
+        NVME_LOG(ERR, "bad sq entry size:%u min:%x max:%u",
+            NVME_CC_IOSQES(n->bar.cc), NVME_CTRL_SQES_MIN(n->id_ctrl.sqes),
+            NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
+        return -1;
+    }
+
+    n->page_bits = page_bits;
+    n->page_size = page_size;
+    n->max_prp_ents = n->page_size / sizeof(uint64_t);
+    n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
+    n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
+    n->admin_cq.vector = 0;
+    n->admin_cq.phase = 1;
+    n->admin_cq.dma_addr = n->bar.acq;
+    n->admin_cq.irq_enabled = 1;
+    n->admin_sq.dma_addr = n->bar.asq;
+    n->admin_sq.is_active = 1;
+    /* AER timer and queue must exist before the admin thread can use them. */
+    n->aer_timer = qemu_new_timer_ns(vm_clock, nvme_aer_process_cb, n);
+    QSIMPLEQ_INIT(&n->aer_queue);
+    qemu_thread_create(&n->admin_sq.process_thread,
+        nvme_sq_thread, &n->admin_sq, QEMU_THREAD_JOINABLE);
+    return 0;
+}
+
+/* Apply a guest write to the controller register at BAR offset `offset`. */
+static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
+    unsigned size)
+{
+    NVME_LOG(IO_DBG, "controller:%u offset:%lx data:%lx", n->instance, offset,
+        data);
+    switch (offset) {
+    case 0xc: /* written bits are set in the interrupt mask */
+        n->bar.intms |= data & 0xffffffff;
+        n->bar.intmc = n->bar.intms;
+        break;
+    case 0x10: /* written bits are cleared from the interrupt mask */
+        n->bar.intms &= ~(data & 0xffffffff);
+        n->bar.intmc = n->bar.intms;
+        break;
+    case 0x14: /* CC: start on EN 0 -> 1, clear when EN is 0 and cc != 0 */
+        if ((data & CC_EN_MASK) && !(n->bar.cc & CC_EN_MASK)) {
+            NVME_LOG(ERR, "start controller:%u", n->instance);
+            n->bar.cc = data;
+            n->bar.csts = nvme_start_ctrl(n) ? NVME_CSTS_FAILED :
+                NVME_CSTS_READY;
+        } else if (!(data & CC_EN_MASK) && n->bar.cc) {
+            NVME_LOG(ERR, "shut down controller:%u", n->instance);
+            nvme_clear_ctrl(n);
+        } else {
+            NVME_LOG(ERR, "invalid controller transition controller:%u",
+                n->instance);
+        }
+        break;
+    case 0x24: /* AQA: admin queue sizes are latched only while disabled */
+        n->bar.aqa = data & 0xffffffff;
+        if (!NVME_CC_EN(n->bar.cc)) {
+            n->admin_cq.size = NVME_AQA_ACQS(n->bar.aqa) + 1;
+            n->admin_sq.size = NVME_AQA_ASQS(n->bar.aqa) + 1;
+        }
+        break;
+    case 0x28: /* ASQ: whole register on an 8-byte write, else bits 31:0 */
+        n->bar.asq = size == 8 ? data :
+            ((n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff));
+        break;
+    case 0x2c: /* ASQ bits 63:32: replace them rather than OR-ing in */
+        n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32);
+        break;
+    case 0x30: /* ACQ: same split as ASQ */
+        n->bar.acq = size == 8 ? data :
+            ((n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff));
+        break;
+    case 0x34:
+        n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32);
+        break;
+    }
+}
+
+/* MMIO read: return size bytes of n->bar starting at addr. An access that
+ * is not wholly inside the register block is logged and reads as 0. */
+static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
+{
+    NvmeCtrl *n = (NvmeCtrl *)opaque;
+    uint64_t val = 0;
+
+    if (size <= sizeof(val) && addr + size <= sizeof(n->bar)) {
+        memcpy(&val, (uint8_t *)&n->bar + addr, size);
+    } else {
+        NVME_LOG(ERR, "bad bar offset %ld", addr);
+    }
+    return val;
+}
+
+/* Doorbell write; the caller only passes addr >= 0x1000. Odd doorbell slots
+ * update a CQ head, even slots an SQ tail. A misaligned address, unknown
+ * queue or out-of-range index enqueues an error event and changes nothing. */
+static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
+{
+    uint32_t qid;
+    NVME_LOG(IO_DBG, "controller:%u addr:%lx val:%u", n->instance, addr, val);
+    if (addr & ((1 << (2 + n->db_stride)) - 1)) {
+        NVME_LOG(ERR, "invalid doorbell:%lx, within stride:%u", addr,
+            n->db_stride);
+        nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+            NVME_AER_INFO_ERR_INVALID_DB, NVME_LOG_ERROR_INFO);
+        return;
+    }
+
+    if (((addr - 0x1000) >> (2 + n->db_stride)) & 1) {
+        NvmeCQueue *cq;
+        uint16_t new_head = val & 0xffff;
+        int start_sqs;
+        qid = (addr - (0x1000 + (1 << (n->db_stride + 2)))) >>
+            (3 + n->db_stride);
+        if (nvme_check_cqid(n, qid)) {
+            NVME_LOG(ERR, "invalid cq:%u for addr:%lx", qid, addr);
+            nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                NVME_AER_INFO_ERR_INVALID_DB, NVME_LOG_ERROR_INFO);
+            return;
+        }
+
+        cq = n->cq[qid];
+        if (new_head >= cq->size) {
+            NVME_LOG(ERR, "invalid head:%u size:%u", new_head, cq->size);
+            nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                NVME_AER_INFO_ERR_INVALID_DB, NVME_LOG_ERROR_INFO);
+            return;
+        }
+
+        NVME_LOG(IO_DBG,
+            "controller:%u cqid:%u tail:%u current head:%u new head:%u",
+            n->instance, qid, cq->tail, cq->head, new_head);
+        start_sqs = nvme_cq_full(cq) ? 1 : 0;
+        cq->head = new_head;
+        if (start_sqs) {
+            NvmeSQueue *sq;
+            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
+                bsem_put(&sq->event_lock);
+            }
+        }
+        if (cq->tail != cq->head) {
+            nvme_isr_notify(n, cq);
+        }
+    } else {
+        uint16_t new_tail;
+        NvmeSQueue *sq;
+        qid = (addr - 0x1000) >> (3 + n->db_stride);
+        if (nvme_check_sqid(n, qid)) {
+            NVME_LOG(ERR, "invalid sq:%u", qid);
+            nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                NVME_AER_INFO_ERR_INVALID_SQ, NVME_LOG_ERROR_INFO);
+            return;
+        }
+
+        sq = n->sq[qid];
+        new_tail = val & 0xffff;
+        if (new_tail >= sq->size) {
+            NVME_LOG(ERR, "invalid tail:%u size:%u", new_tail, sq->size);
+            nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                NVME_AER_INFO_ERR_INVALID_DB, NVME_LOG_ERROR_INFO);
+            return;
+        }
+
+        NVME_LOG(IO_DBG,
+            "controller:%u sqid:%u head:%u current tail:%u new tail:%u",
+            n->instance, qid, sq->head, sq->tail, new_tail);
+        sq->tail = new_tail;
+        bsem_put(&sq->event_lock);
+    }
+}
+
+static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
+    unsigned size)
+{
+    NvmeCtrl *n = opaque;
+    if (addr >= sizeof(n->bar) && addr < 0x1000) { /* neither regs nor db */
+        NVME_LOG(ERR, "bad bar offset %ld", addr);
+    } else if (addr < sizeof(n->bar)) {
+        nvme_write_bar(n, addr, data, size);
+    } else {
+        nvme_process_db(n, addr, data);
+    }
+}
+
+static const MemoryRegionOps nvme_mmio_ops = { /* BAR 0 access callbacks */
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .max_access_size = 8,
+        .min_access_size = 2,
+    },
+    .write = nvme_mmio_write,
+    .read = nvme_mmio_read,
+};
+
+static void nvme_close_namespaces(NvmeCtrl *n)
+{
+    NvmeNamespace *ns = n->namespaces, *end = ns + n->num_namespaces;
+    NVME_LOG(DBG, "controller:%u namespaces:%u", n->instance,
+        n->num_namespaces);
+    for (; ns != end; ns++) { /* close it, then destroy its format_lock */
+        nvme_close_namespace(ns);
+        pthread_rwlock_destroy(&ns->format_lock);
+    }
+}
+
+/* Set up identify data for one namespace, then call nvme_open_namespace().
+ * Sixteen LBA formats are advertised (data shift 9..12, metadata 0/8/32/64
+ * bytes); format 0 is selected and capacity is counted in 512-byte blocks. */
+static void nvme_init_namespace(NvmeCtrl *n, NvmeNamespace *ns, int id)
+{
+    static const int ms[4] = {0, 8, 32, 64};
+    NvmeIdNs *id_ns = &ns->id_ns;
+    int i;
+
+    NVME_LOG(DBG, "controller:%u nsid:%u", n->instance, id);
+    ns->id = id;
+    /* Widen before multiplying: ns_size is 32-bit, the byte count is not. */
+    id_ns->ncap = id_ns->nsze = (uint64_t)n->ns_size * BYTES_PER_MB / 512;
+    id_ns->nuse = id_ns->flbas = id_ns->nsfeat = id_ns->dps = 0;
+    id_ns->nlbaf = 0xf;
+    id_ns->mc = 0x3;
+    id_ns->dpc = 0x1f;
+    for (i = 0; i <= id_ns->nlbaf; i++) {
+        id_ns->lbaf[i].ds = 9 + (i / 4);
+        id_ns->lbaf[i].ms = ms[i % 4];
+    }
+    pthread_rwlock_init(&ns->format_lock, NULL);
+    nvme_open_namespace(n, ns, id);
+}
+
+static void nvme_init_namespaces(NvmeCtrl *n)
+{
+    int slot; /* array index, also handed on as the namespace id */
+    NVME_LOG(DBG, "controller:%u namespaces:%u size:%uMB", n->instance,
+        n->num_namespaces, n->ns_size);
+    for (slot = 0; slot < n->num_namespaces; ++slot) {
+        nvme_init_namespace(n, n->namespaces + slot, slot);
+    }
+}
+
+/*
+ * Fill in the Identify Controller data, the three power state descriptors,
+ * the initial feature values and the CAP/VS/interrupt-mask registers, using
+ * the property values already stored in n.
+ */
+static void nvme_init_ctrl(NvmeCtrl *n)
+{
+    /* Per power state: mp, enlat, exlat. */
+    static const uint32_t power[3][3] = {
+        {0x9c4, 0x10, 0x4},
+        {0x8fc, 0x10, 0x10},
+        {0x2bc, 0x1e8480, 0x1e8480},
+    };
+    NvmeIdCtrl *id = &n->id_ctrl;
+    uint64_t cap;
+    int i;
+
+    id->vid = PCI_VENDOR_ID_INTEL;
+    id->ssvid = 0x0111;
+    id->rab = 2;
+    id->ieee[0] = 0x00;
+    id->ieee[1] = 0x02;
+    id->ieee[2] = 0xb3;
+    id->cmic = 0;
+    id->mdts = n->mdts;
+    id->oacs = NVME_OACS_FORMAT;
+    id->acl = n->acl;
+    id->aerl = n->aerl;
+    id->frmw = 7 << 1;
+    id->lpa = 1 << 0;
+    id->elpe = n->elpe;
+    id->npss = 2;
+    id->sqes = 0xf << 4 | 0x6;
+    id->cqes = 0xf << 4 | 0x4;
+    id->nn = n->num_namespaces;
+    id->oncs = NVME_ONCS_COMPARE | NVME_ONCS_WRITE_UNCORR | NVME_ONCS_DSM;
+    id->fuses = 0;
+    id->fna = 0;
+    id->vwc = 1;
+    id->awun = 0;
+    id->awupf = 0;
+
+    snprintf((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl");
+    snprintf((char *)id->fr, sizeof(id->fr), "1.0");
+    snprintf((char *)id->sn, sizeof(id->sn), "NVMeQx10%02x", n->instance);
+
+    for (i = 0; i < 3; i++) {
+        id->psd[i].mp = power[i][0];
+        id->psd[i].enlat = power[i][1];
+        id->psd[i].exlat = power[i][2];
+        /* rrt, rrl, rwt and rwl all equal the power state index. */
+        id->psd[i].rrt = id->psd[i].rrl = id->psd[i].rwt = id->psd[i].rwl = i;
+    }
+
+    /* Initial feature values. */
+    n->features.arbitration = 0;
+    n->features.power_mgmt = 0;
+    n->features.temp_thresh = 0x14d;
+    n->features.err_rec = 0;
+    n->features.volatile_wc = 0;
+    n->features.num_queues = (n->num_queues - 1) |
+                             ((n->num_queues - 1) << 16);
+    n->features.int_coalescing = 0;
+    n->features.write_atomicity = 0;
+    n->features.async_config = 0x0;
+    n->features.sw_prog_marker = 0;
+
+    /* Interrupt vector config entry i starts out holding i. */
+    for (i = 0; i < n->num_queues; i++) {
+        n->features.int_vector_config[i] = i;
+    }
+
+    n->temperature = NVME_TEMPERATURE;
+
+    /* CAP is OR-ed together one field at a time. */
+    cap  = (uint64_t)(n->max_q_ents & CAP_MQES_MASK) << CAP_MQES_SHIFT;
+    cap |= (uint64_t)(n->cqr & CAP_CQR_MASK) << CAP_CQR_SHIFT;
+    cap |= (uint64_t)(1 & CAP_AMS_MASK) << CAP_AMS_SHIFT;
+    cap |= (uint64_t)(0xf & CAP_TO_MASK) << CAP_TO_SHIFT;
+    cap |= (uint64_t)(n->db_stride & CAP_DSTRD_MASK) << CAP_DSTRD_SHIFT;
+    cap |= (uint64_t)(0 & CAP_NSSRS_MASK) << CAP_NSSRS_SHIFT;
+    cap |= (uint64_t)(1 & CAP_CSS_MASK) << CAP_CSS_SHIFT;
+    cap |= (uint64_t)(0 & CAP_MPSMIN_MASK) << CAP_MPSMIN_SHIFT;
+    cap |= (uint64_t)(0xf & CAP_MPSMAX_MASK) << CAP_MPSMAX_SHIFT;
+    n->bar.cap = cap;
+
+    n->bar.vs = 0x00010001;
+    n->bar.intmc = n->bar.intms = 0;
+    NVME_LOG(DBG, "controller:%u cap:%016lx", n->instance, n->bar.cap);
+}
+
+/* Install the embedded admin SQ/CQ as queue 0 and init their locks. */
+static void nvme_init_admin_queues(NvmeCtrl *n)
+{
+    NVME_LOG(DBG, "controller:%u", n->instance);
+    n->sq[0] = &n->admin_sq;
+    n->admin_sq.ctrl = n;
+    n->admin_sq.phys_contig = 1;
+    n->cq[0] = &n->admin_cq;
+    n->admin_cq.ctrl = n;
+    n->admin_cq.phys_contig = 1;
+    /* The admin CQ's sq_list starts out holding just the admin SQ. */
+    QTAILQ_INIT(&n->admin_cq.sq_list);
+    QTAILQ_INSERT_TAIL(&n->admin_cq.sq_list, &n->admin_sq, entry);
+    bsem_init(&n->admin_sq.event_lock);
+    qemu_mutex_init(&n->admin_sq.queue_lock);
+    qemu_mutex_init(&n->admin_cq.queue_lock);
+}
+
+/*
+ * PCI init hook: range-check the user properties, size the MMIO region,
+ * allocate the controller tables, then set up the admin queues, identify
+ * data, namespaces, BAR 0 and MSI-X. Returns 0, or -1 on a bad property.
+ */
+static int nvme_init(PCIDevice *pci_dev)
+{
+    NvmeCtrl *n = DO_UPCAST(NvmeCtrl, dev, pci_dev);
+
+    NVME_LOG(DBG, "new controller B:%u D:%u f:%u", pci_bus_num(pci_dev->bus),
+        PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn));
+    if (!n->num_namespaces || n->num_namespaces > NVME_MAX_NUM_NAMESPACES) {
+        NVME_LOG(ERR, "requested invalid number of namespace:%u max:%u",
+            n->num_namespaces, NVME_MAX_NUM_NAMESPACES);
+        return -1;
+    }
+    if (!n->ns_size || n->ns_size > NVME_MAX_NAMESPACE_SIZE) {
+        NVME_LOG(ERR, "requested invalid namespace size:%u max:%u",
+            n->ns_size, NVME_MAX_NAMESPACE_SIZE);
+        return -1;
+    }
+    if (!n->num_queues || n->num_queues > NVME_MAX_QS) {
+        NVME_LOG(ERR, "requested invalid number of queues:%u max:%u",
+            n->num_queues, NVME_MAX_QS);
+        return -1;
+    }
+    if (n->db_stride > NVME_MAX_STRIDE) {
+        NVME_LOG(ERR, "requested invalid stride:%u max:%u",
+            n->db_stride, NVME_MAX_STRIDE);
+        return -1;
+    }
+    if (!n->max_q_ents || n->max_q_ents > NVME_MAX_QUEUE_ENTRIES) {
+        NVME_LOG(ERR, "requested invalid queue entries:%u, max:%u",
+            n->max_q_ents, NVME_MAX_QUEUE_ENTRIES);
+        return -1;
+    }
+    if (n->cqr > 1) {
+        NVME_LOG(ERR,
+            "requested invalid contiguous regions requeired:%u max:%u",
+            n->cqr, 1);
+        return -1;
+    }
+
+    n->reg_size = 1 << qemu_fls(0x1004 + 2 * (n->num_queues + 1) *
+        (4 << n->db_stride));
+    NVME_LOG(DBG,
+        "controller:%u namespaces:%u size:%uMB queues:%u stride:%u\n"\
+        "           reg size:%u queue entries:%u cqr:%u mdts:%u aerl:%u "\
+        "acl:%u elpe:%u", instance, n->num_namespaces, n->ns_size,
+        n->num_queues, n->db_stride, n->reg_size, n->max_q_ents, n->cqr,
+        n->mdts, n->aerl, n->acl, n->elpe);
+
+    pci_dev->config[PCI_INTERRUPT_PIN] = 1;
+    pci_config_set_prog_interface(pci_dev->config, 0x2);
+    pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
+
+    n->instance = instance++;
+    n->start_time = time(NULL);
+    n->error_count = 1;
+    n->namespaces = g_malloc0(n->num_namespaces * sizeof(*n->namespaces));
+    n->sq = g_malloc0(n->num_queues * sizeof(*n->sq));
+    n->cq = g_malloc0(n->num_queues * sizeof(*n->cq));
+    n->aer_cid = g_malloc0((n->aerl + 1) * sizeof(*n->aer_cid));
+    n->elpes = g_malloc0((n->elpe + 1) * sizeof(*n->elpes));
+    n->features.int_vector_config = g_malloc(n->num_queues *
+        sizeof(*n->features.int_vector_config));
+
+    nvme_init_admin_queues(n);
+    nvme_init_ctrl(n);
+    nvme_init_namespaces(n);
+
+    memory_region_init_io(&n->iomem, &nvme_mmio_ops, n, "nvme-mmio",
+        n->reg_size);
+    pci_register_bar(&n->dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
+        PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
+
+    if (msix_init_exclusive_bar(&n->dev, n->num_queues, 4) == 0) {
+        msix_vector_use(&n->dev, 0);
+    } else {
+        NVME_LOG(ERR, "controller:%u msix init failed", n->instance);
+    }
+    NVME_LOG(DBG, "controller:%u initialization complete", n->instance);
+    return 0;
+}
+
+static void nvme_exit(PCIDevice *pci_dev)
+{
+    NvmeCtrl *n = DO_UPCAST(NvmeCtrl, dev, pci_dev);
+    NVME_LOG(DBG, "exit controller:%u", n->instance);
+    nvme_close_namespaces(n); /* also destroys each format_lock */
+    g_free(n->namespaces);
+    g_free(n->cq);
+    g_free(n->sq);
+    g_free(n->aer_cid); g_free(n->elpes); /* elpes used to be leaked */
+    g_free(n->features.int_vector_config);
+    msix_vector_unuse(&n->dev, 0); /* vector 0 was claimed in nvme_init() */
+    memory_region_destroy(&n->iomem);
+}
+
+static void nvme_reset(DeviceState *dev)
+{   /* Reset hook: only logs; no controller state is changed here. */
+    NvmeCtrl *n = DO_UPCAST(NvmeCtrl, dev.qdev, dev);
+    NVME_LOG(DBG, "reset controller:%u", n->instance);
+    (void)n; /* presumably silences an unused warning if NVME_LOG is a no-op */
+}
+
+static Property nvme_props[] = { /* user-settable options and their defaults */
+    DEFINE_PROP_UINT32("namespaces", NvmeCtrl, num_namespaces, 1),
+    DEFINE_PROP_UINT32("size", NvmeCtrl, ns_size, 512), /* MB per namespace */
+    DEFINE_PROP_UINT32("queues", NvmeCtrl, num_queues, 64),
+    DEFINE_PROP_UINT32("entries", NvmeCtrl, max_q_ents, 0x7ff), /* 2047 */
+    DEFINE_PROP_UINT8("stride", NvmeCtrl, db_stride, 0), /* doorbell stride */
+    DEFINE_PROP_UINT8("aerl", NvmeCtrl, aerl, 3), /* aer_cid[] has aerl + 1 */
+    DEFINE_PROP_UINT8("acl", NvmeCtrl, acl, 3),
+    DEFINE_PROP_UINT8("elpe", NvmeCtrl, elpe, 3), /* elpes[] has elpe + 1 */
+    DEFINE_PROP_UINT8("mdts", NvmeCtrl, mdts, 5),
+    DEFINE_PROP_UINT8("cqr", NvmeCtrl, cqr, 1), /* nvme_init() allows 0 or 1 */
+    DEFINE_PROP_STRING("path", NvmeCtrl, disk_path),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+/* Class setup: PCI ids, init/exit and reset hooks, property list. */
+static void nvme_class_init(ObjectClass *oc, void *data)
+{
+    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
+    DeviceClass *dc = DEVICE_CLASS(oc);
+
+    dc->props = nvme_props;
+    dc->reset = nvme_reset;
+    dc->desc = "Non-Volatile Memory Express";
+
+    pc->revision = 1;
+    pc->device_id = 0x0111;
+    pc->vendor_id = PCI_VENDOR_ID_INTEL;
+    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
+    pc->exit = nvme_exit;
+    pc->init = nvme_init;
+}
+
+static TypeInfo nvme_info = { /* type "nvme", derived from the PCI device */
+    .name          = "nvme",
+    .class_init    = nvme_class_init,
+    .instance_size = sizeof(NvmeCtrl),
+    .parent        = TYPE_PCI_DEVICE,
+};
+
+static void nvme_register_devices(void)
+{
+    type_register_static(&nvme_info); /* registers the "nvme" TypeInfo */
+}
+type_init(nvme_register_devices); /* presumably runs the above at startup */
diff --git a/hw/pci_ids.h b/hw/pci_ids.h
index 5df7245..bb7ba02 100644
--- a/hw/pci_ids.h
+++ b/hw/pci_ids.h
@@ -17,6 +17,7 @@
 #define PCI_CLASS_STORAGE_IDE            0x0101
 #define PCI_CLASS_STORAGE_RAID           0x0104
 #define PCI_CLASS_STORAGE_SATA           0x0106
+#define PCI_CLASS_STORAGE_EXPRESS        0x0108
 #define PCI_CLASS_STORAGE_OTHER          0x0180
 
 #define PCI_CLASS_NETWORK_ETHERNET       0x0200
