@@ -17,5 +17,14 @@ config BLK_DEV_NVME_SCSI
and block devices nodes, as well a a translation for a small
number of selected SCSI commands to NVMe commands to the NVMe
driver. If you don't know what this means you probably want
- to say N here, and if you know what it means you probably
- want to say N as well.
+ to say N here, unless you run a distro that abuses the SCSI
+ emulation to provide stable device names for mount by id, like
+ some OpenSuSE and SLES versions.
+
+config NVME_VENDOR_EXT_GOOGLE
+ bool "NVMe Vendor Extension for Improved Virtualization"
+ depends on BLK_DEV_NVME
+ ---help---
+ Google extension to reduce the number of MMIO doorbell
+ writes for the NVMe driver
+
@@ -50,6 +50,9 @@
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
+/* Google Vendor ID is not in include/linux/pci_ids.h */
+#define PCI_VENDOR_ID_GOOGLE 0x1AE0
+
/*
* We handle AEN commands ourselves and don't even let the
* block layer know about them.
@@ -122,6 +125,12 @@ struct nvme_dev {
struct nvme_ctrl ctrl;
struct completion ioq_wait;
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ u32 *db_mem;
+ dma_addr_t doorbell;
+ u32 *ei_mem;
+ dma_addr_t eventidx;
+#endif
};
static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
@@ -153,6 +162,12 @@ struct nvme_queue {
u16 qid;
u8 cq_phase;
u8 cqe_seen;
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ u32 *sq_doorbell_addr;
+ u32 *sq_eventidx_addr;
+ u32 *cq_doorbell_addr;
+ u32 *cq_eventidx_addr;
+#endif
};
/*
@@ -190,6 +205,9 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ BUILD_BUG_ON(sizeof(struct nvme_doorbell_memory) != 64);
+#endif
}
/*
@@ -319,6 +337,51 @@ static void nvme_complete_async_event(struct nvme_dev *dev,
}
}
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+static int nvme_vendor_memory_size(struct nvme_dev *dev)
+{
+ return ((num_possible_cpus() + 1) * 8 * dev->db_stride);
+}
+
+static int nvme_set_doorbell_memory(struct nvme_dev *dev)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.doorbell_memory.opcode = nvme_admin_doorbell_memory;
+ c.doorbell_memory.prp1 = cpu_to_le64(dev->doorbell);
+ c.doorbell_memory.prp2 = cpu_to_le64(dev->eventidx);
+
+ return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
+}
+
+static inline int nvme_ext_need_event(u16 event_idx, u16 new_idx, u16 old)
+{
+ /* Borrowed from vring_need_event */
+ return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
+}
+
+static void nvme_ext_write_doorbell(u16 value, u32 __iomem *q_db,
+		u32 *db_addr, volatile u32 *event_idx)
+{
+	u16 old_value;
+	if (!db_addr)
+		goto ring_doorbell;
+
+	old_value = *db_addr;
+	*db_addr = value;
+
+	rmb();
+	if (!nvme_ext_need_event(*event_idx, value, old_value))
+		goto no_doorbell;
+
+ring_doorbell:
+	writel(value, q_db);
+no_doorbell:
+	return;
+}
+#endif
+
/**
* __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
* @nvmeq: The queue to use
@@ -336,9 +399,19 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
else
memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ if (nvmeq->sq_doorbell_addr)
+ wmb();
+#endif
+
if (++tail == nvmeq->q_depth)
tail = 0;
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ nvme_ext_write_doorbell(tail, nvmeq->q_db,
+ nvmeq->sq_doorbell_addr, nvmeq->sq_eventidx_addr);
+#else
writel(tail, nvmeq->q_db);
+#endif
nvmeq->sq_tail = tail;
}
@@ -753,6 +826,12 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
if ((status & 1) != phase)
break;
nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
+
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ if (to_pci_dev(nvmeq->dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE)
+ rmb();
+#endif
+
if (++head == nvmeq->q_depth) {
head = 0;
phase = !phase;
@@ -799,7 +878,14 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
return;
if (likely(nvmeq->cq_vector >= 0))
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ nvme_ext_write_doorbell(head,
+ nvmeq->q_db + nvmeq->dev->db_stride,
+ nvmeq->cq_doorbell_addr,
+ nvmeq->cq_eventidx_addr);
+#else
writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+#endif
nvmeq->cq_head = head;
nvmeq->cq_phase = phase;
@@ -1185,6 +1271,17 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
dev->queues[qid] = nvmeq;
dev->queue_count++;
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ if (dev->db_mem && dev->ei_mem && qid != 0) {
+ nvmeq->sq_doorbell_addr = &dev->db_mem[qid * 2 * dev->db_stride];
+ nvmeq->cq_doorbell_addr =
+ &dev->db_mem[(qid * 2 + 1) * dev->db_stride];
+ nvmeq->sq_eventidx_addr = &dev->ei_mem[qid * 2 * dev->db_stride];
+ nvmeq->cq_eventidx_addr =
+ &dev->ei_mem[(qid * 2 + 1) * dev->db_stride];
+ }
+#endif
+
return nvmeq;
free_cqdma:
@@ -1215,6 +1312,16 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+	if (to_pci_dev(dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE && qid != 0 &&
+	    dev->db_mem && dev->ei_mem) {
+		nvmeq->sq_doorbell_addr = &dev->db_mem[qid * 2 * dev->db_stride];
+		nvmeq->cq_doorbell_addr = &dev->db_mem[(qid * 2 + 1) * dev->db_stride];
+		nvmeq->sq_eventidx_addr = &dev->ei_mem[qid * 2 * dev->db_stride];
+		nvmeq->cq_eventidx_addr =
+			&dev->ei_mem[(qid * 2 + 1) * dev->db_stride];
+	}
+#endif
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
dev->online_queues++;
spin_unlock_irq(&nvmeq->q_lock);
@@ -1713,6 +1820,19 @@ static int nvme_dev_add(struct nvme_dev *dev)
if (blk_mq_alloc_tag_set(&dev->tagset))
return 0;
dev->ctrl.tagset = &dev->tagset;
+
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+	if (to_pci_dev(dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE) {
+		int res = nvme_set_doorbell_memory(dev);
+		if (res) {
+			/* Free memory and continue on without the extension. */
+			dma_free_coherent(dev->dev, nvme_vendor_memory_size(dev), dev->db_mem, dev->doorbell);
+			dma_free_coherent(dev->dev, nvme_vendor_memory_size(dev), dev->ei_mem, dev->eventidx);
+			dev->db_mem = NULL;
+			dev->ei_mem = NULL;
+		}
+	}
+#endif
}
nvme_queue_scan(dev);
return 0;
@@ -1770,8 +1890,31 @@ static int nvme_pci_enable(struct nvme_dev *dev)
pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
+
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ if (pdev->vendor == PCI_VENDOR_ID_GOOGLE) {
+ int mem_size = nvme_vendor_memory_size(dev);
+ dev->db_mem = dma_alloc_coherent(&pdev->dev, mem_size, &dev->doorbell, GFP_KERNEL);
+ if (!dev->db_mem) {
+ result = -ENOMEM;
+ goto disable;
+ }
+ dev->ei_mem = dma_alloc_coherent(&pdev->dev, mem_size, &dev->eventidx, GFP_KERNEL);
+ if (!dev->ei_mem) {
+ result = -ENOMEM;
+ goto dma_free;
+ }
+ }
+#endif
+
return 0;
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ dma_free:
+	dma_free_coherent(&pdev->dev, nvme_vendor_memory_size(dev), dev->db_mem, dev->doorbell);
+	dev->db_mem = NULL;
+#endif
+
disable:
pci_disable_device(pdev);
@@ -1788,6 +1931,14 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
static void nvme_pci_disable(struct nvme_dev *dev)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ int mem_size = nvme_vendor_memory_size(dev);
+
+ if (dev->db_mem)
+ dma_free_coherent(&pdev->dev, mem_size, dev->db_mem, dev->doorbell);
+ if (dev->ei_mem)
+ dma_free_coherent(&pdev->dev, mem_size, dev->ei_mem, dev->eventidx);
+#endif
if (pdev->msi_enabled)
pci_disable_msi(pdev);
@@ -546,6 +546,9 @@ enum nvme_admin_opcode {
nvme_admin_format_nvm = 0x80,
nvme_admin_security_send = 0x81,
nvme_admin_security_recv = 0x82,
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ nvme_admin_doorbell_memory = 0xC0,
+#endif
};
enum {
@@ -810,6 +813,18 @@ struct nvmf_property_get_command {
__u8 resv4[16];
};
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+struct nvme_doorbell_memory {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __u32 rsvd1[5];
+ __le64 prp1;
+ __le64 prp2;
+ __u32 rsvd12[6];
+};
+#endif
+
struct nvme_command {
union {
struct nvme_common_command common;
@@ -823,6 +838,9 @@ struct nvme_command {
struct nvme_format_cmd format;
struct nvme_dsm_cmd dsm;
struct nvme_abort_cmd abort;
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ struct nvme_doorbell_memory doorbell_memory;
+#endif
struct nvme_get_log_page_command get_log_page;
struct nvmf_common_command fabrics;
struct nvmf_connect_command connect;
@@ -901,6 +919,9 @@ enum {
NVME_SC_BAD_ATTRIBUTES = 0x180,
NVME_SC_INVALID_PI = 0x181,
NVME_SC_READ_ONLY = 0x182,
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+ NVME_SC_DOORBELL_MEMORY_INVALID = 0x1C0,
+#endif
/*
* I/O Command Set Specific - Fabrics commands: