[RFC,5/5] hw/nvme: extend pi pass capable commands

Message ID 20221124155821.1501969-6-d.tihov@yadro.com
State New
Series Protection information pass-through for block devices

Commit Message

Dmitry Tihov Nov. 24, 2022, 3:58 p.m. UTC
Add block-level protection information passthrough support to the
Compare, Dataset Management, Verify and Copy NVMe commands.

For namespaces with protection information passthrough enabled, these
commands now use page-aligned bounce buffers and iovecs that carry a
separate DIF area, so the protection bytes travel with the data through
the block layer. If the host device rejects the protection information,
the request completes with -EILSEQ and the new completion callbacks
re-check the guard, application tag and reference tag fields to report
a precise NVMe end-to-end error instead of a generic one. Dataset
Management skips the metadata handling path entirely, since the
underlying block device manages protection information itself.

Signed-off-by: Dmitry Tihov <d.tihov@yadro.com>
---
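Note for reviewers: the snippet below is a standalone sketch of the
-EILSEQ remap pattern shared by the new completion callbacks
(nvme_dif_pass_verify_cb, nvme_dif_pass_compare_cb and
nvme_dif_pass_copy_cb). It is illustrative only: nvme_dif_pass_check()
from this series is replaced by a stub, the status values are taken
from QEMU's include/block/nvme.h, and the real callbacks hand non-PI
errors to nvme_rw_complete_cb() instead of mapping them directly.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define NVME_SUCCESS            0x0000
#define NVME_INTERNAL_DEV_ERROR 0x0006
#define NVME_E2E_GUARD_ERROR    0x0282

/* Stub for this series' nvme_dif_pass_check(): the real function
 * re-runs the guard/apptag/reftag checks on the bounce buffer and PI
 * area to pick the precise end-to-end status. Here we pretend the
 * guard tag mismatched. */
static uint16_t dif_pass_check_stub(void)
{
    return NVME_E2E_GUARD_ERROR;
}

static uint16_t complete_pi_pass_io(int ret)
{
    if (ret == -EILSEQ) {
        /* The block layer rejected the protection information, so
         * remap it to a specific NVMe end-to-end error instead of
         * failing the command with a generic status. */
        return dif_pass_check_stub();
    }
    /* Other errors keep their usual handling; a generic status
     * stands in for nvme_rw_complete_cb() here. */
    return ret < 0 ? NVME_INTERNAL_DEV_ERROR : NVME_SUCCESS;
}

int main(void)
{
    printf("-EILSEQ maps to NVMe status 0x%03x\n",
           complete_pi_pass_io(-EILSEQ));
    return 0;
}
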
 hw/nvme/ctrl.c       | 350 +++++++++++++++++++++++++++++++++++++++----
 hw/nvme/trace-events |   2 +
 2 files changed, 325 insertions(+), 27 deletions(-)

Patch

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index c646345bcc..950d773d59 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -197,6 +197,7 @@ 
 #include "hw/pci/msix.h"
 #include "hw/pci/pcie_sriov.h"
 #include "migration/vmstate.h"
+#include "qemu/memalign.h"
 
 #include "nvme.h"
 #include "dif.h"
@@ -2168,6 +2169,50 @@  out:
     nvme_verify_cb(ctx, ret);
 }
 
+static void nvme_dif_pass_verify_cb(void *opaque, int ret)
+{
+    NvmeBounceContext *ctx = opaque;
+    NvmeRequest *req = ctx->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+    uint64_t slba = le64_to_cpu(rw->slba);
+    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
+    uint16_t apptag = le16_to_cpu(rw->apptag);
+    uint16_t appmask = le16_to_cpu(rw->appmask);
+    uint32_t reftag = le32_to_cpu(rw->reftag);
+
+    trace_pci_nvme_dif_pass_verify_cb(nvme_cid(req));
+    if (trace_event_get_state_backends(TRACE_PCI_NVME_DIF_DUMP_PASS_PI)) {
+        nvme_dif_pass_dump(ns, ctx->data.iov.dif.iov_base,
+                           ctx->data.iov.dif.iov_len);
+    }
+
+    if (unlikely(ret == -EILSEQ)) {
+        req->status = nvme_dif_pass_check(ns, ctx->data.bounce,
+                          ctx->data.iov.size, ctx->data.iov.dif.iov_base,
+                          prinfo, slba, reftag);
+        if (req->status) {
+            /* zero out ret to allow req->status passthrough */
+            ret = 0;
+        }
+        goto out;
+    }
+
+    if (ret) {
+        goto out;
+    }
+
+    req->status = nvme_dif_pass_apptag_check(ns, ctx->data.iov.dif.iov_base,
+                      ctx->data.iov.dif.iov_len, prinfo, apptag, appmask);
+
+out:
+    qemu_iovec_destroy_pi(&ctx->data.iov);
+    qemu_vfree(ctx->data.bounce);
+    g_free(ctx);
+
+    nvme_rw_complete_cb(req, ret);
+}
+
 struct nvme_compare_ctx {
     struct {
         QEMUIOVector iov;
@@ -2331,6 +2376,83 @@  out:
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+static void nvme_dif_pass_compare_cb(void *opaque, int ret)
+{
+    NvmeRequest *req = opaque;
+    NvmeCtrl *n = nvme_ctrl(req);
+    NvmeNamespace *ns = req->ns;
+    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+    uint64_t slba = le64_to_cpu(rw->slba);
+    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
+    size_t mlen = nvme_m2b(ns, nlb);
+    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
+    uint16_t apptag = le16_to_cpu(rw->apptag);
+    uint16_t appmask = le16_to_cpu(rw->appmask);
+    uint32_t reftag = le32_to_cpu(rw->reftag);
+    struct nvme_compare_ctx *ctx = req->opaque;
+    g_autofree uint8_t *buf = NULL;
+    uint16_t status;
+
+    trace_pci_nvme_dif_pass_compare_cb(nvme_cid(req));
+    if (trace_event_get_state_backends(TRACE_PCI_NVME_DIF_DUMP_PASS_PI)) {
+        nvme_dif_pass_dump(ns, ctx->data.iov.dif.iov_base,
+                           ctx->data.iov.dif.iov_len);
+    }
+
+    if (unlikely(ret == -EILSEQ)) {
+        status = nvme_dif_pass_check(ns, ctx->data.bounce, ctx->data.iov.size,
+                                     ctx->data.iov.dif.iov_base, prinfo, slba,
+                                     reftag);
+        if (status) {
+            /* zero out ret to allow req->status passthrough */
+            ret = 0;
+            req->status = status;
+        }
+        goto out;
+    }
+
+    if (ret) {
+        goto out;
+    }
+
+    status = nvme_dif_pass_apptag_check(ns, ctx->data.iov.dif.iov_base,
+                      ctx->data.iov.dif.iov_len, prinfo, apptag, appmask);
+    if (status) {
+        req->status = status;
+        goto out;
+    }
+
+    buf = g_malloc(ctx->data.iov.size);
+    status = nvme_bounce_data(n, buf, ctx->data.iov.size,
+                              NVME_TX_DIRECTION_TO_DEVICE, req);
+    if (status) {
+        req->status = status;
+        goto out;
+    }
+    if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
+        req->status = NVME_CMP_FAILURE;
+        goto out;
+    }
+
+    ctx->mdata.bounce = g_malloc(mlen);
+    status = nvme_bounce_mdata(n, ctx->mdata.bounce, mlen,
+                               NVME_TX_DIRECTION_TO_DEVICE, req);
+    if (status) {
+        req->status = status;
+        goto out;
+    }
+    if (memcmp(ctx->mdata.bounce, ctx->data.iov.dif.iov_base, mlen)) {
+        req->status = NVME_CMP_FAILURE;
+    }
+
+out:
+    qemu_iovec_destroy_pi(&ctx->data.iov);
+    qemu_vfree(ctx->data.bounce);
+    g_free(ctx);
+
+    nvme_rw_complete_cb(req, ret);
+}
+
 typedef struct NvmeDSMAIOCB {
     BlockAIOCB common;
     BlockAIOCB *aiocb;
@@ -2395,7 +2517,7 @@  static void nvme_dsm_md_cb(void *opaque, int ret)
         goto done;
     }
 
-    if (!ns->lbaf.ms) {
+    if (!ns->lbaf.ms || ns->pip) {
         nvme_dsm_cb(iocb, 0);
         return;
     }
@@ -2556,19 +2678,35 @@  static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
         }
     }
 
-    ctx = g_new0(NvmeBounceContext, 1);
-    ctx->req = req;
+    if (ns->pip) {
+        ctx = g_new0(NvmeBounceContext, 1);
+        ctx->req = req;
 
-    ctx->data.bounce = g_malloc(len);
+        ctx->data.bounce = qemu_memalign(qemu_real_host_page_size(), len);
 
-    qemu_iovec_init(&ctx->data.iov, 1);
-    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
+        qemu_iovec_init_pi(&ctx->data.iov, 1, nlb);
+        qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
 
-    block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
-                     BLOCK_ACCT_READ);
+        block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
+                         BLOCK_ACCT_READ);
+
+        req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
+                                    nvme_dif_pass_verify_cb, ctx);
+    } else {
+        ctx = g_new0(NvmeBounceContext, 1);
+        ctx->req = req;
+
+        ctx->data.bounce = g_malloc(len);
+
+        qemu_iovec_init(&ctx->data.iov, 1);
+        qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
+
+        block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
+                         BLOCK_ACCT_READ);
 
-    req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
-                                nvme_verify_mdata_in_cb, ctx);
+        req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
+                                    nvme_verify_mdata_in_cb, ctx);
+    }
     return NVME_NO_COMPLETE;
 }
 
@@ -2625,7 +2763,11 @@  static void nvme_copy_bh(void *opaque)
         req->cqe.result = cpu_to_le32(iocb->idx);
     }
 
-    qemu_iovec_destroy(&iocb->iov);
+    if (ns->pip) {
+        qemu_iovec_destroy_pi(&iocb->iov);
+    } else {
+        qemu_iovec_destroy(&iocb->iov);
+    }
     g_free(iocb->bounce);
 
     qemu_bh_delete(iocb->bh);
@@ -2737,10 +2879,29 @@  static void nvme_copy_out_completed_cb(void *opaque, int ret)
     NvmeRequest *req = iocb->req;
     NvmeNamespace *ns = req->ns;
     uint32_t nlb;
+    uint16_t status;
 
     nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
                                  &nlb, NULL, NULL, NULL);
 
+    if (ns->pip) {
+        if (iocb->iov.dif.iov_len) {
+            NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
+            uint64_t slba = le64_to_cpu(copy->sdlba);
+            uint16_t prinfo = ((copy->control[2] >> 2) & 0xf);
+            size_t len = nvme_l2b(ns, nlb);
+            if (unlikely(ret == -EILSEQ)) {
+                status = nvme_dif_pass_check(ns, iocb->bounce, len,
+                             iocb->iov.dif.iov_base, prinfo, slba,
+                             iocb->reftag);
+                if (status) {
+                    goto invalid;
+                }
+            }
+        }
+
+        iocb->reftag += nlb;
+    }
     if (ret < 0) {
         iocb->ret = ret;
         goto out;
@@ -2754,8 +2915,17 @@  static void nvme_copy_out_completed_cb(void *opaque, int ret)
 
     iocb->idx++;
     iocb->slba += nlb;
+
 out:
     nvme_copy_cb(iocb, iocb->ret);
+    return;
+
+invalid:
+    req->status = status;
+    iocb->aiocb = NULL;
+    if (iocb->bh) {
+        qemu_bh_schedule(iocb->bh);
+    }
 }
 
 static void nvme_copy_out_cb(void *opaque, int ret)
@@ -2900,6 +3070,99 @@  out:
     nvme_copy_cb(iocb, ret);
 }
 
+static void nvme_dif_pass_copy_cb(void *opaque, int ret)
+{
+    NvmeCopyAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
+    uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
+    uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
+    uint32_t nlb;
+    size_t len;
+    uint16_t status;
+    uint64_t slba;
+    uint16_t apptag;
+    uint16_t appmask;
+    uint64_t reftag;
+
+    nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
+                                 &nlb, &apptag, &appmask, &reftag);
+    len = nvme_l2b(ns, nlb);
+
+    if (unlikely(ret == -EILSEQ)) {
+        status = nvme_dif_pass_check(ns, iocb->bounce, len,
+                                     iocb->iov.dif.iov_base, prinfor, slba,
+                                     reftag);
+        if (status) {
+            goto invalid;
+        }
+    }
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto out;
+    } else if (iocb->ret < 0) {
+        goto out;
+    }
+
+    status = nvme_dif_pass_apptag_check(ns, iocb->iov.dif.iov_base,
+                                        nvme_m2b(ns, nlb), prinfor, apptag,
+                                        appmask);
+    if (status) {
+        goto invalid;
+    }
+
+    status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
+    if (status) {
+        goto invalid;
+    }
+    status = nvme_check_bounds(ns, iocb->slba, nlb);
+    if (status) {
+        goto invalid;
+    }
+
+    if (ns->params.zoned) {
+        status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
+        if (status) {
+            goto invalid;
+        }
+
+        iocb->zone->w_ptr += nlb;
+    }
+
+    if (prinfow & NVME_PRINFO_PRACT) {
+        qemu_iovec_reset(&iocb->iov);
+        qemu_iovec_add(&iocb->iov, iocb->bounce, len);
+    } else {
+        appmask = le16_to_cpu(copy->appmask);
+        apptag = le16_to_cpu(copy->apptag);
+        status = nvme_dif_pass_apptag_check(ns, iocb->iov.dif.iov_base,
+                                            nvme_m2b(ns, nlb), prinfow, apptag,
+                                            appmask);
+        if (status) {
+            goto invalid;
+        }
+    }
+    iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
+                                  &iocb->iov, 0, nvme_copy_out_completed_cb,
+                                  iocb);
+
+    return;
+
+invalid:
+    req->status = status;
+    iocb->aiocb = NULL;
+    if (iocb->bh) {
+        qemu_bh_schedule(iocb->bh);
+    }
+
+    return;
+
+out:
+    nvme_copy_cb(iocb, ret);
+}
+
 static void nvme_copy_in_cb(void *opaque, int ret)
 {
     NvmeCopyAIOCB *iocb = opaque;
@@ -2943,6 +3206,7 @@  static void nvme_copy_cb(void *opaque, int ret)
     NvmeNamespace *ns = req->ns;
     uint64_t slba;
     uint32_t nlb;
+    uint64_t reftag;
     size_t len;
     uint16_t status;
 
@@ -2958,7 +3222,7 @@  static void nvme_copy_cb(void *opaque, int ret)
     }
 
     nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
-                                 &nlb, NULL, NULL, NULL);
+                                 &nlb, NULL, NULL, &reftag);
     len = nvme_l2b(ns, nlb);
 
     trace_pci_nvme_copy_source_range(slba, nlb);
@@ -2990,8 +3254,21 @@  static void nvme_copy_cb(void *opaque, int ret)
     qemu_iovec_reset(&iocb->iov);
     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
 
-    iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
-                                 &iocb->iov, 0, nvme_copy_in_cb, iocb);
+    if (ns->pip) {
+        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
+        uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
+        status = nvme_check_prinfo(ns, prinfor, slba, reftag);
+        if (status) {
+            goto invalid;
+        }
+        iocb->iov.dif.iov_len = nvme_m2b(ns, nlb);
+        iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
+                                     &iocb->iov, 0, nvme_dif_pass_copy_cb,
+                                     iocb);
+    } else {
+        iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
+                                     &iocb->iov, 0, nvme_copy_in_cb, iocb);
+    }
     return;
 
 invalid:
@@ -3078,11 +3355,17 @@  static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
     iocb->idx = 0;
     iocb->reftag = le32_to_cpu(copy->reftag);
     iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
-    iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
-                              ns->lbasz + ns->lbaf.ms);
 
-    qemu_iovec_init(&iocb->iov, 1);
-
+    if (ns->pip) {
+        qemu_iovec_init_pi(&iocb->iov, 1, le16_to_cpu(ns->id_ns.mssrl));
+        iocb->bounce = qemu_memalign(qemu_real_host_page_size(),
+                                     le16_to_cpu(ns->id_ns.mssrl) * ns->lbasz);
+    } else {
+        qemu_iovec_init(&iocb->iov, 1);
+        iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
+                                  ns->lbasz + ns->lbaf.ms);
+    }
+
     block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
                      BLOCK_ACCT_READ);
     block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
@@ -3145,18 +3428,31 @@  static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
         return status;
     }
 
-    ctx = g_new(struct nvme_compare_ctx, 1);
-    ctx->data.bounce = g_malloc(data_len);
+    if (ns->pip) {
+        ctx = g_new0(struct nvme_compare_ctx, 1);
+        ctx->data.bounce = qemu_memalign(qemu_real_host_page_size(), data_len);
+
+        req->opaque = ctx;
 
-    req->opaque = ctx;
+        qemu_iovec_init_pi(&ctx->data.iov, 1, nlb);
+        qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
+        block_acct_start(blk_get_stats(blk), &req->acct, data_len,
+                         BLOCK_ACCT_READ);
+        req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
+                                    nvme_dif_pass_compare_cb, req);
+    } else {
+        ctx = g_new(struct nvme_compare_ctx, 1);
+        ctx->data.bounce = g_malloc(data_len);
 
-    qemu_iovec_init(&ctx->data.iov, 1);
-    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
+        req->opaque = ctx;
 
-    block_acct_start(blk_get_stats(blk), &req->acct, data_len,
-                     BLOCK_ACCT_READ);
-    req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
-                                nvme_compare_data_cb, req);
+        qemu_iovec_init(&ctx->data.iov, 1);
+        qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
+        block_acct_start(blk_get_stats(blk), &req->acct, data_len,
+                         BLOCK_ACCT_READ);
+        req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
+                                    nvme_compare_data_cb, req);
+    }
 
     return NVME_NO_COMPLETE;
 }
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index 259fa8ffa2..42c171ed72 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -41,12 +41,14 @@  pci_nvme_copy_out(uint64_t slba, uint32_t nlb) "slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_verify(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_verify_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
 pci_nvme_verify_cb(uint16_t cid, uint8_t prinfo, uint16_t apptag, uint16_t appmask, uint32_t reftag) "cid %"PRIu16" prinfo 0x%"PRIx8" apptag 0x%"PRIx16" appmask 0x%"PRIx16" reftag 0x%"PRIx32""
+pci_nvme_dif_pass_verify_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_rw_complete_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
 pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
 pci_nvme_dsm(uint32_t nr, uint32_t attr) "nr %"PRIu32" attr 0x%"PRIx32""
 pci_nvme_dsm_deallocate(uint64_t slba, uint32_t nlb) "slba %"PRIu64" nlb %"PRIu32""
 pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32""
 pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
+pci_nvme_dif_pass_compare_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_compare_data_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_compare_mdata_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""