diff mbox series

[2/2] hw/block/nvme: add write uncorrectable command

Message ID 20210210070646.730110-3-its@irrelevant.dk
State New
Headers show
Series hw/block/nvme: oncs and write uncorrectable support | expand

Commit Message

Klaus Jensen Feb. 10, 2021, 7:06 a.m. UTC
From: Gollu Appalanaidu <anaidu.gollu@samsung.com>

Add support for marking blocks invalid with the Write Uncorrectable
command. Block status is tracked in a (non-persistent) bitmap that is
checked on all reads and written to on all writes. This is potentially
expensive, so keep Write Uncorrectable disabled by default.

Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 docs/specs/nvme.txt   |  3 ++
 hw/block/nvme-ns.h    |  2 ++
 hw/block/nvme.h       |  1 +
 hw/block/nvme-ns.c    |  2 ++
 hw/block/nvme.c       | 65 +++++++++++++++++++++++++++++++++++++------
 hw/block/trace-events |  1 +
 6 files changed, 66 insertions(+), 8 deletions(-)

Comments

Minwoo Im Feb. 10, 2021, 11:14 a.m. UTC | #1
On 21-02-10 08:06:46, Klaus Jensen wrote:
> From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> 
> Add support for marking blocks invalid with the Write Uncorrectable
> command. Block status is tracked in a (non-persistent) bitmap that is
> checked on all reads and written to on all writes. This is potentially
> expensive, so keep Write Uncorrectable disabled by default.
> 
> Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> ---
>  docs/specs/nvme.txt   |  3 ++
>  hw/block/nvme-ns.h    |  2 ++
>  hw/block/nvme.h       |  1 +
>  hw/block/nvme-ns.c    |  2 ++
>  hw/block/nvme.c       | 65 +++++++++++++++++++++++++++++++++++++------
>  hw/block/trace-events |  1 +
>  6 files changed, 66 insertions(+), 8 deletions(-)
> 
> diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
> index 56d393884e7a..88f9cc278d4c 100644
> --- a/docs/specs/nvme.txt
> +++ b/docs/specs/nvme.txt
> @@ -19,5 +19,8 @@ Known issues
>  
>  * The accounting numbers in the SMART/Health are reset across power cycles
>  
> +* Marking blocks invalid with the Write Uncorrectable is not persisted across
> +  power cycles.
> +
>  * Interrupt Coalescing is not supported and is disabled by default in volation
>    of the specification.
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 7af6884862b5..15fa422ded03 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -72,6 +72,8 @@ typedef struct NvmeNamespace {
>      struct {
>          uint32_t err_rec;
>      } features;
> +
> +    unsigned long *uncorrectable;
>  } NvmeNamespace;
>  
>  static inline uint32_t nvme_nsid(NvmeNamespace *ns)
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 98082b2dfba3..9b8f85b9cf16 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -68,6 +68,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
>      case NVME_CMD_FLUSH:            return "NVME_NVM_CMD_FLUSH";
>      case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
>      case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
> +    case NVME_CMD_WRITE_UNCOR:      return "NVME_CMD_WRITE_UNCOR";
>      case NVME_CMD_COMPARE:          return "NVME_NVM_CMD_COMPARE";
>      case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
>      case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index ade46e2f3739..742bbc4b4b62 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -72,6 +72,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
>      id_ns->mcl = cpu_to_le32(ns->params.mcl);
>      id_ns->msrc = ns->params.msrc;
>  
> +    ns->uncorrectable = bitmap_new(id_ns->nsze);
> +
>      return 0;
>  }
>  
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index e5f6666725d7..56048046c193 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1112,6 +1112,20 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
>      return NVME_SUCCESS;
>  }
>  
> +static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba,
> +                                        uint32_t nlb)
> +{
> +    uint64_t elba = nlb + slba;
> +
> +    if (ns->uncorrectable) {
> +        if (find_next_bit(ns->uncorrectable, elba, slba) < elba) {
> +            return NVME_UNRECOVERED_READ | NVME_DNR;
> +        }
> +    }
> +
> +    return NVME_SUCCESS;
> +}
> +
>  static void nvme_aio_err(NvmeRequest *req, int ret)
>  {
>      uint16_t status = NVME_SUCCESS;
> @@ -1423,14 +1437,24 @@ static void nvme_rw_cb(void *opaque, int ret)
>      BlockAcctCookie *acct = &req->acct;
>      BlockAcctStats *stats = blk_get_stats(blk);
>  
> +    bool is_write = nvme_is_write(req);
> +
>      trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
>  
> -    if (ns->params.zoned && nvme_is_write(req)) {
> +    if (ns->params.zoned && is_write) {
>          nvme_finalize_zoned_write(ns, req);
>      }
>  
>      if (!ret) {
>          block_acct_done(stats, acct);
> +
> +        if (is_write) {
> +            NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
> +            uint64_t slba = le64_to_cpu(rw->slba);
> +            uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
> +
> +            bitmap_clear(ns->uncorrectable, slba, nlb);

It might be nitpick, 'nlb' would easily represent the value which is
defined itself in the spec which is zero-based.  Can we have this like:

	uint32_t nlb = le16_to_cpu(rw->nlb);

	bitmap_clear(ns->uncorrectable, slba, nlb + 1);

Otherwise, it looks good to me.

Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Klaus Jensen Feb. 10, 2021, 11:42 a.m. UTC | #2
On Feb 10 20:14, Minwoo Im wrote:
> On 21-02-10 08:06:46, Klaus Jensen wrote:
> > From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> > 
> > Add support for marking blocks invalid with the Write Uncorrectable
> > command. Block status is tracked in a (non-persistent) bitmap that is
> > checked on all reads and written to on all writes. This is potentially
> > expensive, so keep Write Uncorrectable disabled by default.
> > 
> > Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> > ---
> >  docs/specs/nvme.txt   |  3 ++
> >  hw/block/nvme-ns.h    |  2 ++
> >  hw/block/nvme.h       |  1 +
> >  hw/block/nvme-ns.c    |  2 ++
> >  hw/block/nvme.c       | 65 +++++++++++++++++++++++++++++++++++++------
> >  hw/block/trace-events |  1 +
> >  6 files changed, 66 insertions(+), 8 deletions(-)
> > 
> > diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
> > index 56d393884e7a..88f9cc278d4c 100644
> > --- a/docs/specs/nvme.txt
> > +++ b/docs/specs/nvme.txt
> > @@ -19,5 +19,8 @@ Known issues
> >  
> >  * The accounting numbers in the SMART/Health are reset across power cycles
> >  
> > +* Marking blocks invalid with the Write Uncorrectable is not persisted across
> > +  power cycles.
> > +
> >  * Interrupt Coalescing is not supported and is disabled by default in volation
> >    of the specification.
> > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > index 7af6884862b5..15fa422ded03 100644
> > --- a/hw/block/nvme-ns.h
> > +++ b/hw/block/nvme-ns.h
> > @@ -72,6 +72,8 @@ typedef struct NvmeNamespace {
> >      struct {
> >          uint32_t err_rec;
> >      } features;
> > +
> > +    unsigned long *uncorrectable;
> >  } NvmeNamespace;
> >  
> >  static inline uint32_t nvme_nsid(NvmeNamespace *ns)
> > diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> > index 98082b2dfba3..9b8f85b9cf16 100644
> > --- a/hw/block/nvme.h
> > +++ b/hw/block/nvme.h
> > @@ -68,6 +68,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
> >      case NVME_CMD_FLUSH:            return "NVME_NVM_CMD_FLUSH";
> >      case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
> >      case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
> > +    case NVME_CMD_WRITE_UNCOR:      return "NVME_CMD_WRITE_UNCOR";
> >      case NVME_CMD_COMPARE:          return "NVME_NVM_CMD_COMPARE";
> >      case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
> >      case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
> > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > index ade46e2f3739..742bbc4b4b62 100644
> > --- a/hw/block/nvme-ns.c
> > +++ b/hw/block/nvme-ns.c
> > @@ -72,6 +72,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
> >      id_ns->mcl = cpu_to_le32(ns->params.mcl);
> >      id_ns->msrc = ns->params.msrc;
> >  
> > +    ns->uncorrectable = bitmap_new(id_ns->nsze);
> > +
> >      return 0;
> >  }
> >  
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index e5f6666725d7..56048046c193 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -1112,6 +1112,20 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
> >      return NVME_SUCCESS;
> >  }
> >  
> > +static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba,
> > +                                        uint32_t nlb)
> > +{
> > +    uint64_t elba = nlb + slba;
> > +
> > +    if (ns->uncorrectable) {
> > +        if (find_next_bit(ns->uncorrectable, elba, slba) < elba) {
> > +            return NVME_UNRECOVERED_READ | NVME_DNR;
> > +        }
> > +    }
> > +
> > +    return NVME_SUCCESS;
> > +}
> > +
> >  static void nvme_aio_err(NvmeRequest *req, int ret)
> >  {
> >      uint16_t status = NVME_SUCCESS;
> > @@ -1423,14 +1437,24 @@ static void nvme_rw_cb(void *opaque, int ret)
> >      BlockAcctCookie *acct = &req->acct;
> >      BlockAcctStats *stats = blk_get_stats(blk);
> >  
> > +    bool is_write = nvme_is_write(req);
> > +
> >      trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
> >  
> > -    if (ns->params.zoned && nvme_is_write(req)) {
> > +    if (ns->params.zoned && is_write) {
> >          nvme_finalize_zoned_write(ns, req);
> >      }
> >  
> >      if (!ret) {
> >          block_acct_done(stats, acct);
> > +
> > +        if (is_write) {
> > +            NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
> > +            uint64_t slba = le64_to_cpu(rw->slba);
> > +            uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
> > +
> > +            bitmap_clear(ns->uncorrectable, slba, nlb);
> 
> It might be nitpick, 'nlb' would easily represent the value which is
> defined itself in the spec which is zero-based.  Can we have this like:
> 
> 	uint32_t nlb = le16_to_cpu(rw->nlb);
> 
> 	bitmap_clear(ns->uncorrectable, slba, nlb + 1);
> 


I do not disagree, but the `uint32_t nlb = le16_to_cpu(rw->nlb) + 1;`
pattern is already used in several places.

> Otherwise, it looks good to me.
> 
> Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Minwoo Im Feb. 10, 2021, 3:28 p.m. UTC | #3
> > It might be nitpick, 'nlb' would easily represent the value which is
> > defined itself in the spec which is zero-based.  Can we have this like:
> > 
> > 	uint32_t nlb = le16_to_cpu(rw->nlb);
> > 
> > 	bitmap_clear(ns->uncorrectable, slba, nlb + 1);
> > 
> 
> 
> I do not disagree, but the `uint32_t nlb = le16_to_cpu(rw->nlb) + 1;`
> pattern is already used in several places.

Oh yes, Now I just saw some places.  Then, please take my review tag for
this patch.

Thanks!
Keith Busch Feb. 11, 2021, 3:37 a.m. UTC | #4
On Wed, Feb 10, 2021 at 08:06:46AM +0100, Klaus Jensen wrote:
> From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> 
> Add support for marking blocks invalid with the Write Uncorrectable
> command. Block status is tracked in a (non-persistent) bitmap that is
> checked on all reads and written to on all writes. This is potentially
> expensive, so keep Write Uncorrectable disabled by default.

I really think attempting to emulate all these things is putting a
potentially unnecessary maintenance burden on this device.

The DULBE implementation started off similiar, but I suggested it
leverage support out of the backing file, and I feel it ended up better
for it.

But unlike punching and checking for holes, there's no filesystem
support for Write Uncorrectable in our qemu API, and that's probably
because this is kind of a niche feature. Is there a use case with a
real qemu guest wanting this?
Klaus Jensen Feb. 11, 2021, 8:43 a.m. UTC | #5
On Feb 11 12:37, Keith Busch wrote:
> On Wed, Feb 10, 2021 at 08:06:46AM +0100, Klaus Jensen wrote:
> > From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> > 
> > Add support for marking blocks invalid with the Write Uncorrectable
> > command. Block status is tracked in a (non-persistent) bitmap that is
> > checked on all reads and written to on all writes. This is potentially
> > expensive, so keep Write Uncorrectable disabled by default.
> 
> I really think attempting to emulate all these things is putting a
> potentially unnecessary maintenance burden on this device.
> 

Even though I myself posted the Telemetry support on behalf of Gollu, I
now agree that it would bloat the device with a "useless" feature. We
totally accept that and in that initial form there was not a lot of bang
for bucks.

This emulated feature comes at a pretty low cost in terms of code and
complexity, but I won't argue beyond that. However, it does comes at a
performance cost, which is why the intention was to disable it by
default.

> The DULBE implementation started off similiar, but I suggested it
> leverage support out of the backing file, and I feel it ended up better
> for it.
> 

And thanks for pushing back against that - the solution we ended up with
was totally superior, no doubt about that!

> But unlike punching and checking for holes, there's no filesystem
> support for Write Uncorrectable in our qemu API, and that's probably
> because this is kind of a niche feature.

True. I don't see any trivial way to support this on a lower level. Even
if we persued implementing this in the QEMU block layer I only think it
could be supported by "integrated formats" like QCOW2 that could
optionally include a persistent bitmap, like the dirty bitmap feature.

> Is there a use case with a real qemu guest wanting this?

Like for the extended metadata case (which also does not have a lot of
"public" exposure, but definitely have enterprise use), our main
motivation here was to ease testing for compliance suites and frameworks
like SPDK. I'm honestly have no clue what so ever what a real world use
of Write Uncorrectable would be. It's been in the spec since 1.0, so
there must have been some reason, Is it just to align with SCSI WRITE
LONG? I'm not SCSI expert at all, but from what I can read it looks like
that was also intended as a feature for testing read error conditions.
Keith Busch Feb. 11, 2021, 3:37 p.m. UTC | #6
On Thu, Feb 11, 2021 at 09:43:05AM +0100, Klaus Jensen wrote:
> On Feb 11 12:37, Keith Busch wrote:
> 
> > Is there a use case with a real qemu guest wanting this?
> 
> Like for the extended metadata case (which also does not have a lot of
> "public" exposure, but definitely have enterprise use), our main
> motivation here was to ease testing for compliance suites and frameworks
> like SPDK. 

I'm okay with the metadata patches.

> I'm honestly have no clue what so ever what a real world use
> of Write Uncorrectable would be. It's been in the spec since 1.0, so
> there must have been some reason, Is it just to align with SCSI WRITE
> LONG? I'm not SCSI expert at all, but from what I can read it looks like
> that was also intended as a feature for testing read error conditions.

I don't think it's for testing purposes.

If you need to send a burst of non-atomic writes (ex: writing a RAID
stripe), a power failure can result in an inconsistent state where you
don't know at a block level which ones have old data or new data. If you
Write Uncorrectable first, you can never read old data, and thus have no
"write hole".

Journalling solves this better, and I'm not aware of any real
implementation relying on uncorrectable.
Klaus Jensen Feb. 11, 2021, 5:54 p.m. UTC | #7
On Feb 12 00:37, Keith Busch wrote:
> On Thu, Feb 11, 2021 at 09:43:05AM +0100, Klaus Jensen wrote:
> > On Feb 11 12:37, Keith Busch wrote:
> > 
> > > Is there a use case with a real qemu guest wanting this?
> > 
> > Like for the extended metadata case (which also does not have a lot of
> > "public" exposure, but definitely have enterprise use), our main
> > motivation here was to ease testing for compliance suites and frameworks
> > like SPDK. 
> 
> I'm okay with the metadata patches.
> 
> > I'm honestly have no clue what so ever what a real world use
> > of Write Uncorrectable would be. It's been in the spec since 1.0, so
> > there must have been some reason, Is it just to align with SCSI WRITE
> > LONG? I'm not SCSI expert at all, but from what I can read it looks like
> > that was also intended as a feature for testing read error conditions.
> 
> I don't think it's for testing purposes.
> 
> If you need to send a burst of non-atomic writes (ex: writing a RAID
> stripe), a power failure can result in an inconsistent state where you
> don't know at a block level which ones have old data or new data. If you
> Write Uncorrectable first, you can never read old data, and thus have no
> "write hole".
> 
> Journalling solves this better, and I'm not aware of any real
> implementation relying on uncorrectable.

Right, thanks! I'm aware of the RAID write hole issue, but I did not
consider Write Uncorrectable as a possible means to solve it.
diff mbox series

Patch

diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
index 56d393884e7a..88f9cc278d4c 100644
--- a/docs/specs/nvme.txt
+++ b/docs/specs/nvme.txt
@@ -19,5 +19,8 @@  Known issues
 
 * The accounting numbers in the SMART/Health are reset across power cycles
 
+* Marking blocks invalid with the Write Uncorrectable is not persisted across
+  power cycles.
+
 * Interrupt Coalescing is not supported and is disabled by default in volation
   of the specification.
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7af6884862b5..15fa422ded03 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -72,6 +72,8 @@  typedef struct NvmeNamespace {
     struct {
         uint32_t err_rec;
     } features;
+
+    unsigned long *uncorrectable;
 } NvmeNamespace;
 
 static inline uint32_t nvme_nsid(NvmeNamespace *ns)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 98082b2dfba3..9b8f85b9cf16 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -68,6 +68,7 @@  static inline const char *nvme_io_opc_str(uint8_t opc)
     case NVME_CMD_FLUSH:            return "NVME_NVM_CMD_FLUSH";
     case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
     case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
+    case NVME_CMD_WRITE_UNCOR:      return "NVME_CMD_WRITE_UNCOR";
     case NVME_CMD_COMPARE:          return "NVME_NVM_CMD_COMPARE";
     case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
     case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index ade46e2f3739..742bbc4b4b62 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -72,6 +72,8 @@  static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
     id_ns->mcl = cpu_to_le32(ns->params.mcl);
     id_ns->msrc = ns->params.msrc;
 
+    ns->uncorrectable = bitmap_new(id_ns->nsze);
+
     return 0;
 }
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e5f6666725d7..56048046c193 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1112,6 +1112,20 @@  static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
     return NVME_SUCCESS;
 }
 
+static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba,
+                                        uint32_t nlb)
+{
+    uint64_t elba = nlb + slba;
+
+    if (ns->uncorrectable) {
+        if (find_next_bit(ns->uncorrectable, elba, slba) < elba) {
+            return NVME_UNRECOVERED_READ | NVME_DNR;
+        }
+    }
+
+    return NVME_SUCCESS;
+}
+
 static void nvme_aio_err(NvmeRequest *req, int ret)
 {
     uint16_t status = NVME_SUCCESS;
@@ -1423,14 +1437,24 @@  static void nvme_rw_cb(void *opaque, int ret)
     BlockAcctCookie *acct = &req->acct;
     BlockAcctStats *stats = blk_get_stats(blk);
 
+    bool is_write = nvme_is_write(req);
+
     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
 
-    if (ns->params.zoned && nvme_is_write(req)) {
+    if (ns->params.zoned && is_write) {
         nvme_finalize_zoned_write(ns, req);
     }
 
     if (!ret) {
         block_acct_done(stats, acct);
+
+        if (is_write) {
+            NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+            uint64_t slba = le64_to_cpu(rw->slba);
+            uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
+
+            bitmap_clear(ns->uncorrectable, slba, nlb);
+        }
     } else {
         block_acct_failed(stats, acct);
         nvme_aio_err(req, ret);
@@ -1521,13 +1545,13 @@  static void nvme_copy_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
     NvmeNamespace *ns = req->ns;
+    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
+    uint64_t sdlba = le64_to_cpu(copy->sdlba);
     struct nvme_copy_ctx *ctx = req->opaque;
 
     trace_pci_nvme_copy_cb(nvme_cid(req));
 
     if (ns->params.zoned) {
-        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
-        uint64_t sdlba = le64_to_cpu(copy->sdlba);
         NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
 
         __nvme_advance_zone_wp(ns, zone, ctx->nlb);
@@ -1535,6 +1559,7 @@  static void nvme_copy_cb(void *opaque, int ret)
 
     if (!ret) {
         block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
+        bitmap_clear(ns->uncorrectable, sdlba, ctx->nlb);
     } else {
         block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
         nvme_aio_err(req, ret);
@@ -1953,6 +1978,12 @@  static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
+    status = nvme_check_uncor(ns, slba, nlb);
+    if (status) {
+        trace_pci_nvme_err_unrecoverable_read(slba, nlb);
+        return status;
+    }
+
     if (ns->params.zoned) {
         status = nvme_check_zone_read(ns, slba, nlb);
         if (status) {
@@ -1992,7 +2023,7 @@  invalid:
 }
 
 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
-                              bool wrz)
+                              bool wrz, bool uncor)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
@@ -2008,7 +2039,7 @@  static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
     trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
                          nvme_nsid(ns), nlb, data_size, slba);
 
-    if (!wrz) {
+    if (!wrz && !uncor) {
         status = nvme_check_mdts(n, data_size);
         if (status) {
             trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
@@ -2055,6 +2086,11 @@  static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
         zone->w_ptr += nlb;
     }
 
+    if (uncor) {
+        bitmap_set(ns->uncorrectable, slba, nlb);
+        return NVME_SUCCESS;
+    }
+
     data_offset = nvme_l2b(ns, slba);
 
     if (!wrz) {
@@ -2087,17 +2123,22 @@  invalid:
 
 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
 {
-    return nvme_do_write(n, req, false, false);
+    return nvme_do_write(n, req, false, false, false);
 }
 
 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
-    return nvme_do_write(n, req, false, true);
+    return nvme_do_write(n, req, false, true, false);
 }
 
 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
 {
-    return nvme_do_write(n, req, true, false);
+    return nvme_do_write(n, req, true, false, false);
+}
+
+static inline uint16_t nvme_write_uncor(NvmeCtrl *n, NvmeRequest *req)
+{
+    return nvme_do_write(n, req, false, false, true);
 }
 
 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
@@ -2596,6 +2637,8 @@  static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_flush(n, req);
     case NVME_CMD_WRITE_ZEROES:
         return nvme_write_zeroes(n, req);
+    case NVME_CMD_WRITE_UNCOR:
+        return nvme_write_uncor(n, req);
     case NVME_CMD_ZONE_APPEND:
         return nvme_zone_append(n, req);
     case NVME_CMD_WRITE:
@@ -4514,6 +4557,11 @@  static void nvme_init_cse_iocs(NvmeCtrl *n)
     n->iocs.nvm[NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC;
     n->iocs.nvm[NVME_CMD_READ]  = NVME_CMD_EFF_CSUPP;
 
+    if (oncs & NVME_ONCS_WRITE_UNCORR) {
+        n->iocs.nvm[NVME_CMD_WRITE_UNCOR] = NVME_CMD_EFF_CSUPP |
+            NVME_CMD_EFF_LBCC;
+    }
+
     if (oncs & NVME_ONCS_WRITE_ZEROES) {
         n->iocs.nvm[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP |
             NVME_CMD_EFF_LBCC;
@@ -4853,6 +4901,7 @@  static void nvme_exit(PCIDevice *pci_dev)
         }
 
         nvme_ns_cleanup(ns);
+        g_free(ns->uncorrectable);
     }
 
     g_free(n->cq);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 4b5ee04024f4..f30ef220c26a 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -128,6 +128,7 @@  pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PR
 pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
 pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
 pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+pci_nvme_err_unrecoverable_read(uint64_t start, uint32_t nlb) "islba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64""
 pci_nvme_err_cmb_invalid_cba(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64""
 pci_nvme_err_cmb_not_enabled(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64""