
[2/2] block/nvme: add support for discard

Message ID 20190825071541.10389-3-mlevitsk@redhat.com
State New
Series block/nvme: add support for write zeros and discard

Commit Message

Maxim Levitsky Aug. 25, 2019, 7:15 a.m. UTC
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 block/nvme.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 block/trace-events |  2 ++
 2 files changed, 85 insertions(+)

Comments

John Snow Aug. 27, 2019, 10:29 p.m. UTC | #1
On 8/25/19 3:15 AM, Maxim Levitsky wrote:
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  block/nvme.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
>  block/trace-events |  2 ++
>  2 files changed, 85 insertions(+)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index f8bd11e19a..dd041f39c9 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -112,6 +112,7 @@ typedef struct {
>      bool plugged;
>  
>      bool supports_write_zeros;
> +    bool supports_discard;
>  
>      CoMutex dma_map_lock;
>      CoQueue dma_flush_queue;
> @@ -463,6 +464,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
>  
>      oncs = le16_to_cpu(idctrl->oncs);
>      s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> +    s->supports_discard = (oncs & NVME_ONCS_DSM) != 0;

Same comment -- checking !!(register & FIELD) is nicer than the
negative. (I'm actually not sure even the !! is needed, but it seems to
be a QEMU-ism and I've caught myself using it...)

Rest looks good to me on a skim, but I'm not very well-versed in NVME.
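
[Note: once the result is assigned to a C bool, all of these spellings store
the same value; since C99, conversion to _Bool maps any nonzero value to 1,
so the choice is purely stylistic. A minimal sketch of the three equivalent
forms, with the NVME_ONCS_DSM value assumed here for illustration:

  #include <stdbool.h>
  #include <stdint.h>

  #define NVME_ONCS_DSM (1 << 2)   /* assumed bit position, for illustration */

  static bool dsm_explicit(uint16_t oncs)
  {
      return (oncs & NVME_ONCS_DSM) != 0;  /* the form in the patch */
  }

  static bool dsm_bangbang(uint16_t oncs)
  {
      return !!(oncs & NVME_ONCS_DSM);     /* the form suggested above */
  }

  static bool dsm_implicit(uint16_t oncs)
  {
      return oncs & NVME_ONCS_DSM;         /* conversion to bool normalizes */
  }
]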

> [...]
Maxim Levitsky Aug. 28, 2019, 9:03 a.m. UTC | #2
On Tue, 2019-08-27 at 18:29 -0400, John Snow wrote:
> 
> On 8/25/19 3:15 AM, Maxim Levitsky wrote:
> > [...]
> >      oncs = le16_to_cpu(idctrl->oncs);
> >      s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> > +    s->supports_discard = (oncs & NVME_ONCS_DSM) != 0;
> 
> Same comment -- checking !!(register & FIELD) is nicer than the
> negative. (I'm actually not sure even the !! is needed, but it seems to
> be a QEMU-ism and I've caught myself using it...)

All right, no problem to use !!

> 
> Rest looks good to me on a skim, but I'm not very well-versed in NVME.
Thanks!


> [...]


Best regards,
	Maxim Levitsky
Maxim Levitsky Sept. 5, 2019, 1:24 p.m. UTC | #3
On Wed, 2019-08-28 at 12:03 +0300, Maxim Levitsky wrote:
> On Tue, 2019-08-27 at 18:29 -0400, John Snow wrote:
> > 
> > On 8/25/19 3:15 AM, Maxim Levitsky wrote:
> > > [...]
> > >      oncs = le16_to_cpu(idctrl->oncs);
> > >      s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> > > +    s->supports_discard = (oncs & NVME_ONCS_DSM) != 0;
> > 
> > Same comment -- checking !!(register & FIELD) is nicer than the
> > negative. (I'm actually not sure even the !! is needed, but it seems to
> > be a QEMU-ism and I've caught myself using it...)
> 
> All right, no problem to use !!
> 
> > 
> > Rest looks good to me on a skim, but I'm not very well-versed in NVME.
> 
> Thanks!
> 

Kind ping about this patch series.

Apart from using !!, do you think that this patch series
can be merged, or should I do anything else?
Which tree do you think this should be committed to?

I kind of want to see that merged before the freeze
starts, if there are no objections,
to reduce the amount of pending stuff in my queue.


[...]



Best regards,
	Maxim Levitsky
John Snow Sept. 5, 2019, 5:27 p.m. UTC | #4
On 9/5/19 9:24 AM, Maxim Levitsky wrote:
> On Wed, 2019-08-28 at 12:03 +0300, Maxim Levitsky wrote:
>> [...]
> 
> Kind ping about this patch series.
> 
> Apart from using !!, do you think that this patch series
> can be merged, or should I do anything else?
> Which tree do you think this should be committed to?
> 
> I kind of want to see that merged before the freeze
> starts, if there are no objections,
> to reduce the amount of pending stuff in my queue.
> 

Didn't I ask a few other things?

like not using "res30" because you've moved the fields around, and
trying to be consistent about "zeros" vs "zeroes".

Removing "+#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) &
0x10)" because it's unused.

You also probably require review (or at least an ACK) from Keith Busch
who maintains this file.

--js
Maxim Levitsky Sept. 5, 2019, 5:32 p.m. UTC | #5
On Thu, 2019-09-05 at 13:27 -0400, John Snow wrote:
> 
> On 9/5/19 9:24 AM, Maxim Levitsky wrote:
> > On Wed, 2019-08-28 at 12:03 +0300, Maxim Levitsky wrote:
> > > [...]
> > 
> > Kind ping about this patch series.
> > 
> > Apart from using !!, do you think that this patch series
> > can be merged, or should I do anything else?
> > Which tree do you think this should be committed to?
> > 
> > I kind of want to see that merged before the freeze
> > starts, if there are no objections,
> > to reduce the amount of pending stuff in my queue.
> > 
> 
> Didn't I ask a few other things?
> 
> like not using "res30" because you've moved the fields around, and
> trying to be consistent about "zeros" vs "zeroes".
> 
> Removing "+#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) &
> 0x10)" because it's unused.


All right, I forgot about it; that's one of the joys of duplicating a kernel
driver in userspace...


> 
> You also probably require review (or at least an ACK) from Keith Busch
> who maintains this file.
> 
> --js

All right,
	Best regards,
		Maxim Levitsky
Max Reitz Sept. 9, 2019, 9:25 a.m. UTC | #6
On 05.09.19 19:27, John Snow wrote:

[...]

> You also probably require review (or at least an ACK) from Keith Busch
> who maintains this file.

Keith actually maintains the NVMe guest device; technically, Fam is the
NVMe block driver maintainer.

Max
John Snow Sept. 9, 2019, 5:03 p.m. UTC | #7
On 9/9/19 5:25 AM, Max Reitz wrote:
> On 05.09.19 19:27, John Snow wrote:
> 
> [...]
> 
>> You also probably require review (or at least an ACK) from Keith Busch
>> who maintains this file.
> 
> Keith actually maintains the NVMe guest device; technically, Fam is the
> NVMe block driver maintainer.

W h o o p s. Thanks for correcting me.

Well, if it's Fam -- he seems a little busier lately -- it's probably
not so crucial to gate on his approval. I thought it'd be nice to at
least get an ACK from someone who has used this module before, because I
haven't -- I was just giving some style review to help push it along.

(On that note, if you felt like my style review was wrong or isn't worth
doing -- it is always perfectly fair to just say so, along with some
reason as to why you won't -- that way patches won't rot on the list
when people may have gotten the impression that a V2 is warranted.)

--js
Paolo Bonzini Sept. 10, 2019, 2:49 p.m. UTC | #8
On 09/09/19 19:03, John Snow wrote:
> 
> 
> On 9/9/19 5:25 AM, Max Reitz wrote:
>> [...]
>> Keith actually maintains the NVMe guest device; technically, Fam is the
>> NVMe block driver maintainer.
> 
> W h o o p s. Thanks for correcting me.
> 
> Well, if it's Fam -- he seems a little busier lately -- it's probably
> not so crucial to gate on his approval. I thought it'd be nice to at
> least get an ACK from someone who has used this module before, because I
> haven't -- I was just giving some style review to help push it along.
> 
> (On that note, if you felt like my style review was wrong or isn't worth
> doing -- it is always perfectly fair to just say so, along with some
> reason as to why you won't -- that way patches won't rot on the list
> when people may have gotten the impression that a V2 is warranted.)

Looks good to me with the changes you pointed out (especially res30;
leaving out the unused macros is not so important).

Paolo
Maxim Levitsky Sept. 10, 2019, 2:57 p.m. UTC | #9
On Tue, 2019-09-10 at 16:49 +0200, Paolo Bonzini wrote:
> On 09/09/19 19:03, John Snow wrote:
> > [...]
> > 
> > (On that note, if you felt like my style review was wrong or isn't worth
> > doing -- it is always perfectly fair to just say so, along with some
> > reason as to why you won't -- that way patches won't rot on the list
> > when people may have gotten the impression that a V2 is warranted.)

Absolutely not, your review was fine! I have just been a bit lazy about
sending the next version of the patches before getting some indication of
whether anything else is needed for this to be merged, since the module
doesn't currently have an active maintainer.


> 
> Looks good to me with the changes you pointed out (especially res30;
> leaving out the unused macros is not so important).

All right, I'll send an updated version of those two patches soon.

Best regards,
	Maxim Levitsky

Patch

diff --git a/block/nvme.c b/block/nvme.c
index f8bd11e19a..dd041f39c9 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -112,6 +112,7 @@  typedef struct {
     bool plugged;
 
     bool supports_write_zeros;
+    bool supports_discard;
 
     CoMutex dma_map_lock;
     CoQueue dma_flush_queue;
@@ -463,6 +464,7 @@  static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 
     oncs = le16_to_cpu(idctrl->oncs);
     s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
+    s->supports_discard = (oncs & NVME_ONCS_DSM) != 0;
 
     memset(resp, 0, 4096);
 
@@ -1153,6 +1155,86 @@  static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
 }
 
 
+static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
+                                         int64_t offset,
+                                         int bytes)
+{
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *ioq = s->queues[1];
+    NVMeRequest *req;
+    NvmeDsmRange *buf;
+    QEMUIOVector local_qiov;
+    int ret;
+
+    NvmeCmd cmd = {
+        .opcode = NVME_CMD_DSM,
+        .nsid = cpu_to_le32(s->nsid),
+        .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
+        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
+    };
+
+    NVMeCoData data = {
+        .ctx = bdrv_get_aio_context(bs),
+        .ret = -EINPROGRESS,
+    };
+
+    if (!s->supports_discard) {
+        return -ENOTSUP;
+    }
+
+    assert(s->nr_queues > 1);
+
+    buf = qemu_try_blockalign0(bs, s->page_size);
+    if (!buf) {
+        return -ENOMEM;
+    }
+
+    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
+    buf->slba = cpu_to_le64(offset >> s->blkshift);
+    buf->cattr = 0;
+
+    qemu_iovec_init(&local_qiov, 1);
+    qemu_iovec_add(&local_qiov, buf, 4096);
+
+    req = nvme_get_free_req(ioq);
+    assert(req);
+
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (ret) {
+        req->busy = false;
+        goto out;
+    }
+
+    trace_nvme_dsm(s, offset, bytes);
+
+    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
+        qemu_coroutine_yield();
+    }
+
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (ret) {
+        goto out;
+    }
+
+    ret = data.ret;
+    trace_nvme_dsm_done(s, offset, bytes, ret);
+out:
+    qemu_iovec_destroy(&local_qiov);
+    qemu_vfree(buf);
+    return ret;
+
+}
+
+
 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue, Error **errp)
 {
@@ -1259,6 +1341,7 @@  static BlockDriver bdrv_nvme = {
     .bdrv_co_pwritev          = nvme_co_pwritev,
 
     .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
+    .bdrv_co_pdiscard         = nvme_co_pdiscard,
 
     .bdrv_co_flush_to_disk    = nvme_co_flush,
     .bdrv_reopen_prepare      = nvme_reopen_prepare,
diff --git a/block/trace-events b/block/trace-events
index 8209fbd0c7..7d1d48b502 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -153,6 +153,8 @@  nvme_write_zeros(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offs
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
 nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
 nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
+nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
+nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
 nvme_free_req_queue_wait(void *q) "q %p"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
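
[Note: for readers unfamiliar with Dataset Management, the patch issues a
single-range DSM command: cdw10 holds a 0-based range count (so 0 means one
range) and bit 2 of cdw11 is the Attribute - Deallocate (AD) flag. Below is
a minimal standalone sketch of how a byte range maps onto the range entry;
the struct layout mirrors QEMU's NvmeDsmRange and the block size is assumed
for illustration (the patch additionally byte-swaps the fields with
cpu_to_le32/le64):

  #include <inttypes.h>
  #include <stdio.h>

  /* Layout as in QEMU's NvmeDsmRange (include/block/nvme.h). */
  typedef struct {
      uint32_t cattr;  /* context attributes; the patch leaves this 0 */
      uint32_t nlb;    /* length of the range, in logical blocks */
      uint64_t slba;   /* starting LBA of the range */
  } DsmRange;

  int main(void)
  {
      unsigned blkshift = 9;     /* assumed: 512-byte logical blocks */
      int64_t offset = 1 << 20;  /* discard 64 KiB at the 1 MiB mark */
      int64_t bytes = 64 * 1024;

      DsmRange r = {
          .cattr = 0,
          .nlb = (uint32_t)(bytes >> blkshift),   /* 65536 / 512 = 128 */
          .slba = (uint64_t)(offset >> blkshift), /* 1048576 / 512 = 2048 */
      };

      printf("slba=%" PRIu64 " nlb=%" PRIu32 "\n", r.slba, r.nlb);
      return 0;
  }
]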