diff mbox series

block: simplify write-threshold and drop write notifiers

Message ID 20210421220950.105017-1-vsementsov@virtuozzo.com
State New
Headers show
Series block: simplify write-threshold and drop write notifiers | expand

Commit Message

Vladimir Sementsov-Ogievskiy April 21, 2021, 10:09 p.m. UTC
write-notifiers are used only for write-threshold. New code for such
purpose should create filters.

Let's handle write-threshold simply in generic code and drop write
notifiers at all.

Also move part of write-threshold API that is used only for testing to
the test.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---

I agree that this could be split into 2-3 parts and not combining
everything into one. But I'm tired now. I can send v2 if needed, so
consider it as RFC. Or take as is if you think it's not too much-in-one.

I also suggest this as a prepartion (and partly substitution) for
"[PATCH v2 0/8] Block layer thread-safety, continued"

 include/block/block_int.h         | 12 -----
 include/block/write-threshold.h   | 24 ---------
 block.c                           |  1 -
 block/io.c                        | 21 +++++---
 block/write-threshold.c           | 87 ++-----------------------------
 tests/unit/test-write-threshold.c | 38 ++++++++++++++
 6 files changed, 54 insertions(+), 129 deletions(-)

Comments

Emanuele Giuseppe Esposito April 22, 2021, 9:57 a.m. UTC | #1
On 22/04/2021 00:09, Vladimir Sementsov-Ogievskiy wrote:
> write-notifiers are used only for write-threshold. New code for such
> purpose should create filters.
> 
> Let's handle write-threshold simply in generic code and drop write
> notifiers at all.
> 
> Also move part of write-threshold API that is used only for testing to
> the test.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
> 
> I agree that this could be split into 2-3 parts and not combining
> everything into one. But I'm tired now. I can send v2 if needed, so
> consider it as RFC. Or take as is if you think it's not too much-in-one.

Thank you for this patch. Since I am reworking on v2, if you want I can 
also integrate this patch with mines and send everything together once I 
am done.

Emanuele

> 
> I also suggest this as a prepartion (and partly substitution) for
> "[PATCH v2 0/8] Block layer thread-safety, continued"
> 
>   include/block/block_int.h         | 12 -----
>   include/block/write-threshold.h   | 24 ---------
>   block.c                           |  1 -
>   block/io.c                        | 21 +++++---
>   block/write-threshold.c           | 87 ++-----------------------------
>   tests/unit/test-write-threshold.c | 38 ++++++++++++++
>   6 files changed, 54 insertions(+), 129 deletions(-)
> 
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 88e4111939..50af58af75 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -957,12 +957,8 @@ struct BlockDriverState {
>        */
>       int64_t total_sectors;
>   
> -    /* Callback before write request is processed */
> -    NotifierWithReturnList before_write_notifiers;
> -
>       /* threshold limit for writes, in bytes. "High water mark". */
>       uint64_t write_threshold_offset;
> -    NotifierWithReturn write_threshold_notifier;
>   
>       /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex.
>        * Reading from the list can be done with either the BQL or the
> @@ -1087,14 +1083,6 @@ void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
>   bool bdrv_backing_overridden(BlockDriverState *bs);
>   
>   
> -/**
> - * bdrv_add_before_write_notifier:
> - *
> - * Register a callback that is invoked before write requests are processed but
> - * after any throttling or waiting for overlapping requests.
> - */
> -void bdrv_add_before_write_notifier(BlockDriverState *bs,
> -                                    NotifierWithReturn *notifier);
>   
>   /**
>    * bdrv_add_aio_context_notifier:
> diff --git a/include/block/write-threshold.h b/include/block/write-threshold.h
> index c646f267a4..23e1bfc123 100644
> --- a/include/block/write-threshold.h
> +++ b/include/block/write-threshold.h
> @@ -35,28 +35,4 @@ void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes);
>    */
>   uint64_t bdrv_write_threshold_get(const BlockDriverState *bs);
>   
> -/*
> - * bdrv_write_threshold_is_set
> - *
> - * Tell if a write threshold is set for a given BDS.
> - */
> -bool bdrv_write_threshold_is_set(const BlockDriverState *bs);
> -
> -/*
> - * bdrv_write_threshold_exceeded
> - *
> - * Return the extent of a write request that exceeded the threshold,
> - * or zero if the request is below the threshold.
> - * Return zero also if the threshold was not set.
> - *
> - * NOTE: here we assume the following holds for each request this code
> - * deals with:
> - *
> - * assert((req->offset + req->bytes) <= UINT64_MAX)
> - *
> - * Please not there is *not* an actual C assert().
> - */
> -uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
> -                                       const BdrvTrackedRequest *req);
> -
>   #endif
> diff --git a/block.c b/block.c
> index c5b887cec1..001453105e 100644
> --- a/block.c
> +++ b/block.c
> @@ -381,7 +381,6 @@ BlockDriverState *bdrv_new(void)
>       for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
>           QLIST_INIT(&bs->op_blockers[i]);
>       }
> -    notifier_with_return_list_init(&bs->before_write_notifiers);
>       qemu_co_mutex_init(&bs->reqs_lock);
>       qemu_mutex_init(&bs->dirty_bitmap_mutex);
>       bs->refcnt = 1;
> diff --git a/block/io.c b/block/io.c
> index ca2dca3007..e0aa775952 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -36,6 +36,8 @@
>   #include "qemu/main-loop.h"
>   #include "sysemu/replay.h"
>   
> +#include "qapi/qapi-events-block-core.h"
> +
>   /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
>   #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
>   
> @@ -1974,6 +1976,8 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
>              child->perm & BLK_PERM_RESIZE);
>   
>       switch (req->type) {
> +        uint64_t write_threshold;
> +
>       case BDRV_TRACKED_WRITE:
>       case BDRV_TRACKED_DISCARD:
>           if (flags & BDRV_REQ_WRITE_UNCHANGED) {
> @@ -1981,8 +1985,15 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
>           } else {
>               assert(child->perm & BLK_PERM_WRITE);
>           }
> -        return notifier_with_return_list_notify(&bs->before_write_notifiers,
> -                                                req);
> +        write_threshold = qatomic_read(&bs->write_threshold_offset);
> +        if (write_threshold > 0 && offset + bytes > write_threshold) {
> +            qapi_event_send_block_write_threshold(
> +                bs->node_name,
> +                offset + bytes - write_threshold,
> +                write_threshold);
> +            qatomic_set(&bs->write_threshold_offset, 0);
> +        }
> +        return 0;
>       case BDRV_TRACKED_TRUNCATE:
>           assert(child->perm & BLK_PERM_RESIZE);
>           return 0;
> @@ -3137,12 +3148,6 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
>       return true;
>   }
>   
> -void bdrv_add_before_write_notifier(BlockDriverState *bs,
> -                                    NotifierWithReturn *notifier)
> -{
> -    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
> -}
> -
>   void bdrv_io_plug(BlockDriverState *bs)
>   {
>       BdrvChild *child;
> diff --git a/block/write-threshold.c b/block/write-threshold.c
> index 85b78dc2a9..9bf4287c6e 100644
> --- a/block/write-threshold.c
> +++ b/block/write-threshold.c
> @@ -21,104 +21,23 @@
>   
>   uint64_t bdrv_write_threshold_get(const BlockDriverState *bs)
>   {
> -    return bs->write_threshold_offset;
> -}
> -
> -bool bdrv_write_threshold_is_set(const BlockDriverState *bs)
> -{
> -    return bs->write_threshold_offset > 0;
> -}
> -
> -static void write_threshold_disable(BlockDriverState *bs)
> -{
> -    if (bdrv_write_threshold_is_set(bs)) {
> -        notifier_with_return_remove(&bs->write_threshold_notifier);
> -        bs->write_threshold_offset = 0;
> -    }
> -}
> -
> -uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
> -                                       const BdrvTrackedRequest *req)
> -{
> -    if (bdrv_write_threshold_is_set(bs)) {
> -        if (req->offset > bs->write_threshold_offset) {
> -            return (req->offset - bs->write_threshold_offset) + req->bytes;
> -        }
> -        if ((req->offset + req->bytes) > bs->write_threshold_offset) {
> -            return (req->offset + req->bytes) - bs->write_threshold_offset;
> -        }
> -    }
> -    return 0;
> -}
> -
> -static int coroutine_fn before_write_notify(NotifierWithReturn *notifier,
> -                                            void *opaque)
> -{
> -    BdrvTrackedRequest *req = opaque;
> -    BlockDriverState *bs = req->bs;
> -    uint64_t amount = 0;
> -
> -    amount = bdrv_write_threshold_exceeded(bs, req);
> -    if (amount > 0) {
> -        qapi_event_send_block_write_threshold(
> -            bs->node_name,
> -            amount,
> -            bs->write_threshold_offset);
> -
> -        /* autodisable to avoid flooding the monitor */
> -        write_threshold_disable(bs);
> -    }
> -
> -    return 0; /* should always let other notifiers run */
> -}
> -
> -static void write_threshold_register_notifier(BlockDriverState *bs)
> -{
> -    bs->write_threshold_notifier.notify = before_write_notify;
> -    bdrv_add_before_write_notifier(bs, &bs->write_threshold_notifier);
> -}
> -
> -static void write_threshold_update(BlockDriverState *bs,
> -                                   int64_t threshold_bytes)
> -{
> -    bs->write_threshold_offset = threshold_bytes;
> +    return qatomic_read(&bs->write_threshold_offset);
>   }
>   
>   void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes)
>   {
> -    if (bdrv_write_threshold_is_set(bs)) {
> -        if (threshold_bytes > 0) {
> -            write_threshold_update(bs, threshold_bytes);
> -        } else {
> -            write_threshold_disable(bs);
> -        }
> -    } else {
> -        if (threshold_bytes > 0) {
> -            /* avoid multiple registration */
> -            write_threshold_register_notifier(bs);
> -            write_threshold_update(bs, threshold_bytes);
> -        }
> -        /* discard bogus disable request */
> -    }
> +    qatomic_set(&bs->write_threshold_offset, threshold_bytes);
>   }
>   
>   void qmp_block_set_write_threshold(const char *node_name,
>                                      uint64_t threshold_bytes,
>                                      Error **errp)
>   {
> -    BlockDriverState *bs;
> -    AioContext *aio_context;
> -
> -    bs = bdrv_find_node(node_name);
> +    BlockDriverState *bs = bdrv_find_node(node_name);
>       if (!bs) {
>           error_setg(errp, "Device '%s' not found", node_name);
>           return;
>       }
>   
> -    aio_context = bdrv_get_aio_context(bs);
> -    aio_context_acquire(aio_context);
> -
>       bdrv_write_threshold_set(bs, threshold_bytes);
> -
> -    aio_context_release(aio_context);
>   }
> diff --git a/tests/unit/test-write-threshold.c b/tests/unit/test-write-threshold.c
> index fc1c45a2eb..c2f4cd20d7 100644
> --- a/tests/unit/test-write-threshold.c
> +++ b/tests/unit/test-write-threshold.c
> @@ -12,6 +12,44 @@
>   #include "block/write-threshold.h"
>   
>   
> +/*
> + * bdrv_write_threshold_is_set
> + *
> + * Tell if a write threshold is set for a given BDS.
> + */
> +static bool bdrv_write_threshold_is_set(const BlockDriverState *bs)
> +{
> +    return bs->write_threshold_offset > 0;
> +}
> +
> +/*
> + * bdrv_write_threshold_exceeded
> + *
> + * Return the extent of a write request that exceeded the threshold,
> + * or zero if the request is below the threshold.
> + * Return zero also if the threshold was not set.
> + *
> + * NOTE: here we assume the following holds for each request this code
> + * deals with:
> + *
> + * assert((req->offset + req->bytes) <= UINT64_MAX)
> + *
> + * Please not there is *not* an actual C assert().
> + */
> +static uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
> +                                              const BdrvTrackedRequest *req)
> +{
> +    if (bdrv_write_threshold_is_set(bs)) {
> +        if (req->offset > bs->write_threshold_offset) {
> +            return (req->offset - bs->write_threshold_offset) + req->bytes;
> +        }
> +        if ((req->offset + req->bytes) > bs->write_threshold_offset) {
> +            return (req->offset + req->bytes) - bs->write_threshold_offset;
> +        }
> +    }
> +    return 0;
> +}
> +
>   static void test_threshold_not_set_on_init(void)
>   {
>       uint64_t res;
>
Vladimir Sementsov-Ogievskiy April 22, 2021, 10:12 a.m. UTC | #2
22.04.2021 12:57, Emanuele Giuseppe Esposito wrote:
> 
> 
> On 22/04/2021 00:09, Vladimir Sementsov-Ogievskiy wrote:
>> write-notifiers are used only for write-threshold. New code for such
>> purpose should create filters.
>>
>> Let's handle write-threshold simply in generic code and drop write
>> notifiers at all.
>>
>> Also move part of write-threshold API that is used only for testing to
>> the test.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>
>> I agree that this could be split into 2-3 parts and not combining
>> everything into one. But I'm tired now. I can send v2 if needed, so
>> consider it as RFC. Or take as is if you think it's not too much-in-one.
> 
> Thank you for this patch. Since I am reworking on v2, if you want I can also integrate this patch with mines and send everything together once I am done.
> 
> Emanuele

I'd wait several days for comments. Maybe I'll have to resend v2 of this.

Also, after this patch, is something of your patches 1-4 needed? Probably you may just resend your series not touching write-threshold, and just note in cover-letter that write-threshold is covered by "[PATCH] block: simplify write-threshold and drop write notifiers"
Max Reitz April 30, 2021, 10:04 a.m. UTC | #3
On 22.04.21 00:09, Vladimir Sementsov-Ogievskiy wrote:
> write-notifiers are used only for write-threshold. New code for such
> purpose should create filters.
> 
> Let's handle write-threshold simply in generic code and drop write
> notifiers at all.
> 
> Also move part of write-threshold API that is used only for testing to
> the test.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
> 
> I agree that this could be split into 2-3 parts and not combining
> everything into one. But I'm tired now.

Er...  You should have put it off until the next day then? O:)

It should be multiple patches.  At least one to move the write threshold 
update to block/io.c, and then another to drop write notifiers.

> I can send v2 if needed, so
> consider it as RFC. Or take as is if you think it's not too much-in-one.
> 
> I also suggest this as a prepartion (and partly substitution) for
> "[PATCH v2 0/8] Block layer thread-safety, continued"
> 
>   include/block/block_int.h         | 12 -----
>   include/block/write-threshold.h   | 24 ---------
>   block.c                           |  1 -
>   block/io.c                        | 21 +++++---
>   block/write-threshold.c           | 87 ++-----------------------------
>   tests/unit/test-write-threshold.c | 38 ++++++++++++++
>   6 files changed, 54 insertions(+), 129 deletions(-)

[...]

> diff --git a/block/io.c b/block/io.c
> index ca2dca3007..e0aa775952 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -36,6 +36,8 @@
>   #include "qemu/main-loop.h"
>   #include "sysemu/replay.h"
>   
> +#include "qapi/qapi-events-block-core.h"
> +
>   /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
>   #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
>   
> @@ -1974,6 +1976,8 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
>              child->perm & BLK_PERM_RESIZE);
>   
>       switch (req->type) {
> +        uint64_t write_threshold;
> +

Works, but I think this is the first time I see a variable declared in a 
switch block.  What I usually do for such cases is to put a block after 
the label.  (i.e. case X: { uint64_t write_threshold; ... })

But it wouldn’t hurt to just declare this at the beginning of 
bdrv_co_write_req_prepare(), I think.

>       case BDRV_TRACKED_WRITE:
>       case BDRV_TRACKED_DISCARD:
>           if (flags & BDRV_REQ_WRITE_UNCHANGED) {
> @@ -1981,8 +1985,15 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
>           } else {
>               assert(child->perm & BLK_PERM_WRITE);
>           }
> -        return notifier_with_return_list_notify(&bs->before_write_notifiers,
> -                                                req);
> +        write_threshold = qatomic_read(&bs->write_threshold_offset);
> +        if (write_threshold > 0 && offset + bytes > write_threshold) {
> +            qapi_event_send_block_write_threshold(
> +                bs->node_name,
> +                offset + bytes - write_threshold,
> +                write_threshold);
> +            qatomic_set(&bs->write_threshold_offset, 0);
> +        }

I’d put all of this into a function in block/write-threshold.c that’s 
called from here.

Max

> +        return 0;
>       case BDRV_TRACKED_TRUNCATE:
>           assert(child->perm & BLK_PERM_RESIZE);
>           return 0;
> @@ -3137,12 +3148,6 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
>       return true;
>   }
>   
> -void bdrv_add_before_write_notifier(BlockDriverState *bs,
> -                                    NotifierWithReturn *notifier)
> -{
> -    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
> -}
> -
>   void bdrv_io_plug(BlockDriverState *bs)
>   {
>       BdrvChild *child;
Vladimir Sementsov-Ogievskiy May 3, 2021, 8:21 a.m. UTC | #4
30.04.2021 13:04, Max Reitz wrote:
> On 22.04.21 00:09, Vladimir Sementsov-Ogievskiy wrote:
>> write-notifiers are used only for write-threshold. New code for such
>> purpose should create filters.
>>
>> Let's handle write-threshold simply in generic code and drop write
>> notifiers at all.
>>
>> Also move part of write-threshold API that is used only for testing to
>> the test.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>
>> I agree that this could be split into 2-3 parts and not combining
>> everything into one. But I'm tired now.
> 
> Er...  You should have put it off until the next day then? O:)

Yes, sorry. I wanted to send that day... Don't remember why :) Checked now, that was not Friday.. I wanted to drop write notifiers long ago, and when I finally do it I couldn't wait, so cool it seemed to me :)

Thanks for comments, I'll send v2 soon.

> 
> It should be multiple patches.  At least one to move the write threshold update to block/io.c, and then another to drop write notifiers.
> 
>> I can send v2 if needed, so
>> consider it as RFC. Or take as is if you think it's not too much-in-one.
>>
>> I also suggest this as a prepartion (and partly substitution) for
>> "[PATCH v2 0/8] Block layer thread-safety, continued"
>>
>>   include/block/block_int.h         | 12 -----
>>   include/block/write-threshold.h   | 24 ---------
>>   block.c                           |  1 -
>>   block/io.c                        | 21 +++++---
>>   block/write-threshold.c           | 87 ++-----------------------------
>>   tests/unit/test-write-threshold.c | 38 ++++++++++++++
>>   6 files changed, 54 insertions(+), 129 deletions(-)
> 
> [...]
> 
>> diff --git a/block/io.c b/block/io.c
>> index ca2dca3007..e0aa775952 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -36,6 +36,8 @@
>>   #include "qemu/main-loop.h"
>>   #include "sysemu/replay.h"
>> +#include "qapi/qapi-events-block-core.h"
>> +
>>   /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
>>   #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
>> @@ -1974,6 +1976,8 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
>>              child->perm & BLK_PERM_RESIZE);
>>       switch (req->type) {
>> +        uint64_t write_threshold;
>> +
> 
> Works, but I think this is the first time I see a variable declared in a switch block.  What I usually do for such cases is to put a block after the label.  (i.e. case X: { uint64_t write_threshold; ... })
> 
> But it wouldn’t hurt to just declare this at the beginning of bdrv_co_write_req_prepare(), I think.
> 
>>       case BDRV_TRACKED_WRITE:
>>       case BDRV_TRACKED_DISCARD:
>>           if (flags & BDRV_REQ_WRITE_UNCHANGED) {
>> @@ -1981,8 +1985,15 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
>>           } else {
>>               assert(child->perm & BLK_PERM_WRITE);
>>           }
>> -        return notifier_with_return_list_notify(&bs->before_write_notifiers,
>> -                                                req);
>> +        write_threshold = qatomic_read(&bs->write_threshold_offset);
>> +        if (write_threshold > 0 && offset + bytes > write_threshold) {
>> +            qapi_event_send_block_write_threshold(
>> +                bs->node_name,
>> +                offset + bytes - write_threshold,
>> +                write_threshold);
>> +            qatomic_set(&bs->write_threshold_offset, 0);
>> +        }
> 
> I’d put all of this into a function in block/write-threshold.c that’s called from here.
> 
> Max
> 
>> +        return 0;
>>       case BDRV_TRACKED_TRUNCATE:
>>           assert(child->perm & BLK_PERM_RESIZE);
>>           return 0;
>> @@ -3137,12 +3148,6 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
>>       return true;
>>   }
>> -void bdrv_add_before_write_notifier(BlockDriverState *bs,
>> -                                    NotifierWithReturn *notifier)
>> -{
>> -    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
>> -}
>> -
>>   void bdrv_io_plug(BlockDriverState *bs)
>>   {
>>       BdrvChild *child;
>
Stefan Hajnoczi May 5, 2021, 10:10 a.m. UTC | #5
On Thu, Apr 22, 2021 at 01:09:50AM +0300, Vladimir Sementsov-Ogievskiy wrote:
> @@ -1981,8 +1985,15 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
>          } else {
>              assert(child->perm & BLK_PERM_WRITE);
>          }
> -        return notifier_with_return_list_notify(&bs->before_write_notifiers,
> -                                                req);
> +        write_threshold = qatomic_read(&bs->write_threshold_offset);
> +        if (write_threshold > 0 && offset + bytes > write_threshold) {
> +            qapi_event_send_block_write_threshold(
> +                bs->node_name,
> +                offset + bytes - write_threshold,
> +                write_threshold);
> +            qatomic_set(&bs->write_threshold_offset, 0);

It's safer to reset the threshold before emitting the event. That way
there is no race with the QMP client setting a new threshold via
bdrv_write_threshold_is_set(). I guess the race is possible since
qatomic is used and there is no lock.

I like the idea of dropping write notifiers:
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Vladimir Sementsov-Ogievskiy May 5, 2021, 10:17 a.m. UTC | #6
05.05.2021 13:10, Stefan Hajnoczi wrote:
> On Thu, Apr 22, 2021 at 01:09:50AM +0300, Vladimir Sementsov-Ogievskiy wrote:
>> @@ -1981,8 +1985,15 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
>>           } else {
>>               assert(child->perm & BLK_PERM_WRITE);
>>           }
>> -        return notifier_with_return_list_notify(&bs->before_write_notifiers,
>> -                                                req);
>> +        write_threshold = qatomic_read(&bs->write_threshold_offset);
>> +        if (write_threshold > 0 && offset + bytes > write_threshold) {
>> +            qapi_event_send_block_write_threshold(
>> +                bs->node_name,
>> +                offset + bytes - write_threshold,
>> +                write_threshold);
>> +            qatomic_set(&bs->write_threshold_offset, 0);
> 
> It's safer to reset the threshold before emitting the event. That way
> there is no race with the QMP client setting a new threshold via
> bdrv_write_threshold_is_set(). I guess the race is possible since
> qatomic is used and there is no lock.
> 
> I like the idea of dropping write notifiers:
> Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
> 

Sorry, this is an outdated patch, I've sent a v2:

  [PATCH v2 0/9] block: refactor write threshold
diff mbox series

Patch

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 88e4111939..50af58af75 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -957,12 +957,8 @@  struct BlockDriverState {
      */
     int64_t total_sectors;
 
-    /* Callback before write request is processed */
-    NotifierWithReturnList before_write_notifiers;
-
     /* threshold limit for writes, in bytes. "High water mark". */
     uint64_t write_threshold_offset;
-    NotifierWithReturn write_threshold_notifier;
 
     /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex.
      * Reading from the list can be done with either the BQL or the
@@ -1087,14 +1083,6 @@  void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
 bool bdrv_backing_overridden(BlockDriverState *bs);
 
 
-/**
- * bdrv_add_before_write_notifier:
- *
- * Register a callback that is invoked before write requests are processed but
- * after any throttling or waiting for overlapping requests.
- */
-void bdrv_add_before_write_notifier(BlockDriverState *bs,
-                                    NotifierWithReturn *notifier);
 
 /**
  * bdrv_add_aio_context_notifier:
diff --git a/include/block/write-threshold.h b/include/block/write-threshold.h
index c646f267a4..23e1bfc123 100644
--- a/include/block/write-threshold.h
+++ b/include/block/write-threshold.h
@@ -35,28 +35,4 @@  void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes);
  */
 uint64_t bdrv_write_threshold_get(const BlockDriverState *bs);
 
-/*
- * bdrv_write_threshold_is_set
- *
- * Tell if a write threshold is set for a given BDS.
- */
-bool bdrv_write_threshold_is_set(const BlockDriverState *bs);
-
-/*
- * bdrv_write_threshold_exceeded
- *
- * Return the extent of a write request that exceeded the threshold,
- * or zero if the request is below the threshold.
- * Return zero also if the threshold was not set.
- *
- * NOTE: here we assume the following holds for each request this code
- * deals with:
- *
- * assert((req->offset + req->bytes) <= UINT64_MAX)
- *
- * Please not there is *not* an actual C assert().
- */
-uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
-                                       const BdrvTrackedRequest *req);
-
 #endif
diff --git a/block.c b/block.c
index c5b887cec1..001453105e 100644
--- a/block.c
+++ b/block.c
@@ -381,7 +381,6 @@  BlockDriverState *bdrv_new(void)
     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
         QLIST_INIT(&bs->op_blockers[i]);
     }
-    notifier_with_return_list_init(&bs->before_write_notifiers);
     qemu_co_mutex_init(&bs->reqs_lock);
     qemu_mutex_init(&bs->dirty_bitmap_mutex);
     bs->refcnt = 1;
diff --git a/block/io.c b/block/io.c
index ca2dca3007..e0aa775952 100644
--- a/block/io.c
+++ b/block/io.c
@@ -36,6 +36,8 @@ 
 #include "qemu/main-loop.h"
 #include "sysemu/replay.h"
 
+#include "qapi/qapi-events-block-core.h"
+
 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
 
@@ -1974,6 +1976,8 @@  bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
            child->perm & BLK_PERM_RESIZE);
 
     switch (req->type) {
+        uint64_t write_threshold;
+
     case BDRV_TRACKED_WRITE:
     case BDRV_TRACKED_DISCARD:
         if (flags & BDRV_REQ_WRITE_UNCHANGED) {
@@ -1981,8 +1985,15 @@  bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
         } else {
             assert(child->perm & BLK_PERM_WRITE);
         }
-        return notifier_with_return_list_notify(&bs->before_write_notifiers,
-                                                req);
+        write_threshold = qatomic_read(&bs->write_threshold_offset);
+        if (write_threshold > 0 && offset + bytes > write_threshold) {
+            qapi_event_send_block_write_threshold(
+                bs->node_name,
+                offset + bytes - write_threshold,
+                write_threshold);
+            qatomic_set(&bs->write_threshold_offset, 0);
+        }
+        return 0;
     case BDRV_TRACKED_TRUNCATE:
         assert(child->perm & BLK_PERM_RESIZE);
         return 0;
@@ -3137,12 +3148,6 @@  bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
     return true;
 }
 
-void bdrv_add_before_write_notifier(BlockDriverState *bs,
-                                    NotifierWithReturn *notifier)
-{
-    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
-}
-
 void bdrv_io_plug(BlockDriverState *bs)
 {
     BdrvChild *child;
diff --git a/block/write-threshold.c b/block/write-threshold.c
index 85b78dc2a9..9bf4287c6e 100644
--- a/block/write-threshold.c
+++ b/block/write-threshold.c
@@ -21,104 +21,23 @@ 
 
 uint64_t bdrv_write_threshold_get(const BlockDriverState *bs)
 {
-    return bs->write_threshold_offset;
-}
-
-bool bdrv_write_threshold_is_set(const BlockDriverState *bs)
-{
-    return bs->write_threshold_offset > 0;
-}
-
-static void write_threshold_disable(BlockDriverState *bs)
-{
-    if (bdrv_write_threshold_is_set(bs)) {
-        notifier_with_return_remove(&bs->write_threshold_notifier);
-        bs->write_threshold_offset = 0;
-    }
-}
-
-uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
-                                       const BdrvTrackedRequest *req)
-{
-    if (bdrv_write_threshold_is_set(bs)) {
-        if (req->offset > bs->write_threshold_offset) {
-            return (req->offset - bs->write_threshold_offset) + req->bytes;
-        }
-        if ((req->offset + req->bytes) > bs->write_threshold_offset) {
-            return (req->offset + req->bytes) - bs->write_threshold_offset;
-        }
-    }
-    return 0;
-}
-
-static int coroutine_fn before_write_notify(NotifierWithReturn *notifier,
-                                            void *opaque)
-{
-    BdrvTrackedRequest *req = opaque;
-    BlockDriverState *bs = req->bs;
-    uint64_t amount = 0;
-
-    amount = bdrv_write_threshold_exceeded(bs, req);
-    if (amount > 0) {
-        qapi_event_send_block_write_threshold(
-            bs->node_name,
-            amount,
-            bs->write_threshold_offset);
-
-        /* autodisable to avoid flooding the monitor */
-        write_threshold_disable(bs);
-    }
-
-    return 0; /* should always let other notifiers run */
-}
-
-static void write_threshold_register_notifier(BlockDriverState *bs)
-{
-    bs->write_threshold_notifier.notify = before_write_notify;
-    bdrv_add_before_write_notifier(bs, &bs->write_threshold_notifier);
-}
-
-static void write_threshold_update(BlockDriverState *bs,
-                                   int64_t threshold_bytes)
-{
-    bs->write_threshold_offset = threshold_bytes;
+    return qatomic_read(&bs->write_threshold_offset);
 }
 
 void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes)
 {
-    if (bdrv_write_threshold_is_set(bs)) {
-        if (threshold_bytes > 0) {
-            write_threshold_update(bs, threshold_bytes);
-        } else {
-            write_threshold_disable(bs);
-        }
-    } else {
-        if (threshold_bytes > 0) {
-            /* avoid multiple registration */
-            write_threshold_register_notifier(bs);
-            write_threshold_update(bs, threshold_bytes);
-        }
-        /* discard bogus disable request */
-    }
+    qatomic_set(&bs->write_threshold_offset, threshold_bytes);
 }
 
 void qmp_block_set_write_threshold(const char *node_name,
                                    uint64_t threshold_bytes,
                                    Error **errp)
 {
-    BlockDriverState *bs;
-    AioContext *aio_context;
-
-    bs = bdrv_find_node(node_name);
+    BlockDriverState *bs = bdrv_find_node(node_name);
     if (!bs) {
         error_setg(errp, "Device '%s' not found", node_name);
         return;
     }
 
-    aio_context = bdrv_get_aio_context(bs);
-    aio_context_acquire(aio_context);
-
     bdrv_write_threshold_set(bs, threshold_bytes);
-
-    aio_context_release(aio_context);
 }
diff --git a/tests/unit/test-write-threshold.c b/tests/unit/test-write-threshold.c
index fc1c45a2eb..c2f4cd20d7 100644
--- a/tests/unit/test-write-threshold.c
+++ b/tests/unit/test-write-threshold.c
@@ -12,6 +12,44 @@ 
 #include "block/write-threshold.h"
 
 
+/*
+ * bdrv_write_threshold_is_set
+ *
+ * Tell if a write threshold is set for a given BDS.
+ */
+static bool bdrv_write_threshold_is_set(const BlockDriverState *bs)
+{
+    return bs->write_threshold_offset > 0;
+}
+
+/*
+ * bdrv_write_threshold_exceeded
+ *
+ * Return the extent of a write request that exceeded the threshold,
+ * or zero if the request is below the threshold.
+ * Return zero also if the threshold was not set.
+ *
+ * NOTE: here we assume the following holds for each request this code
+ * deals with:
+ *
+ * assert((req->offset + req->bytes) <= UINT64_MAX)
+ *
+ * Please not there is *not* an actual C assert().
+ */
+static uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
+                                              const BdrvTrackedRequest *req)
+{
+    if (bdrv_write_threshold_is_set(bs)) {
+        if (req->offset > bs->write_threshold_offset) {
+            return (req->offset - bs->write_threshold_offset) + req->bytes;
+        }
+        if ((req->offset + req->bytes) > bs->write_threshold_offset) {
+            return (req->offset + req->bytes) - bs->write_threshold_offset;
+        }
+    }
+    return 0;
+}
+
 static void test_threshold_not_set_on_init(void)
 {
     uint64_t res;