diff mbox

[v3,03/23] block: Connect BlockBackend to BlockDriverState

Message ID 1410891148-28849-4-git-send-email-armbru@redhat.com
State New
Headers show

Commit Message

Markus Armbruster Sept. 16, 2014, 6:12 p.m. UTC
The pointer from BlockBackend to BlockDriverState is a strong
reference, managed with bdrv_ref() / bdrv_unref(), the back-pointer is
a weak one.

Convenience function blk_new_with_bs() creates a BlockBackend with its
BlockDriverState.  Callers have to unref both.  The commit after next
will relieve them of the need to unref the BlockDriverState.

Complication: due to the silly way drive_del works, we need a way to
hide a BlockBackend, just like bdrv_make_anon().  To emphasize its
"special" status, give the function a suitably off-putting name:
blk_hide_on_behalf_of_do_drive_del().  Unfortunately, hiding turns the
BlockBackend's name into the empty string.  Can't avoid that without
breaking the blk->bs->device_name equals blk->name invariant.

The patch adds a memory leak: drive_del while a device model is
connected leaks the BlockBackend.  Avoiding the leak here is rather
hairy, but it'll become straightforward in a few commits, so I mark it
FIXME in the code now, and plug it when it's easy.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
---
 block.c                        |  10 ++--
 block/block-backend.c          |  71 ++++++++++++++++++++++-
 blockdev.c                     |  21 ++++---
 hw/block/xen_disk.c            |   8 +--
 include/block/block_int.h      |   2 +
 include/sysemu/block-backend.h |   5 ++
 qemu-img.c                     | 125 +++++++++++++++++++----------------------
 qemu-io.c                      |   4 +-
 qemu-nbd.c                     |   4 +-
 9 files changed, 156 insertions(+), 94 deletions(-)

Comments

Max Reitz Sept. 20, 2014, 7:08 p.m. UTC | #1
On 16.09.2014 20:12, Markus Armbruster wrote:
> The pointer from BlockBackend to BlockDriverState is a strong
> reference, managed with bdrv_ref() / bdrv_unref(), the back-pointer is
> a weak one.
>
> Convenience function blk_new_with_bs() creates a BlockBackend with its
> BlockDriverState.  Callers have to unref both.  The commit after next
> will relieve them of the need to unref the BlockDriverState.
>
> Complication: due to the silly way drive_del works, we need a way to
> hide a BlockBackend, just like bdrv_make_anon().  To emphasize its
> "special" status, give the function a suitably off-putting name:
> blk_hide_on_behalf_of_do_drive_del().  Unfortunately, hiding turns the
> BlockBackend's name into the empty string.  Can't avoid that without
> breaking the blk->bs->device_name equals blk->name invariant.
>
> The patch adds a memory leak: drive_del while a device model is
> connected leaks the BlockBackend.  Avoiding the leak here is rather
> hairy, but it'll become straightforward in a few commits, so I mark it
> FIXME in the code now, and plug it when it's easy.

Good.

> Signed-off-by: Markus Armbruster <armbru@redhat.com>
> ---
>   block.c                        |  10 ++--
>   block/block-backend.c          |  71 ++++++++++++++++++++++-
>   blockdev.c                     |  21 ++++---
>   hw/block/xen_disk.c            |   8 +--
>   include/block/block_int.h      |   2 +
>   include/sysemu/block-backend.h |   5 ++
>   qemu-img.c                     | 125 +++++++++++++++++++----------------------
>   qemu-io.c                      |   4 +-
>   qemu-nbd.c                     |   4 +-
>   9 files changed, 156 insertions(+), 94 deletions(-)

Reviewed-by: Max Reitz <mreitz@redhat.com>
Kevin Wolf Sept. 22, 2014, 2:59 p.m. UTC | #2
Am 16.09.2014 um 20:12 hat Markus Armbruster geschrieben:
> The pointer from BlockBackend to BlockDriverState is a strong
> reference, managed with bdrv_ref() / bdrv_unref(), the back-pointer is
> a weak one.
> 
> Convenience function blk_new_with_bs() creates a BlockBackend with its
> BlockDriverState.  Callers have to unref both.  The commit after next
> will relieve them of the need to unref the BlockDriverState.
> 
> Complication: due to the silly way drive_del works, we need a way to
> hide a BlockBackend, just like bdrv_make_anon().  To emphasize its
> "special" status, give the function a suitably off-putting name:
> blk_hide_on_behalf_of_do_drive_del().  Unfortunately, hiding turns the
> BlockBackend's name into the empty string.  Can't avoid that without
> breaking the blk->bs->device_name equals blk->name invariant.
> 
> The patch adds a memory leak: drive_del while a device model is
> connected leaks the BlockBackend.  Avoiding the leak here is rather
> hairy, but it'll become straightforward in a few commits, so I mark it
> FIXME in the code now, and plug it when it's easy.
> 
> Signed-off-by: Markus Armbruster <armbru@redhat.com>

> +/*
> + * Hide @blk.
> + * @blk must not have been hidden already.
> + * Make attached BlockDriverState, if any, anonymous.
> + * Once hidden, @blk is invisible to all functions that don't receive
> + * it as argument.  For example, blk_by_name() won't return it.
> + * Strictly for use by do_drive_del().
> + * TODO get rid of it!
> + */
> +void blk_hide_on_behalf_of_do_drive_del(BlockBackend *blk)
> +{
> +    QTAILQ_REMOVE(&blk_backends, blk, link);
> +    blk->name[0] = 0;

Style nit: I prefer '\0' when dealing with strings.

> +    if (blk->bs) {
> +        bdrv_make_anon(blk->bs);
> +    }
> +}
> diff --git a/blockdev.c b/blockdev.c
> index 583235a..5da6028 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -228,6 +228,7 @@ void drive_info_del(DriveInfo *dinfo)
>      if (dinfo->opts) {
>          qemu_opts_del(dinfo->opts);
>      }
> +
>      g_free(dinfo->id);
>      QTAILQ_REMOVE(&drives, dinfo, next);
>      g_free(dinfo->serial);

This hunk is a rebasing artifact, I guess?

> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 8d86a6c..14e0b7c 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -324,6 +324,8 @@ struct BlockDriverState {
>      BlockDriver *drv; /* NULL means no media */
>      void *opaque;
>  
> +    BlockBackend *blk;          /* owning backend, if any */
> +
>      void *dev;                  /* attached device model, if any */
>      /* TODO change to DeviceState when all users are qdevified */
>      const BlockDevOps *dev_ops;

Just to make sure that we agree on where we're going: This makes the
assumption that a BDS has at most one BB that owns it. Which is not the
final state that we want to have, so this will have to go away later.
(Where "later" isn't necessarily part of this series.)

For now, the use of the field is limited to callbacks and
bdrv_get_device_name(). Callbacks could always only serve a single
device, so nothing became worse here.

I'm not entirely sure about bdrv_get_device_name(), whether it needs to
go or to be rewritten to get the name of any BB pointing to it (I
suspect for most callers we want to replace it by something that uses
node-name by default if there is one and only fall back to BB names if
there isn't), but that's not an issue to block this patch.

What I would consider, however, is adding a TODO comment that tells
people that this field needs to go and if you need to use it, something
is wrong with your design (which happens to be true for the existing
design of some code).


Nothing critical in this patch, so with or without addressing the
comments:

Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Markus Armbruster Sept. 22, 2014, 4:34 p.m. UTC | #3
Kevin Wolf <kwolf@redhat.com> writes:

> Am 16.09.2014 um 20:12 hat Markus Armbruster geschrieben:
>> The pointer from BlockBackend to BlockDriverState is a strong
>> reference, managed with bdrv_ref() / bdrv_unref(), the back-pointer is
>> a weak one.
>> 
>> Convenience function blk_new_with_bs() creates a BlockBackend with its
>> BlockDriverState.  Callers have to unref both.  The commit after next
>> will relieve them of the need to unref the BlockDriverState.
>> 
>> Complication: due to the silly way drive_del works, we need a way to
>> hide a BlockBackend, just like bdrv_make_anon().  To emphasize its
>> "special" status, give the function a suitably off-putting name:
>> blk_hide_on_behalf_of_do_drive_del().  Unfortunately, hiding turns the
>> BlockBackend's name into the empty string.  Can't avoid that without
>> breaking the blk->bs->device_name equals blk->name invariant.
>> 
>> The patch adds a memory leak: drive_del while a device model is
>> connected leaks the BlockBackend.  Avoiding the leak here is rather
>> hairy, but it'll become straightforward in a few commits, so I mark it
>> FIXME in the code now, and plug it when it's easy.
>> 
>> Signed-off-by: Markus Armbruster <armbru@redhat.com>
>
>> +/*
>> + * Hide @blk.
>> + * @blk must not have been hidden already.
>> + * Make attached BlockDriverState, if any, anonymous.
>> + * Once hidden, @blk is invisible to all functions that don't receive
>> + * it as argument.  For example, blk_by_name() won't return it.
>> + * Strictly for use by do_drive_del().
>> + * TODO get rid of it!
>> + */
>> +void blk_hide_on_behalf_of_do_drive_del(BlockBackend *blk)
>> +{
>> +    QTAILQ_REMOVE(&blk_backends, blk, link);
>> +    blk->name[0] = 0;
>
> Style nit: I prefer '\0' when dealing with strings.

I don't, but if you feel strongly about it, I'll do it your way.

>> +    if (blk->bs) {
>> +        bdrv_make_anon(blk->bs);
>> +    }
>> +}
>> diff --git a/blockdev.c b/blockdev.c
>> index 583235a..5da6028 100644
>> --- a/blockdev.c
>> +++ b/blockdev.c
>> @@ -228,6 +228,7 @@ void drive_info_del(DriveInfo *dinfo)
>>      if (dinfo->opts) {
>>          qemu_opts_del(dinfo->opts);
>>      }
>> +
>>      g_free(dinfo->id);
>>      QTAILQ_REMOVE(&drives, dinfo, next);
>>      g_free(dinfo->serial);
>
> This hunk is a rebasing artifact, I guess?

Consider it gone.

>> diff --git a/include/block/block_int.h b/include/block/block_int.h
>> index 8d86a6c..14e0b7c 100644
>> --- a/include/block/block_int.h
>> +++ b/include/block/block_int.h
>> @@ -324,6 +324,8 @@ struct BlockDriverState {
>>      BlockDriver *drv; /* NULL means no media */
>>      void *opaque;
>>  
>> +    BlockBackend *blk;          /* owning backend, if any */
>> +
>>      void *dev;                  /* attached device model, if any */
>>      /* TODO change to DeviceState when all users are qdevified */
>>      const BlockDevOps *dev_ops;
>
> Just to make sure that we agree on where we're going: This makes the
> assumption that a BDS has at most one BB that owns it.

Yes.

>                                                        Which is not the
> final state that we want to have, so this will have to go away later.

I don't know.  Can you explain why you think we're going to want
multiple BBs?

> (Where "later" isn't necessarily part of this series.)
>
> For now, the use of the field is limited to callbacks and
> bdrv_get_device_name(). Callbacks could always only serve a single
> device, so nothing became worse here.

In *this* patch, member blk is only read in bdrv_swap(), which asserts
it's null.  Later on in the series, it gets indeed used as you describe.

PATCH 22 puts it to use for BlockDevOps callbacks.  The patch moves the
callbacks from BDS to BB.  I hope you'll agree that's where they belong.

Naturally, the *calls* of the callbacks remain where they are, in
block.c.  They get updated like this:

-       bdrv_dev_FOO(bs, ARGS)
+       if (bs->blk) {
+           blk_dev_FOO(bs->blk ARGS)
+       }

PATCH 08 uses it to eliminate BDS member device_name[].

> I'm not entirely sure about bdrv_get_device_name(), whether it needs to
> go or to be rewritten to get the name of any BB pointing to it (I
> suspect for most callers we want to replace it by something that uses
> node-name by default if there is one and only fall back to BB names if
> there isn't), but that's not an issue to block this patch.

I agree users of bdrv_get_device_name() need to be examined, and the
ones that really want a BDS name should probably be changed to use the
BDS name (a.k.a. node-name) and fall back to the BB name.

This series makes this need more visible, by emphasizing the
distinctness of the two names.

Aside: which one to fall back to if we have multiple BBs?

> What I would consider, however, is adding a TODO comment that tells
> people that this field needs to go and if you need to use it, something
> is wrong with your design (which happens to be true for the existing
> design of some code).

For the device callbacks, we need a way to find the BB.  If multiple BBs
can sit on top of the same BDS, we need to find the one with a device
model attached.  Or even the ones, if we permit that.

Let's discuss this a bit, and depending on what we learn, add a suitable
comment.  Possibly on top.

> Nothing critical in this patch, so with or without addressing the
> comments:
>
> Reviewed-by: Kevin Wolf <kwolf@redhat.com>

Thanks!
Kevin Wolf Sept. 23, 2014, 11:45 a.m. UTC | #4
Am 22.09.2014 um 18:34 hat Markus Armbruster geschrieben:
> Kevin Wolf <kwolf@redhat.com> writes:
> 
> > Am 16.09.2014 um 20:12 hat Markus Armbruster geschrieben:
> >> diff --git a/include/block/block_int.h b/include/block/block_int.h
> >> index 8d86a6c..14e0b7c 100644
> >> --- a/include/block/block_int.h
> >> +++ b/include/block/block_int.h
> >> @@ -324,6 +324,8 @@ struct BlockDriverState {
> >>      BlockDriver *drv; /* NULL means no media */
> >>      void *opaque;
> >>  
> >> +    BlockBackend *blk;          /* owning backend, if any */
> >> +
> >>      void *dev;                  /* attached device model, if any */
> >>      /* TODO change to DeviceState when all users are qdevified */
> >>      const BlockDevOps *dev_ops;
> >
> > Just to make sure that we agree on where we're going: This makes the
> > assumption that a BDS has at most one BB that owns it.
> 
> Yes.
> 
> >                                                        Which is not the
> > final state that we want to have, so this will have to go away later.
> 
> I don't know.  Can you explain why you think we're going to want
> multiple BBs?

We already agreed that we'll have multiple parents for a BDS, for
scenarios like having an NBD server on a snapshot or sharing backing
files, potentially also some block jobs.

The question is whether among these multiple parents we want to have a
limitation to one BlockBackend, forbidding e.g. an NBD server on the
active layer. This would be a problem for live storage migration if we
don't want the NBD server to reuse the same BB as the guest device.

More generally, if we can indirectly have multiple BBs on a single
BDS by putting a filter in between, do we have good reasons to forbid
having them attached directly?

> > (Where "later" isn't necessarily part of this series.)
> >
> > For now, the use of the field is limited to callbacks and
> > bdrv_get_device_name(). Callbacks could always only serve a single
> > device, so nothing became worse here.
> 
> In *this* patch, member blk is only read in bdrv_swap(), which asserts
> it's null.  Later on in the series, it gets indeed used as you describe.

Yes, my "now" depends on context and either refers to the patch I'm
commenting on or the end of the series. In most cases when I see
something that I feel is worth a closer look, the first thing I
do is look at the fully applied series.

> PATCH 22 puts it to use for BlockDevOps callbacks.  The patch moves the
> callbacks from BDS to BB.  I hope you'll agree that's where they belong.
> 
> Naturally, the *calls* of the callbacks remain where they are, in
> block.c.  They get updated like this:
> 
> -       bdrv_dev_FOO(bs, ARGS)
> +       if (bs->blk) {
> +           blk_dev_FOO(bs->blk ARGS)
> +       }

Yes, as I said, this is fine for now. When we allow multiple BBs, we'll
have to turn it into something like notifier lists, but that can wait.

> PATCH 08 uses it to eliminate BDS member device_name[].
> 
> > I'm not entirely sure about bdrv_get_device_name(), whether it needs to
> > go or to be rewritten to get the name of any BB pointing to it (I
> > suspect for most callers we want to replace it by something that uses
> > node-name by default if there is one and only fall back to BB names if
> > there isn't), but that's not an issue to block this patch.
> 
> I agree users of bdrv_get_device_name() need to be examined, and the
> ones that really want a BDS name should probably be changed to use the
> BDS name (a.k.a. node-name) and fall back to the BB name.
> 
> This series makes this need more visible, by emphasizing the
> distinctness of the two names.
> 
> Aside: which one to fall back to if we have multiple BBs?

My first attempt would be "any", and in cases where this isn't good
enough, you can't use a fallback at all.

> > What I would consider, however, is adding a TODO comment that tells
> > people that this field needs to go and if you need to use it, something
> > is wrong with your design (which happens to be true for the existing
> > design of some code).
> 
> For the device callbacks, we need a way to find the BB.  If multiple BBs
> can sit on top of the same BDS, we need to find the one with a device
> models attached.  Ot even the ones, if we permit that.
> 
> Let's discuss this a bit, and depending on what we learn, add a suitable
> comment.  Possibly on top.

Are you sure that nothing else than device models can be interested in
callbacks? I expect that whatever block layer user we have, they will
always be interested in resizes, for example. Media change might also
not be entirely uninteresting, though in most cases what other users
want is probably a blocker.

Kevin
Markus Armbruster Sept. 23, 2014, 12:52 p.m. UTC | #5
Kevin Wolf <kwolf@redhat.com> writes:

> Am 22.09.2014 um 18:34 hat Markus Armbruster geschrieben:
>> Kevin Wolf <kwolf@redhat.com> writes:
>> 
>> > Am 16.09.2014 um 20:12 hat Markus Armbruster geschrieben:
>> >> diff --git a/include/block/block_int.h b/include/block/block_int.h
>> >> index 8d86a6c..14e0b7c 100644
>> >> --- a/include/block/block_int.h
>> >> +++ b/include/block/block_int.h
>> >> @@ -324,6 +324,8 @@ struct BlockDriverState {
>> >>      BlockDriver *drv; /* NULL means no media */
>> >>      void *opaque;
>> >>  
>> >> +    BlockBackend *blk;          /* owning backend, if any */
>> >> +
>> >>      void *dev;                  /* attached device model, if any */
>> >>      /* TODO change to DeviceState when all users are qdevified */
>> >>      const BlockDevOps *dev_ops;
>> >
>> > Just to make sure that we agree on where we're going: This makes the
>> > assumption that a BDS has at most one BB that owns it.
>> 
>> Yes.
>> 
>> >                                                        Which is not the
>> > final state that we want to have, so this will have to go away later.
>> 
>> I don't know.  Can you explain why you think we're going to want
>> multiple BBs?
>
> We already agreed that we'll have multiple parents for a BDS, for
> scenarios like having an NBD server on a snapshot or sharing backing
> files, potentially also some block jobs.

We certainly want to provide for multiple "users" (intentionally vague
language here), such as NBD server, block jobs, device models.  Should they
share a BB, or does each one need its own BB?

> The question is whether among these multiple parents we want to have a
> limitation to one BlockBackend, forbidding e.g. an NBD server on the
> active layer. This would be a problem for live storage migration if we
> don't want the NBD server to reuse the same BB as the guest device.
>
> More generally, if we can indirectly have multiple BBs on a single
> BDS by putting a filter in between, do we have good reasons to forbid
> having them attached directly?

Keeping code simple?

Not a valid argument when we *need* multiple BBs, i.e. when the answer
to my prior question is "each one needs its own BB".

>> > (Where "later" isn't necessarily part of this series.)
>> >
>> > For now, the use of the field is limited to callbacks and
>> > bdrv_get_device_name(). Callbacks could always only serve a single
>> > device, so nothing became worse here.
>> 
>> In *this* patch, member blk is only read in bdrv_swap(), which asserts
>> it's null.  Later on in the series, it gets indeed used as you describe.
>
> Yes, my "now" depends on context and either refers to the patch I'm
> commenting on or the end of the series. In most cases when I see
> something that I feel is worth having a closer look, the first thing I
> do is looking at the fully applied series.
>
>> PATCH 22 puts it to use for BlockDevOps callbacks.  The patch moves the
>> callbacks from BDS to BB.  I hope you'll agree that's where they belong.
>> 
>> Naturally, the *calls* of the callbacks remain where they are, in
>> block.c.  They get updated like this:
>> 
>> -       bdrv_dev_FOO(bs, ARGS)
>> +       if (bs->blk) {
>> +           blk_dev_FOO(bs->blk ARGS)
>> +       }
>
> Yes, as I said, this is fine for now. When we allow multiple BBs, we'll
> have to turn it into something like notifier lists, but that can wait.

Okay.

>> PATCH 08 uses it to eliminate BDS member device_name[].
>> 
>> > I'm not entirely sure about bdrv_get_device_name(), whether it needs to
>> > go or to be rewritten to get the name of any BB pointing to it (I
>> > suspect for most callers we want to replace it by something that uses
>> > node-name by default if there is one and only fall back to BB names if
>> > there isn't), but that's not an issue to block this patch.
>> 
>> I agree users of bdrv_get_device_name() need to be examined, and the
>> ones that really want a BDS name should probably be changed to use the
>> BDS name (a.k.a. node-name) and fall back to the BB name.
>> 
>> This series makes this need more visible, by emphasizing the
>> distinctness of the two names.
>> 
>> Aside: which one to fall back to if we have multiple BBs?
>
> My first attempt would be "any", and in cases where this isn't good
> enough, you can't use a fallback at all.

This is going to be fun :)

>> > What I would consider, however, is adding a TODO comment that tells
>> > people that this field needs to go and if you need to use it, something
>> > is wrong with your design (which happens to be true for the existing
>> > design of some code).
>> 
>> For the device callbacks, we need a way to find the BB.  If multiple BBs
>> can sit on top of the same BDS, we need to find the one with a device
>> models attached.  Ot even the ones, if we permit that.
>> 
>> Let's discuss this a bit, and depending on what we learn, add a suitable
>> comment.  Possibly on top.
>
> Are you sure that nothing else than device models can be interested in
> callbacks? I expect that whatever block layer user we have, they will
> always be interested in resizes, for example. Media change might also
> not be entirely uninteresting, though in most cases what other users
> want is probably a blocker.

I designed BlockDevOps for device models only.  If other users emerge,
it needs a rename, and possibly a rethink.
Kevin Wolf Sept. 23, 2014, 1:36 p.m. UTC | #6
Am 23.09.2014 um 14:52 hat Markus Armbruster geschrieben:
> Kevin Wolf <kwolf@redhat.com> writes:
> 
> > Am 22.09.2014 um 18:34 hat Markus Armbruster geschrieben:
> >> Kevin Wolf <kwolf@redhat.com> writes:
> >> 
> >> > Am 16.09.2014 um 20:12 hat Markus Armbruster geschrieben:
> >> >> diff --git a/include/block/block_int.h b/include/block/block_int.h
> >> >> index 8d86a6c..14e0b7c 100644
> >> >> --- a/include/block/block_int.h
> >> >> +++ b/include/block/block_int.h
> >> >> @@ -324,6 +324,8 @@ struct BlockDriverState {
> >> >>      BlockDriver *drv; /* NULL means no media */
> >> >>      void *opaque;
> >> >>  
> >> >> +    BlockBackend *blk;          /* owning backend, if any */
> >> >> +
> >> >>      void *dev;                  /* attached device model, if any */
> >> >>      /* TODO change to DeviceState when all users are qdevified */
> >> >>      const BlockDevOps *dev_ops;
> >> >
> >> > Just to make sure that we agree on where we're going: This makes the
> >> > assumption that a BDS has at most one BB that owns it.
> >> 
> >> Yes.
> >> 
> >> >                                                        Which is not the
> >> > final state that we want to have, so this will have to go away later.
> >> 
> >> I don't know.  Can you explain why you think we're going to want
> >> multiple BBs?
> >
> > We already agreed that we'll have multiple parents for a BDS, for
> > scenarios like having an NBD server on a snapshot or sharing backing
> > files, potentially also some block jobs.
> 
> We certainly want to provide for multiple "users" (intentionally vague
> language here), such NBD server, block jobs, device models.  Should they
> share a BB, or does each one need its own BB?

I think they should have their own BB, but I can still be convinced
otherwise.

The first reason is that frontends (= BB users, more or less) and
backends are easiest to understand when they come in pairs. Having
multiple frontends for a single backend might be confusing.

Second, if the NBD server doesn't sit at the root but accesses a
backing file, it already has to get its own BB with its own name and
with no device model attached.  Doing the same at the root helps with
consistency.

Third, we'll probably want to have some things like werror/rerror or I/O
accounting handled separately for device models and NBD servers.

If we look at another type of users, we might easily find more reasons,
but for me this is already a pretty strong indicator that shared BBs are
probably not a good idea.

> > The question is whether among these multiple parents we want to have a
> > limitation to one BlockBackend, forbidding e.g. an NBD server on the
> > active layer. This would be a problem for live storage migration if we
> > don't want the NBD server to reuse the same BB as the guest device.
> >
> > More generally, if we can indirectly have multiple BBs on a single
> > BDS by putting a filter in between, do we have good reasons to forbid
> > having them attached directly?
> 
> Keeping code simple?
> 
> Not a valid argument when we *need* multiple BBs, i.e. when the answer
> to my prior question is "each one needs its own BB".

Are there more places than just the callbacks that would be complicated
by multiple BBs per BDS?

> >> > What I would consider, however, is adding a TODO comment that tells
> >> > people that this field needs to go and if you need to use it, something
> >> > is wrong with your design (which happens to be true for the existing
> >> > design of some code).
> >> 
> >> For the device callbacks, we need a way to find the BB.  If multiple BBs
> >> can sit on top of the same BDS, we need to find the one with a device
> >> models attached.  Ot even the ones, if we permit that.
> >> 
> >> Let's discuss this a bit, and depending on what we learn, add a suitable
> >> comment.  Possibly on top.
> >
> > Are you sure that nothing else than device models can be interested in
> > callbacks? I expect that whatever block layer user we have, they will
> > always be interested in resizes, for example. Media change might also
> > not be entirely uninteresting, though in most cases what other users
> > want is probably a blocker.
> 
> I designed BlockDevOps for device models only.  If other users emerge,
> it needs a rename, and possibly a rethink.

Very likely to happen in the long run. Block jobs are today blocking
resizes, but that's mostly because they don't have an easy way to
respond to a resize.  Sooner or later someone will want to grow their
images while they are being mirrored (which is a completely reasonable,
even if not trivial, thing to want).

Do we have a KVM Forum block layer agenda yet? I think this thread could
already contain a few topics to discuss there.

Kevin
Markus Armbruster Sept. 23, 2014, 3:29 p.m. UTC | #7
Kevin Wolf <kwolf@redhat.com> writes:

> Am 23.09.2014 um 14:52 hat Markus Armbruster geschrieben:
>> Kevin Wolf <kwolf@redhat.com> writes:
>> 
>> > Am 22.09.2014 um 18:34 hat Markus Armbruster geschrieben:
>> >> Kevin Wolf <kwolf@redhat.com> writes:
>> >> 
>> >> > Am 16.09.2014 um 20:12 hat Markus Armbruster geschrieben:
>> >> >> diff --git a/include/block/block_int.h b/include/block/block_int.h
>> >> >> index 8d86a6c..14e0b7c 100644
>> >> >> --- a/include/block/block_int.h
>> >> >> +++ b/include/block/block_int.h
>> >> >> @@ -324,6 +324,8 @@ struct BlockDriverState {
>> >> >>      BlockDriver *drv; /* NULL means no media */
>> >> >>      void *opaque;
>> >> >>  
>> >> >> +    BlockBackend *blk;          /* owning backend, if any */
>> >> >> +
>> >> >>      void *dev;                  /* attached device model, if any */
>> >> >>      /* TODO change to DeviceState when all users are qdevified */
>> >> >>      const BlockDevOps *dev_ops;
>> >> >
>> >> > Just to make sure that we agree on where we're going: This makes the
>> >> > assumption that a BDS has at most one BB that owns it.
>> >> 
>> >> Yes.
>> >> 
>> >> >                                                        Which is not the
>> >> > final state that we want to have, so this will have to go away later.
>> >> 
>> >> I don't know.  Can you explain why you think we're going to want
>> >> multiple BBs?
>> >
>> > We already agreed that we'll have multiple parents for a BDS, for
>> > scenarios like having an NBD server on a snapshot or sharing backing
>> > files, potentially also some block jobs.
>> 
>> We certainly want to provide for multiple "users" (intentionally vague
>> language here), such NBD server, block jobs, device models.  Should they
>> share a BB, or does each one need its own BB?
>
> I think they should have their own BB, but I can still be convinced
> otherwise.
>
> The first reason is that frontends (= BB users, more or less) and
> backends are easiest to understand when they come in pairs. Having
> multiple frontends for a single backend might be confusion.
>
> Second, if the NBD server doesn't sit at the root but accesses a
> backing file, it already has to get its own BB with its own name and
> with no device model attached.  Doing the same at the root helps with
> consistency.

These are valid, but fairly weak.

> Third, we'll probably want to have some things like werror/rerror or I/O
> accounting handled separately for device models and NBD servers.

This one's pretty convincing.

> If we look at another type of users, we might easily find more reasons,
> but for me this is already a pretty strong indicator that shared BBs are
> probably not a good idea.
>
>> > The question is whether among these multiple parents we want to have a
>> > limitation to one BlockBackend, forbidding e.g. an NBD server on the
>> > active layer. This would be a problem for live storage migration if we
>> > don't want the NBD server to reuse the same BB as the guest device.
>> >
>> > More generally, if we can indirectly have multiple BBs on a single
>> > BDS by putting a filter in between, do we have good reasons to forbid
>> > having them attached directly?
>> 
>> Keeping code simple?
>> 
>> Not a valid argument when we *need* multiple BBs, i.e. when the answer
>> to my prior question is "each one needs its own BB".
>
> Are there more places than just the callbacks that would be complicated
> by multiple BBs per BDS?

We'll know when we're done lifting stuff from BDS into BB.

>> >> > What I would consider, however, is adding a TODO comment that tells
>> >> > people that this field needs to go and if you need to use it, something
>> >> > is wrong with your design (which happens to be true for the existing
>> >> > design of some code).
>> >> 
>> >> For the device callbacks, we need a way to find the BB.  If multiple BBs
>> >> can sit on top of the same BDS, we need to find the one with a device
>> >> models attached.  Ot even the ones, if we permit that.
>> >> 
>> >> Let's discuss this a bit, and depending on what we learn, add a suitable
>> >> comment.  Possibly on top.
>> >
>> > Are you sure that nothing else than device models can be interested in
>> > callbacks? I expect that whatever block layer user we have, they will
>> > always be interested in resizes, for example. Media change might also
>> > not be entirely uninteresting, though in most cases what other users
>> > want is probably a blocker.
>> 
>> I designed BlockDevOps for device models only.  If other users emerge,
>> it needs a rename, and possibly a rethink.
>
> Very likely to happen in the long run. Block jobs are today blocking
> resizes, but that's mostly because they don't have an easy way to
> respond to a resize.  Sooner or later someone will want to grow their
> images while they are being mirrored (which is a completely reasonable,
> even if not trivial, thing to want).
>
> Do we have a KVM Forum block layer agenda yet? I think this thread could
> already contain a few topics to discuss there.

No agenda yet, as far as I know.
Benoît Canet Sept. 25, 2014, 9:54 p.m. UTC | #8
On Tue, Sep 23, 2014 at 03:36:03PM +0200, Kevin Wolf wrote:
> Do we have a KVM Forum block layer agenda yet? I think this thread could
> already contain a few topics to discuss there.

Being the guy who constantly brings painful issues
(Block filters, Block Backend) back to the table, I think we should also do a BOFH
(Stefan's idea in a private discussion) about how we can further tweak and
improve the review process.

On some other open source projects as complex as the QEMU block
layer, I experienced a feeling of responsiveness while contributing patches, and
while the new QEMU block layer review process is getting started, we are not there yet.

One idea I have is that we could use this event to establish
semi-informal review peering contracts between contributors, like ISPs do for
bandwidth.

Of course we should be careful to avoid going down the academic review-circle route,
and leave some room for unknown people.

Best regards

Benoît

> 
> Kevin
Kevin Wolf Sept. 30, 2014, 10:40 a.m. UTC | #9
Am 16.09.2014 um 20:12 hat Markus Armbruster geschrieben:
> The pointer from BlockBackend to BlockDriverState is a strong
> reference, managed with bdrv_ref() / bdrv_unref(), the back-pointer is
> a weak one.
> 
> Convenience function blk_new_with_bs() creates a BlockBackend with its
> BlockDriverState.  Callers have to unref both.  The commit after next
> will relieve them of the need to unref the BlockDriverState.
> 
> Complication: due to the silly way drive_del works, we need a way to
> hide a BlockBackend, just like bdrv_make_anon().  To emphasize its
> "special" status, give the function a suitably off-putting name:
> blk_hide_on_behalf_of_do_drive_del().  Unfortunately, hiding turns the
> BlockBackend's name into the empty string.  Can't avoid that without
> breaking the blk->bs->device_name equals blk->name invariant.
> 
> The patch adds a memory leak: drive_del while a device model is
> connected leaks the BlockBackend.  Avoiding the leak here is rather
> hairy, but it'll become straightforward in a few commits, so I mark it
> FIXME in the code now, and plug it when it's easy.
> 
> Signed-off-by: Markus Armbruster <armbru@redhat.com>
> ---
>  block.c                        |  10 ++--
>  block/block-backend.c          |  71 ++++++++++++++++++++++-
>  blockdev.c                     |  21 ++++---
>  hw/block/xen_disk.c            |   8 +--
>  include/block/block_int.h      |   2 +
>  include/sysemu/block-backend.h |   5 ++
>  qemu-img.c                     | 125 +++++++++++++++++++----------------------
>  qemu-io.c                      |   4 +-
>  qemu-nbd.c                     |   4 +-
>  9 files changed, 156 insertions(+), 94 deletions(-)
> 
> diff --git a/block.c b/block.c
> index 934881f..7ccf443 100644
> --- a/block.c
> +++ b/block.c
> @@ -2032,7 +2032,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
>   * This will modify the BlockDriverState fields, and swap contents
>   * between bs_new and bs_old. Both bs_new and bs_old are modified.
>   *
> - * bs_new is required to be anonymous.
> + * bs_new must be nameless and not attached to a BlockBackend.
>   *
>   * This function does not create any image files.
>   */
> @@ -2051,8 +2051,9 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
>          QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
>      }
>  
> -    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
> +    /* bs_new must be nameless and shouldn't have anything fancy enabled */
>      assert(bs_new->device_name[0] == '\0');
> +    assert(!bs_new->blk);
>      assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
>      assert(bs_new->job == NULL);
>      assert(bs_new->dev == NULL);
> @@ -2068,8 +2069,9 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
>      bdrv_move_feature_fields(bs_old, bs_new);
>      bdrv_move_feature_fields(bs_new, &tmp);
>  
> -    /* bs_new shouldn't be in bdrv_states even after the swap!  */
> +    /* bs_new must remain nameless and unattached */
>      assert(bs_new->device_name[0] == '\0');
> +    assert(!bs_new->blk);

Taking back my R-b: You tricked us, this assertion doesn't hold true.
Easy to reproduce by taking a live snapshot. qemu-iotests case 052
catches it. Didn't you run it?

You probably need to swap bs->blk in bdrv_move_feature_fields().

Kevin
Markus Armbruster Sept. 30, 2014, 10:56 a.m. UTC | #10
Kevin Wolf <kwolf@redhat.com> writes:

> Am 16.09.2014 um 20:12 hat Markus Armbruster geschrieben:
>> The pointer from BlockBackend to BlockDriverState is a strong
>> reference, managed with bdrv_ref() / bdrv_unref(), the back-pointer is
>> a weak one.
>> 
>> Convenience function blk_new_with_bs() creates a BlockBackend with its
>> BlockDriverState.  Callers have to unref both.  The commit after next
>> will relieve them of the need to unref the BlockDriverState.
>> 
>> Complication: due to the silly way drive_del works, we need a way to
>> hide a BlockBackend, just like bdrv_make_anon().  To emphasize its
>> "special" status, give the function a suitably off-putting name:
>> blk_hide_on_behalf_of_do_drive_del().  Unfortunately, hiding turns the
>> BlockBackend's name into the empty string.  Can't avoid that without
>> breaking the blk->bs->device_name equals blk->name invariant.
>> 
>> The patch adds a memory leak: drive_del while a device model is
>> connected leaks the BlockBackend.  Avoiding the leak here is rather
>> hairy, but it'll become straightforward in a few commits, so I mark it
>> FIXME in the code now, and plug it when it's easy.
>> 
>> Signed-off-by: Markus Armbruster <armbru@redhat.com>
>> ---
>>  block.c                        |  10 ++--
>>  block/block-backend.c          |  71 ++++++++++++++++++++++-
>>  blockdev.c                     |  21 ++++---
>>  hw/block/xen_disk.c            |   8 +--
>>  include/block/block_int.h      |   2 +
>>  include/sysemu/block-backend.h |   5 ++
>>  qemu-img.c                     | 125 +++++++++++++++++++----------------------
>>  qemu-io.c                      |   4 +-
>>  qemu-nbd.c                     |   4 +-
>>  9 files changed, 156 insertions(+), 94 deletions(-)
>> 
>> diff --git a/block.c b/block.c
>> index 934881f..7ccf443 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -2032,7 +2032,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
>>   * This will modify the BlockDriverState fields, and swap contents
>>   * between bs_new and bs_old. Both bs_new and bs_old are modified.
>>   *
>> - * bs_new is required to be anonymous.
>> + * bs_new must be nameless and not attached to a BlockBackend.
>>   *
>>   * This function does not create any image files.
>>   */
>> @@ -2051,8 +2051,9 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
>>          QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
>>      }
>>  
>> -    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
>> +    /* bs_new must be nameless and shouldn't have anything fancy enabled */
>>      assert(bs_new->device_name[0] == '\0');
>> +    assert(!bs_new->blk);
>>      assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
>>      assert(bs_new->job == NULL);
>>      assert(bs_new->dev == NULL);
>> @@ -2068,8 +2069,9 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
>>      bdrv_move_feature_fields(bs_old, bs_new);
>>      bdrv_move_feature_fields(bs_new, &tmp);
>>  
>> -    /* bs_new shouldn't be in bdrv_states even after the swap!  */
>> +    /* bs_new must remain nameless and unattached */
>>      assert(bs_new->device_name[0] == '\0');
>> +    assert(!bs_new->blk);
>
> Taking back my R-b: You tricked us, this assertion doesn't hold true.
> Easy to reproduce by taking a live snapshot. qemu-iotests case 052
> catches it. Didn't you run it?

I run "make check-qtest check-block" on every commit before I submit.
No idea what went wrong with this one.

> You probably need to swap bs->blk in bdrv_move_feature_fields().

I'll look into it, thanks!
Kevin Wolf Sept. 30, 2014, 11:10 a.m. UTC | #11
Am 30.09.2014 um 12:56 hat Markus Armbruster geschrieben:
> Kevin Wolf <kwolf@redhat.com> writes:
> 
> > Am 16.09.2014 um 20:12 hat Markus Armbruster geschrieben:
> >> The pointer from BlockBackend to BlockDriverState is a strong
> >> reference, managed with bdrv_ref() / bdrv_unref(), the back-pointer is
> >> a weak one.
> >> 
> >> Convenience function blk_new_with_bs() creates a BlockBackend with its
> >> BlockDriverState.  Callers have to unref both.  The commit after next
> >> will relieve them of the need to unref the BlockDriverState.
> >> 
> >> Complication: due to the silly way drive_del works, we need a way to
> >> hide a BlockBackend, just like bdrv_make_anon().  To emphasize its
> >> "special" status, give the function a suitably off-putting name:
> >> blk_hide_on_behalf_of_do_drive_del().  Unfortunately, hiding turns the
> >> BlockBackend's name into the empty string.  Can't avoid that without
> >> breaking the blk->bs->device_name equals blk->name invariant.
> >> 
> >> The patch adds a memory leak: drive_del while a device model is
> >> connected leaks the BlockBackend.  Avoiding the leak here is rather
> >> hairy, but it'll become straightforward in a few commits, so I mark it
> >> FIXME in the code now, and plug it when it's easy.
> >> 
> >> Signed-off-by: Markus Armbruster <armbru@redhat.com>
> >> ---
> >>  block.c                        |  10 ++--
> >>  block/block-backend.c          |  71 ++++++++++++++++++++++-
> >>  blockdev.c                     |  21 ++++---
> >>  hw/block/xen_disk.c            |   8 +--
> >>  include/block/block_int.h      |   2 +
> >>  include/sysemu/block-backend.h |   5 ++
> >>  qemu-img.c                     | 125 +++++++++++++++++++----------------------
> >>  qemu-io.c                      |   4 +-
> >>  qemu-nbd.c                     |   4 +-
> >>  9 files changed, 156 insertions(+), 94 deletions(-)
> >> 
> >> diff --git a/block.c b/block.c
> >> index 934881f..7ccf443 100644
> >> --- a/block.c
> >> +++ b/block.c
> >> @@ -2032,7 +2032,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
> >>   * This will modify the BlockDriverState fields, and swap contents
> >>   * between bs_new and bs_old. Both bs_new and bs_old are modified.
> >>   *
> >> - * bs_new is required to be anonymous.
> >> + * bs_new must be nameless and not attached to a BlockBackend.
> >>   *
> >>   * This function does not create any image files.
> >>   */
> >> @@ -2051,8 +2051,9 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
> >>          QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
> >>      }
> >>  
> >> -    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
> >> +    /* bs_new must be nameless and shouldn't have anything fancy enabled */
> >>      assert(bs_new->device_name[0] == '\0');
> >> +    assert(!bs_new->blk);
> >>      assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
> >>      assert(bs_new->job == NULL);
> >>      assert(bs_new->dev == NULL);
> >> @@ -2068,8 +2069,9 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
> >>      bdrv_move_feature_fields(bs_old, bs_new);
> >>      bdrv_move_feature_fields(bs_new, &tmp);
> >>  
> >> -    /* bs_new shouldn't be in bdrv_states even after the swap!  */
> >> +    /* bs_new must remain nameless and unattached */
> >>      assert(bs_new->device_name[0] == '\0');
> >> +    assert(!bs_new->blk);
> >
> > Taking back my R-b: You tricked us, this assertion doesn't hold true.
> > Easy to reproduce by taking a live snapshot. qemu-iotests case 052
> > catches it. Didn't you run it?
> 
> I run "make check-qtest check-block" on every commit before I submit.
> No idea what went wrong with this one.

When run for raw, it's only 052 that catches it. For qcow2, I got some
more failures: 039 040 041 051 052 085

I see the problem: Only 039 and 052 are marked as 'quick', i.e. the rest
is already excluded from 'make check-block'. 039 and 052 don't work with
cache=none and 'make check-block' uses -nocache, so those are skipped as
well. I'll send a patch to remove the -nocache option and let it run
with the default options.

Kevin
Markus Armbruster Sept. 30, 2014, 12:03 p.m. UTC | #12
Kevin Wolf <kwolf@redhat.com> writes:

> Am 30.09.2014 um 12:56 hat Markus Armbruster geschrieben:
>> Kevin Wolf <kwolf@redhat.com> writes:
[...]
>> > Taking back my R-b: You tricked us, this assertion doesn't hold true.
>> > Easy to reproduce by taking a live snapshot. qemu-iotests case 052
>> > catches it. Didn't you run it?
>> 
>> I run "make check-qtest check-block" on every commit before I submit.
>> No idea what went wrong with this one.
>
> When run for raw, it's only 052 that catches it. For qcow2, I got some
> more failures: 039 040 041 051 052 085
>
> I see the problem: Only 039 and 052 are marked as 'quick', i.e. the rest
> is already excluded from 'make check-block'. 039 and 052 don't work with
> cache=none and 'make check-block' uses -nocache, so those are skipped as
> well.

Yes, that's why I missed it.

>       I'll send a patch to remove the -nocache option and let it run
> with the default options.

Appreciated!
diff mbox

Patch

diff --git a/block.c b/block.c
index 934881f..7ccf443 100644
--- a/block.c
+++ b/block.c
@@ -2032,7 +2032,7 @@  static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
  * This will modify the BlockDriverState fields, and swap contents
  * between bs_new and bs_old. Both bs_new and bs_old are modified.
  *
- * bs_new is required to be anonymous.
+ * bs_new must be nameless and not attached to a BlockBackend.
  *
  * This function does not create any image files.
  */
@@ -2051,8 +2051,9 @@  void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
     }
 
-    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
+    /* bs_new must be nameless and shouldn't have anything fancy enabled */
     assert(bs_new->device_name[0] == '\0');
+    assert(!bs_new->blk);
     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
     assert(bs_new->job == NULL);
     assert(bs_new->dev == NULL);
@@ -2068,8 +2069,9 @@  void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
     bdrv_move_feature_fields(bs_old, bs_new);
     bdrv_move_feature_fields(bs_new, &tmp);
 
-    /* bs_new shouldn't be in bdrv_states even after the swap!  */
+    /* bs_new must remain nameless and unattached */
     assert(bs_new->device_name[0] == '\0');
+    assert(!bs_new->blk);
 
     /* Check a few fields that should remain attached to the device */
     assert(bs_new->dev == NULL);
@@ -2096,7 +2098,7 @@  void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
  * This will modify the BlockDriverState fields, and swap contents
  * between bs_new and bs_top. Both bs_new and bs_top are modified.
  *
- * bs_new is required to be anonymous.
+ * bs_new must be nameless and not attached to a BlockBackend.
  *
  * This function does not create any image files.
  */
diff --git a/block/block-backend.c b/block/block-backend.c
index e89caa9..a12215a 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -16,10 +16,11 @@ 
 struct BlockBackend {
     char *name;
     int refcnt;
+    BlockDriverState *bs;
     QTAILQ_ENTRY(BlockBackend) link; /* for blk_backends */
 };
 
-/* All the BlockBackends */
+/* All the BlockBackends (except for hidden ones) */
 static QTAILQ_HEAD(, BlockBackend) blk_backends =
     QTAILQ_HEAD_INITIALIZER(blk_backends);
 
@@ -47,10 +48,44 @@  BlockBackend *blk_new(const char *name, Error **errp)
     return blk;
 }
 
+/*
+ * Create a new BlockBackend with a new BlockDriverState attached.
+ * Both have a reference count of one.  Caller owns *both* references.
+ * TODO Let caller own only the BlockBackend reference
+ * Otherwise just like blk_new(), which see.
+ */
+BlockBackend *blk_new_with_bs(const char *name, Error **errp)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs;
+
+    blk = blk_new(name, errp);
+    if (!blk) {
+        return NULL;
+    }
+
+    bs = bdrv_new_root(name, errp);
+    if (!bs) {
+        blk_unref(blk);
+        return NULL;
+    }
+
+    blk->bs = bs;
+    bs->blk = blk;
+    return blk;
+}
+
 static void blk_delete(BlockBackend *blk)
 {
     assert(!blk->refcnt);
-    QTAILQ_REMOVE(&blk_backends, blk, link);
+    if (blk->bs) {
+        blk->bs->blk = NULL;
+        blk->bs = NULL;
+    }
+    /* Avoid double-remove after blk_hide_on_behalf_of_do_drive_del() */
+    if (blk->name[0]) {
+        QTAILQ_REMOVE(&blk_backends, blk, link);
+    }
     g_free(blk->name);
     g_free(blk);
 }
@@ -68,6 +103,8 @@  void blk_ref(BlockBackend *blk)
  * Decrement @blk's reference count.
  * If this drops it to zero, destroy @blk.
  * For convenience, do nothing if @blk is null.
+ * Does *not* touch the attached BlockDriverState's reference count.
+ * TODO Decrement it!
  */
 void blk_unref(BlockBackend *blk)
 {
@@ -95,7 +132,9 @@  BlockBackend *blk_next(BlockBackend *blk)
 }
 
 /*
- * Return @blk's name, a non-null, non-empty string.
+ * Return @blk's name, a non-null string.
+ * Wart: the name is empty iff @blk has been hidden with
+ * blk_hide_on_behalf_of_do_drive_del().
  */
 const char *blk_name(BlockBackend *blk)
 {
@@ -118,3 +157,29 @@  BlockBackend *blk_by_name(const char *name)
     }
     return NULL;
 }
+
+/*
+ * Return the BlockDriverState attached to @blk if any, else null.
+ */
+BlockDriverState *blk_bs(BlockBackend *blk)
+{
+    return blk->bs;
+}
+
+/*
+ * Hide @blk.
+ * @blk must not have been hidden already.
+ * Make attached BlockDriverState, if any, anonymous.
+ * Once hidden, @blk is invisible to all functions that don't receive
+ * it as argument.  For example, blk_by_name() won't return it.
+ * Strictly for use by do_drive_del().
+ * TODO get rid of it!
+ */
+void blk_hide_on_behalf_of_do_drive_del(BlockBackend *blk)
+{
+    QTAILQ_REMOVE(&blk_backends, blk, link);
+    blk->name[0] = 0;
+    if (blk->bs) {
+        bdrv_make_anon(blk->bs);
+    }
+}
diff --git a/blockdev.c b/blockdev.c
index 583235a..5da6028 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -228,6 +228,7 @@  void drive_info_del(DriveInfo *dinfo)
     if (dinfo->opts) {
         qemu_opts_del(dinfo->opts);
     }
+
     g_free(dinfo->id);
     QTAILQ_REMOVE(&drives, dinfo, next);
     g_free(dinfo->serial);
@@ -465,14 +466,11 @@  static DriveInfo *blockdev_init(const char *file, QDict *bs_opts,
     }
 
     /* init */
-    blk = blk_new(qemu_opts_id(opts), errp);
+    blk = blk_new_with_bs(qemu_opts_id(opts), errp);
     if (!blk) {
         goto early_err;
     }
-    bs = bdrv_new_root(qemu_opts_id(opts), errp);
-    if (!bs) {
-        goto bdrv_new_err;
-    }
+    bs = blk_bs(blk);
     bs->open_flags = snapshot ? BDRV_O_SNAPSHOT : 0;
     bs->read_only = ro;
     bs->detect_zeroes = detect_zeroes;
@@ -537,7 +535,6 @@  static DriveInfo *blockdev_init(const char *file, QDict *bs_opts,
 
 err:
     bdrv_unref(bs);
-bdrv_new_err:
     blk_unref(blk);
 early_err:
     qemu_opts_del(opts);
@@ -1752,16 +1749,18 @@  void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
 int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
 {
     const char *id = qdict_get_str(qdict, "id");
+    BlockBackend *blk;
     BlockDriverState *bs;
     DriveInfo *dinfo;
     AioContext *aio_context;
     Error *local_err = NULL;
 
-    bs = bdrv_find(id);
-    if (!bs) {
+    blk = blk_by_name(id);
+    if (!blk) {
         error_report("Device '%s' not found", id);
         return -1;
     }
+    bs = blk_bs(blk);
 
     dinfo = drive_get_by_blockdev(bs);
     if (dinfo && !dinfo->enable_auto_del) {
@@ -1791,15 +1790,15 @@  int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
      * then we can just get rid of the block driver state right here.
      */
     if (bdrv_get_attached_dev(bs)) {
-        bdrv_make_anon(bs);
-
+        blk_hide_on_behalf_of_do_drive_del(blk);
         /* Further I/O must not pause the guest */
         bdrv_set_on_error(bs, BLOCKDEV_ON_ERROR_REPORT,
                           BLOCKDEV_ON_ERROR_REPORT);
+        /* FIXME bs->blk leaked when bs dies */
     } else {
         drive_del(dinfo);
+        blk_unref(blk);
     }
-    blk_unref(blk_by_name(id));
 
     aio_context_release(aio_context);
     return 0;
diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 265cf13..0022083 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -860,15 +860,11 @@  static int blk_connect(struct XenDevice *xendev)
 
         /* setup via xenbus -> create new block driver instance */
         xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
-        blk = blk_new(blkdev->dev, NULL);
+        blk = blk_new_with_bs(blkdev->dev, NULL);
         if (!blk) {
             return -1;
         }
-        blkdev->bs = bdrv_new_root(blkdev->dev, NULL);
-        if (!blkdev->bs) {
-            blk_unref(blk);
-            return -1;
-        }
+        blkdev->bs = blk_bs(blk);
 
         drv = bdrv_find_whitelisted_format(blkdev->fileproto, readonly);
         if (bdrv_open(&blkdev->bs, blkdev->filename, NULL, NULL, qflags,
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 8d86a6c..14e0b7c 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -324,6 +324,8 @@  struct BlockDriverState {
     BlockDriver *drv; /* NULL means no media */
     void *opaque;
 
+    BlockBackend *blk;          /* owning backend, if any */
+
     void *dev;                  /* attached device model, if any */
     /* TODO change to DeviceState when all users are qdevified */
     const BlockDevOps *dev_ops;
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 3f8371c..fa8f623 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -17,10 +17,15 @@ 
 #include "qapi/error.h"
 
 BlockBackend *blk_new(const char *name, Error **errp);
+BlockBackend *blk_new_with_bs(const char *name, Error **errp);
 void blk_ref(BlockBackend *blk);
 void blk_unref(BlockBackend *blk);
 const char *blk_name(BlockBackend *blk);
 BlockBackend *blk_by_name(const char *name);
 BlockBackend *blk_next(BlockBackend *blk);
 
+BlockDriverState *blk_bs(BlockBackend *blk);
+
+void blk_hide_on_behalf_of_do_drive_del(BlockBackend *blk);
+
 #endif
diff --git a/qemu-img.c b/qemu-img.c
index acb272e..206a513 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -284,20 +284,19 @@  static int print_block_option_help(const char *filename, const char *fmt)
     return 0;
 }
 
-static BlockDriverState *bdrv_new_open(const char *id,
-                                       const char *filename,
-                                       const char *fmt,
-                                       int flags,
-                                       bool require_io,
-                                       bool quiet)
+static BlockBackend *img_open(const char *id, const char *filename,
+                              const char *fmt, int flags,
+                              bool require_io, bool quiet)
 {
+    BlockBackend *blk;
     BlockDriverState *bs;
     BlockDriver *drv;
     char password[256];
     Error *local_err = NULL;
     int ret;
 
-    bs = bdrv_new_root(id, &error_abort);
+    blk = blk_new_with_bs(id, &error_abort);
+    bs = blk_bs(blk);
 
     if (fmt) {
         drv = bdrv_find_format(fmt);
@@ -328,9 +327,10 @@  static BlockDriverState *bdrv_new_open(const char *id,
             goto fail;
         }
     }
-    return bs;
+    return blk;
 fail:
     bdrv_unref(bs);
+    blk_unref(blk);
     return NULL;
 }
 
@@ -580,7 +580,7 @@  static int img_check(int argc, char **argv)
     BlockDriverState *bs;
     int fix = 0;
     int flags = BDRV_O_FLAGS | BDRV_O_CHECK;
-    ImageCheck *check = NULL;
+    ImageCheck *check;
     bool quiet = false;
 
     fmt = NULL;
@@ -651,12 +651,11 @@  static int img_check(int argc, char **argv)
         return 1;
     }
 
-    blk = blk_new("image", &error_abort);
-    bs = bdrv_new_open("image", filename, fmt, flags, true, quiet);
-    if (!bs) {
-        ret = 1;
-        goto fail;
+    blk = img_open("image", filename, fmt, flags, true, quiet);
+    if (!blk) {
+        return 1;
     }
+    bs = blk_bs(blk);
 
     check = g_new0(ImageCheck, 1);
     ret = collect_image_check(bs, check, filename, fmt, fix);
@@ -762,12 +761,12 @@  static int img_commit(int argc, char **argv)
         return 1;
     }
 
-    blk = blk_new("image", &error_abort);
-    bs = bdrv_new_open("image", filename, fmt, flags, true, quiet);
-    if (!bs) {
-        ret = -1;
-        goto out;
+    blk = img_open("image", filename, fmt, flags, true, quiet);
+    if (!blk) {
+        return 1;
     }
+    bs = blk_bs(blk);
+
     ret = bdrv_commit(bs);
     switch(ret) {
     case 0:
@@ -787,7 +786,6 @@  static int img_commit(int argc, char **argv)
         break;
     }
 
-out:
     bdrv_unref(bs);
     blk_unref(blk);
     if (ret) {
@@ -1022,21 +1020,21 @@  static int img_compare(int argc, char **argv)
         goto out3;
     }
 
-    blk1 = blk_new("image 1", &error_abort);
-    bs1 = bdrv_new_open("image 1", filename1, fmt1, flags, true, quiet);
-    if (!bs1) {
+    blk1 = img_open("image 1", filename1, fmt1, flags, true, quiet);
+    if (!blk1) {
         error_report("Can't open file %s", filename1);
         ret = 2;
-        goto out2;
+        goto out3;
     }
+    bs1 = blk_bs(blk1);
 
-    blk2 = blk_new("image 2", &error_abort);
-    bs2 = bdrv_new_open("image 2", filename2, fmt2, flags, true, quiet);
-    if (!bs2) {
+    blk2 = img_open("image 2", filename2, fmt2, flags, true, quiet);
+    if (!blk2) {
         error_report("Can't open file %s", filename2);
         ret = 2;
-        goto out1;
+        goto out2;
     }
+    bs2 = blk_bs(blk2);
 
     buf1 = qemu_blockalign(bs1, IO_BUF_SIZE);
     buf2 = qemu_blockalign(bs2, IO_BUF_SIZE);
@@ -1198,7 +1196,6 @@  static int img_compare(int argc, char **argv)
 out:
     qemu_vfree(buf1);
     qemu_vfree(buf2);
-out1:
     bdrv_unref(bs2);
     blk_unref(blk2);
 out2:
@@ -1379,15 +1376,15 @@  static int img_convert(int argc, char **argv)
     for (bs_i = 0; bs_i < bs_n; bs_i++) {
         char *id = bs_n > 1 ? g_strdup_printf("source %d", bs_i)
                             : g_strdup("source");
-        blk[bs_i] = blk_new(id, &error_abort);
-        bs[bs_i] = bdrv_new_open(id, argv[optind + bs_i], fmt, src_flags,
-                                 true, quiet);
+        blk[bs_i] = img_open(id, argv[optind + bs_i], fmt, src_flags,
+                             true, quiet);
         g_free(id);
-        if (!bs[bs_i]) {
+        if (!blk[bs_i]) {
             error_report("Could not open '%s'", argv[optind + bs_i]);
             ret = -1;
             goto out;
         }
+        bs[bs_i] = blk_bs(blk[bs_i]);
         bs_sectors[bs_i] = bdrv_nb_sectors(bs[bs_i]);
         if (bs_sectors[bs_i] < 0) {
             error_report("Could not get size of %s: %s",
@@ -1505,12 +1502,12 @@  static int img_convert(int argc, char **argv)
         goto out;
     }
 
-    out_blk = blk_new("target", &error_abort);
-    out_bs = bdrv_new_open("target", out_filename, out_fmt, flags, true, quiet);
-    if (!out_bs) {
+    out_blk = img_open("target", out_filename, out_fmt, flags, true, quiet);
+    if (!out_blk) {
         ret = -1;
         goto out;
     }
+    out_bs = blk_bs(out_blk);
 
     bs_i = 0;
     bs_offset = 0;
@@ -1897,13 +1894,12 @@  static ImageInfoList *collect_image_info_list(const char *filename,
         }
         g_hash_table_insert(filenames, (gpointer)filename, NULL);
 
-        blk = blk_new("image", &error_abort);
-        bs = bdrv_new_open("image", filename, fmt,
-                           BDRV_O_FLAGS | BDRV_O_NO_BACKING, false, false);
-        if (!bs) {
-            blk_unref(blk);
+        blk = img_open("image", filename, fmt,
+                       BDRV_O_FLAGS | BDRV_O_NO_BACKING, false, false);
+        if (!blk) {
             goto err;
         }
+        bs = blk_bs(blk);
 
         bdrv_query_image_info(bs, &info, &err);
         if (err) {
@@ -2163,12 +2159,11 @@  static int img_map(int argc, char **argv)
         return 1;
     }
 
-    blk = blk_new("image", &error_abort);
-    bs = bdrv_new_open("image", filename, fmt, BDRV_O_FLAGS, true, false);
-    if (!bs) {
-        ret = -1;
-        goto out;
+    blk = img_open("image", filename, fmt, BDRV_O_FLAGS, true, false);
+    if (!blk) {
+        return 1;
     }
+    bs = blk_bs(blk);
 
     if (output_format == OFORMAT_HUMAN) {
         printf("%-16s%-16s%-16s%s\n", "Offset", "Length", "Mapped to", "File");
@@ -2287,12 +2282,11 @@  static int img_snapshot(int argc, char **argv)
     filename = argv[optind++];
 
     /* Open the image */
-    blk = blk_new("image", &error_abort);
-    bs = bdrv_new_open("image", filename, NULL, bdrv_oflags, true, quiet);
-    if (!bs) {
-        ret = -1;
-        goto out;
+    blk = img_open("image", filename, NULL, bdrv_oflags, true, quiet);
+    if (!blk) {
+        return 1;
     }
+    bs = blk_bs(blk);
 
     /* Perform the requested action */
     switch(action) {
@@ -2335,7 +2329,6 @@  static int img_snapshot(int argc, char **argv)
     }
 
     /* Cleanup */
-out:
     bdrv_unref(bs);
     blk_unref(blk);
     if (ret) {
@@ -2435,12 +2428,12 @@  static int img_rebase(int argc, char **argv)
      * Ignore the old backing file for unsafe rebase in case we want to correct
      * the reference to a renamed or moved backing file.
      */
-    blk = blk_new("image", &error_abort);
-    bs = bdrv_new_open("image", filename, fmt, flags, true, quiet);
-    if (!bs) {
+    blk = img_open("image", filename, fmt, flags, true, quiet);
+    if (!blk) {
         ret = -1;
         goto out;
     }
+    bs = blk_bs(blk);
 
     /* Find the right drivers for the backing files */
     old_backing_drv = NULL;
@@ -2468,8 +2461,8 @@  static int img_rebase(int argc, char **argv)
     if (!unsafe) {
         char backing_name[1024];
 
-        blk_old_backing = blk_new("old_backing", &error_abort);
-        bs_old_backing = bdrv_new_root("old_backing", &error_abort);
+        blk_old_backing = blk_new_with_bs("old_backing", &error_abort);
+        bs_old_backing = blk_bs(blk_old_backing);
         bdrv_get_backing_filename(bs, backing_name, sizeof(backing_name));
         ret = bdrv_open(&bs_old_backing, backing_name, NULL, NULL, src_flags,
                         old_backing_drv, &local_err);
@@ -2480,8 +2473,8 @@  static int img_rebase(int argc, char **argv)
             goto out;
         }
         if (out_baseimg[0]) {
-            blk_new_backing = blk_new("new_backing", &error_abort);
-            bs_new_backing = bdrv_new_root("new_backing", &error_abort);
+            blk_new_backing = blk_new_with_bs("new_backing", &error_abort);
+            bs_new_backing = blk_bs(blk_new_backing);
             ret = bdrv_open(&bs_new_backing, out_baseimg, NULL, NULL, src_flags,
                             new_backing_drv, &local_err);
             if (ret) {
@@ -2757,13 +2750,13 @@  static int img_resize(int argc, char **argv)
     n = qemu_opt_get_size(param, BLOCK_OPT_SIZE, 0);
     qemu_opts_del(param);
 
-    blk = blk_new("image", &error_abort);
-    bs = bdrv_new_open("image", filename, fmt, BDRV_O_FLAGS | BDRV_O_RDWR,
-                       true, quiet);
-    if (!bs) {
+    blk = img_open("image", filename, fmt, BDRV_O_FLAGS | BDRV_O_RDWR,
+                   true, quiet);
+    if (!blk) {
         ret = -1;
         goto out;
     }
+    bs = blk_bs(blk);
 
     if (relative) {
         total_size = bdrv_getlength(bs) + n * relative;
@@ -2875,13 +2868,13 @@  static int img_amend(int argc, char **argv)
         goto out;
     }
 
-    blk = blk_new("image", &error_abort);
-    bs = bdrv_new_open("image", filename, fmt, flags, true, quiet);
-    if (!bs) {
+    blk = img_open("image", filename, fmt, flags, true, quiet);
+    if (!blk) {
         error_report("Could not open image '%s'", filename);
         ret = -1;
         goto out;
     }
+    bs = blk_bs(blk);
 
     fmt = bs->drv->format_name;
 
diff --git a/qemu-io.c b/qemu-io.c
index 57090de..ef1d3ea 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -62,8 +62,8 @@  static int openfile(char *name, int flags, int growable, QDict *opts)
         return 1;
     }
 
-    qemuio_blk = blk_new("hda", &error_abort);
-    qemuio_bs = bdrv_new_root("hda", &error_abort);
+    qemuio_blk = blk_new_with_bs("hda", &error_abort);
+    qemuio_bs = blk_bs(qemuio_blk);
 
     if (growable) {
         flags |= BDRV_O_PROTOCOL;
diff --git a/qemu-nbd.c b/qemu-nbd.c
index ff95da6..24808e8 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -688,8 +688,8 @@  int main(int argc, char **argv)
         drv = NULL;
     }
 
-    blk = blk_new("hda", &error_abort);
-    bs = bdrv_new_root("hda", &error_abort);
+    blk = blk_new_with_bs("hda", &error_abort);
+    bs = blk_bs(blk);
 
     srcpath = argv[optind];
     ret = bdrv_open(&bs, srcpath, NULL, NULL, flags, drv, &local_err);