diff mbox

[COLO-Block,v6,07/16] Add new block driver interface to connect/disconnect the remote target

Message ID 1434617361-17778-8-git-send-email-wency@cn.fujitsu.com
State New
Headers show

Commit Message

Wen Congyang June 18, 2015, 8:49 a.m. UTC
In some cases, we want to connect/disconnect the remote target when
we need, not in bdrv_open()/bdrv_close().

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Gonglei <arei.gonglei@huawei.com>
---
 block.c                   | 24 ++++++++++++++++++++++++
 include/block/block.h     |  3 +++
 include/block/block_int.h |  3 +++
 3 files changed, 30 insertions(+)

Comments

Stefan Hajnoczi June 18, 2015, 12:55 p.m. UTC | #1
On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
> +void bdrv_connect(BlockDriverState *bs, Error **errp)
> +{
> +    BlockDriver *drv = bs->drv;
> +
> +    if (drv && drv->bdrv_connect) {
> +        drv->bdrv_connect(bs, errp);
> +    } else if (bs->file) {
> +        bdrv_connect(bs->file, errp);
> +    } else {
> +        error_setg(errp, "this feature or command is not currently supported");
> +    }
> +}
> +
> +void bdrv_disconnect(BlockDriverState *bs)
> +{
> +    BlockDriver *drv = bs->drv;
> +
> +    if (drv && drv->bdrv_disconnect) {
> +        drv->bdrv_disconnect(bs);
> +    } else if (bs->file) {
> +        bdrv_disconnect(bs->file);
> +    }
> +}

Please add doc comments describing the semantics of these commands.

Why are these operations needed when there is already a bs->drv == NULL
case which means the BDS is not ready for read/write?
Wen Congyang June 18, 2015, 2:36 p.m. UTC | #2
At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
> On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
>> +void bdrv_connect(BlockDriverState *bs, Error **errp)
>> +{
>> +    BlockDriver *drv = bs->drv;
>> +
>> +    if (drv && drv->bdrv_connect) {
>> +        drv->bdrv_connect(bs, errp);
>> +    } else if (bs->file) {
>> +        bdrv_connect(bs->file, errp);
>> +    } else {
>> +        error_setg(errp, "this feature or command is not currently supported");
>> +    }
>> +}
>> +
>> +void bdrv_disconnect(BlockDriverState *bs)
>> +{
>> +    BlockDriver *drv = bs->drv;
>> +
>> +    if (drv && drv->bdrv_disconnect) {
>> +        drv->bdrv_disconnect(bs);
>> +    } else if (bs->file) {
>> +        bdrv_disconnect(bs->file);
>> +    }
>> +}
>
> Please add doc comments describing the semantics of these commands.

Where should it be documented? In the header file?

>
> Why are these operations needed when there is already a bs->drv == NULL
> case which means the BDS is not ready for read/write?
>

The purpos is that: don't connect to nbd server when opening a nbd 
client. connect/disconnect
to nbd server when we need to do it.

IIUC, if bs->drv is NULL, it means that the driver is ejected? Here, 
connect/disconnect
means that connect/disconnect to remote target(The target may be in 
another host).

Thanks
Wen Congyang
Stefan Hajnoczi June 18, 2015, 4:06 p.m. UTC | #3
On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
> At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
> >On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
> >>+void bdrv_connect(BlockDriverState *bs, Error **errp)
> >>+{
> >>+    BlockDriver *drv = bs->drv;
> >>+
> >>+    if (drv && drv->bdrv_connect) {
> >>+        drv->bdrv_connect(bs, errp);
> >>+    } else if (bs->file) {
> >>+        bdrv_connect(bs->file, errp);
> >>+    } else {
> >>+        error_setg(errp, "this feature or command is not currently supported");
> >>+    }
> >>+}
> >>+
> >>+void bdrv_disconnect(BlockDriverState *bs)
> >>+{
> >>+    BlockDriver *drv = bs->drv;
> >>+
> >>+    if (drv && drv->bdrv_disconnect) {
> >>+        drv->bdrv_disconnect(bs);
> >>+    } else if (bs->file) {
> >>+        bdrv_disconnect(bs->file);
> >>+    }
> >>+}
> >
> >Please add doc comments describing the semantics of these commands.
> 
> Where should it be documented? In the header file?

block.h doesn't document prototypes in the header file, please document
the function definition in block.c.  (QEMU is not consistent here, some
places do it the other way around.)

> >Why are these operations needed when there is already a bs->drv == NULL
> >case which means the BDS is not ready for read/write?
> >
> 
> The purpos is that: don't connect to nbd server when opening a nbd client.
> connect/disconnect
> to nbd server when we need to do it.
> 
> IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
> connect/disconnect
> means that connect/disconnect to remote target(The target may be in another
> host).

Connect/disconnect puts something on the QEMU command-line that isn't
ready at startup time.

How about using monitor commands to add objects when needed instead?

That is cleaner because it doesn't introduce a new state (which is only
implemented for nbd).
Wen Congyang June 19, 2015, 12:54 a.m. UTC | #4
On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
> On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
>> At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
>>> On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
>>>> +void bdrv_connect(BlockDriverState *bs, Error **errp)
>>>> +{
>>>> +    BlockDriver *drv = bs->drv;
>>>> +
>>>> +    if (drv && drv->bdrv_connect) {
>>>> +        drv->bdrv_connect(bs, errp);
>>>> +    } else if (bs->file) {
>>>> +        bdrv_connect(bs->file, errp);
>>>> +    } else {
>>>> +        error_setg(errp, "this feature or command is not currently supported");
>>>> +    }
>>>> +}
>>>> +
>>>> +void bdrv_disconnect(BlockDriverState *bs)
>>>> +{
>>>> +    BlockDriver *drv = bs->drv;
>>>> +
>>>> +    if (drv && drv->bdrv_disconnect) {
>>>> +        drv->bdrv_disconnect(bs);
>>>> +    } else if (bs->file) {
>>>> +        bdrv_disconnect(bs->file);
>>>> +    }
>>>> +}
>>>
>>> Please add doc comments describing the semantics of these commands.
>>
>> Where should it be documented? In the header file?
> 
> block.h doesn't document prototypes in the header file, please document
> the function definition in block.c.  (QEMU is not consistent here, some
> places do it the other way around.)
> 
>>> Why are these operations needed when there is already a bs->drv == NULL
>>> case which means the BDS is not ready for read/write?
>>>
>>
>> The purpos is that: don't connect to nbd server when opening a nbd client.
>> connect/disconnect
>> to nbd server when we need to do it.
>>
>> IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
>> connect/disconnect
>> means that connect/disconnect to remote target(The target may be in another
>> host).
> 
> Connect/disconnect puts something on the QEMU command-line that isn't
> ready at startup time.
> 
> How about using monitor commands to add objects when needed instead?
> 
> That is cleaner because it doesn't introduce a new state (which is only
> implemented for nbd).
> 

The problem is that, nbd client is one child of quorum, and quorum must have more
than one child. The nbd server is not ready until colo is running.

Any suggestion is welcome.

Thanks
Wen Congyang
Stefan Hajnoczi June 19, 2015, 10:49 a.m. UTC | #5
On Fri, Jun 19, 2015 at 08:54:56AM +0800, Wen Congyang wrote:
> On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
> > On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
> >> At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
> >>> On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
> >>>> +void bdrv_connect(BlockDriverState *bs, Error **errp)
> >>>> +{
> >>>> +    BlockDriver *drv = bs->drv;
> >>>> +
> >>>> +    if (drv && drv->bdrv_connect) {
> >>>> +        drv->bdrv_connect(bs, errp);
> >>>> +    } else if (bs->file) {
> >>>> +        bdrv_connect(bs->file, errp);
> >>>> +    } else {
> >>>> +        error_setg(errp, "this feature or command is not currently supported");
> >>>> +    }
> >>>> +}
> >>>> +
> >>>> +void bdrv_disconnect(BlockDriverState *bs)
> >>>> +{
> >>>> +    BlockDriver *drv = bs->drv;
> >>>> +
> >>>> +    if (drv && drv->bdrv_disconnect) {
> >>>> +        drv->bdrv_disconnect(bs);
> >>>> +    } else if (bs->file) {
> >>>> +        bdrv_disconnect(bs->file);
> >>>> +    }
> >>>> +}
> >>>
> >>> Please add doc comments describing the semantics of these commands.
> >>
> >> Where should it be documented? In the header file?
> > 
> > block.h doesn't document prototypes in the header file, please document
> > the function definition in block.c.  (QEMU is not consistent here, some
> > places do it the other way around.)
> > 
> >>> Why are these operations needed when there is already a bs->drv == NULL
> >>> case which means the BDS is not ready for read/write?
> >>>
> >>
> >> The purpos is that: don't connect to nbd server when opening a nbd client.
> >> connect/disconnect
> >> to nbd server when we need to do it.
> >>
> >> IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
> >> connect/disconnect
> >> means that connect/disconnect to remote target(The target may be in another
> >> host).
> > 
> > Connect/disconnect puts something on the QEMU command-line that isn't
> > ready at startup time.
> > 
> > How about using monitor commands to add objects when needed instead?
> > 
> > That is cleaner because it doesn't introduce a new state (which is only
> > implemented for nbd).
> > 
> 
> The problem is that, nbd client is one child of quorum, and quorum must have more
> than one child. The nbd server is not ready until colo is running.

A monitor command to hot add/remove quorum children solves this problem
and could also be used in other scenarios (e.g. user decides to take a
quorum child offline).
Wen Congyang June 20, 2015, 3:31 a.m. UTC | #6
At 2015/6/19 18:49, Stefan Hajnoczi Wrote:
> On Fri, Jun 19, 2015 at 08:54:56AM +0800, Wen Congyang wrote:
>> On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
>>> On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
>>>> At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
>>>>> On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
>>>>>> +void bdrv_connect(BlockDriverState *bs, Error **errp)
>>>>>> +{
>>>>>> +    BlockDriver *drv = bs->drv;
>>>>>> +
>>>>>> +    if (drv && drv->bdrv_connect) {
>>>>>> +        drv->bdrv_connect(bs, errp);
>>>>>> +    } else if (bs->file) {
>>>>>> +        bdrv_connect(bs->file, errp);
>>>>>> +    } else {
>>>>>> +        error_setg(errp, "this feature or command is not currently supported");
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +void bdrv_disconnect(BlockDriverState *bs)
>>>>>> +{
>>>>>> +    BlockDriver *drv = bs->drv;
>>>>>> +
>>>>>> +    if (drv && drv->bdrv_disconnect) {
>>>>>> +        drv->bdrv_disconnect(bs);
>>>>>> +    } else if (bs->file) {
>>>>>> +        bdrv_disconnect(bs->file);
>>>>>> +    }
>>>>>> +}
>>>>>
>>>>> Please add doc comments describing the semantics of these commands.
>>>>
>>>> Where should it be documented? In the header file?
>>>
>>> block.h doesn't document prototypes in the header file, please document
>>> the function definition in block.c.  (QEMU is not consistent here, some
>>> places do it the other way around.)
>>>
>>>>> Why are these operations needed when there is already a bs->drv == NULL
>>>>> case which means the BDS is not ready for read/write?
>>>>>
>>>>
>>>> The purpos is that: don't connect to nbd server when opening a nbd client.
>>>> connect/disconnect
>>>> to nbd server when we need to do it.
>>>>
>>>> IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
>>>> connect/disconnect
>>>> means that connect/disconnect to remote target(The target may be in another
>>>> host).
>>>
>>> Connect/disconnect puts something on the QEMU command-line that isn't
>>> ready at startup time.
>>>
>>> How about using monitor commands to add objects when needed instead?
>>>
>>> That is cleaner because it doesn't introduce a new state (which is only
>>> implemented for nbd).
>>>
>>
>> The problem is that, nbd client is one child of quorum, and quorum must have more
>> than one child. The nbd server is not ready until colo is running.
>
> A monitor command to hot add/remove quorum children solves this problem
> and could also be used in other scenarios (e.g. user decides to take a
> quorum child offline).
>

For replication case, we always do checkpoint again and again after 
migration. If the
disk is not synced before migration, we will use disk mirgation or 
mirror job to sync
it. So we cannot start block replication when migration is running. We 
need that nbd
client is not ready when migration is running, and it is ready between 
migration ends
and checkpoint begins. Using a monotir command add the nbd client will 
cause larger
downtime. So if the nbd client has been added(only not connect to the 
nbd server),
we can connect to nbd server automatically.

Thanks
Wen Congyang
Stefan Hajnoczi June 22, 2015, 12:39 p.m. UTC | #7
On Sat, Jun 20, 2015 at 11:31:52AM +0800, Wen Congyang wrote:
> At 2015/6/19 18:49, Stefan Hajnoczi Wrote:
> >On Fri, Jun 19, 2015 at 08:54:56AM +0800, Wen Congyang wrote:
> >>On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
> >>>On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
> >>>>At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
> >>>>>On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
> >>>>>>+void bdrv_connect(BlockDriverState *bs, Error **errp)
> >>>>>>+{
> >>>>>>+    BlockDriver *drv = bs->drv;
> >>>>>>+
> >>>>>>+    if (drv && drv->bdrv_connect) {
> >>>>>>+        drv->bdrv_connect(bs, errp);
> >>>>>>+    } else if (bs->file) {
> >>>>>>+        bdrv_connect(bs->file, errp);
> >>>>>>+    } else {
> >>>>>>+        error_setg(errp, "this feature or command is not currently supported");
> >>>>>>+    }
> >>>>>>+}
> >>>>>>+
> >>>>>>+void bdrv_disconnect(BlockDriverState *bs)
> >>>>>>+{
> >>>>>>+    BlockDriver *drv = bs->drv;
> >>>>>>+
> >>>>>>+    if (drv && drv->bdrv_disconnect) {
> >>>>>>+        drv->bdrv_disconnect(bs);
> >>>>>>+    } else if (bs->file) {
> >>>>>>+        bdrv_disconnect(bs->file);
> >>>>>>+    }
> >>>>>>+}
> >>>>>
> >>>>>Please add doc comments describing the semantics of these commands.
> >>>>
> >>>>Where should it be documented? In the header file?
> >>>
> >>>block.h doesn't document prototypes in the header file, please document
> >>>the function definition in block.c.  (QEMU is not consistent here, some
> >>>places do it the other way around.)
> >>>
> >>>>>Why are these operations needed when there is already a bs->drv == NULL
> >>>>>case which means the BDS is not ready for read/write?
> >>>>>
> >>>>
> >>>>The purpos is that: don't connect to nbd server when opening a nbd client.
> >>>>connect/disconnect
> >>>>to nbd server when we need to do it.
> >>>>
> >>>>IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
> >>>>connect/disconnect
> >>>>means that connect/disconnect to remote target(The target may be in another
> >>>>host).
> >>>
> >>>Connect/disconnect puts something on the QEMU command-line that isn't
> >>>ready at startup time.
> >>>
> >>>How about using monitor commands to add objects when needed instead?
> >>>
> >>>That is cleaner because it doesn't introduce a new state (which is only
> >>>implemented for nbd).
> >>>
> >>
> >>The problem is that, nbd client is one child of quorum, and quorum must have more
> >>than one child. The nbd server is not ready until colo is running.
> >
> >A monitor command to hot add/remove quorum children solves this problem
> >and could also be used in other scenarios (e.g. user decides to take a
> >quorum child offline).
> >
> 
> For replication case, we always do checkpoint again and again after
> migration. If the
> disk is not synced before migration, we will use disk mirgation or mirror
> job to sync
> it. So we cannot start block replication when migration is running. We need
> that nbd
> client is not ready when migration is running, and it is ready between
> migration ends
> and checkpoint begins. Using a monotir command add the nbd client will cause
> larger
> downtime. So if the nbd client has been added(only not connect to the nbd
> server),
> we can connect to nbd server automatically.

I'm not sure I understand you.  Can you rephrase this?

The NBD connect should not be performed during downtime, regardless of
whether a monitor command is used or not.

Stefan
Wen Congyang June 22, 2015, 1:56 p.m. UTC | #8
At 2015/6/22 20:39, Stefan Hajnoczi Wrote:
> On Sat, Jun 20, 2015 at 11:31:52AM +0800, Wen Congyang wrote:
>> At 2015/6/19 18:49, Stefan Hajnoczi Wrote:
>>> On Fri, Jun 19, 2015 at 08:54:56AM +0800, Wen Congyang wrote:
>>>> On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
>>>>> On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
>>>>>> At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
>>>>>>> On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
>>>>>>>> +void bdrv_connect(BlockDriverState *bs, Error **errp)
>>>>>>>> +{
>>>>>>>> +    BlockDriver *drv = bs->drv;
>>>>>>>> +
>>>>>>>> +    if (drv && drv->bdrv_connect) {
>>>>>>>> +        drv->bdrv_connect(bs, errp);
>>>>>>>> +    } else if (bs->file) {
>>>>>>>> +        bdrv_connect(bs->file, errp);
>>>>>>>> +    } else {
>>>>>>>> +        error_setg(errp, "this feature or command is not currently supported");
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +void bdrv_disconnect(BlockDriverState *bs)
>>>>>>>> +{
>>>>>>>> +    BlockDriver *drv = bs->drv;
>>>>>>>> +
>>>>>>>> +    if (drv && drv->bdrv_disconnect) {
>>>>>>>> +        drv->bdrv_disconnect(bs);
>>>>>>>> +    } else if (bs->file) {
>>>>>>>> +        bdrv_disconnect(bs->file);
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>>
>>>>>>> Please add doc comments describing the semantics of these commands.
>>>>>>
>>>>>> Where should it be documented? In the header file?
>>>>>
>>>>> block.h doesn't document prototypes in the header file, please document
>>>>> the function definition in block.c.  (QEMU is not consistent here, some
>>>>> places do it the other way around.)
>>>>>
>>>>>>> Why are these operations needed when there is already a bs->drv == NULL
>>>>>>> case which means the BDS is not ready for read/write?
>>>>>>>
>>>>>>
>>>>>> The purpos is that: don't connect to nbd server when opening a nbd client.
>>>>>> connect/disconnect
>>>>>> to nbd server when we need to do it.
>>>>>>
>>>>>> IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
>>>>>> connect/disconnect
>>>>>> means that connect/disconnect to remote target(The target may be in another
>>>>>> host).
>>>>>
>>>>> Connect/disconnect puts something on the QEMU command-line that isn't
>>>>> ready at startup time.
>>>>>
>>>>> How about using monitor commands to add objects when needed instead?
>>>>>
>>>>> That is cleaner because it doesn't introduce a new state (which is only
>>>>> implemented for nbd).
>>>>>
>>>>
>>>> The problem is that, nbd client is one child of quorum, and quorum must have more
>>>> than one child. The nbd server is not ready until colo is running.
>>>
>>> A monitor command to hot add/remove quorum children solves this problem
>>> and could also be used in other scenarios (e.g. user decides to take a
>>> quorum child offline).
>>>
>>
>> For replication case, we always do checkpoint again and again after
>> migration. If the
>> disk is not synced before migration, we will use disk mirgation or mirror
>> job to sync
>> it. So we cannot start block replication when migration is running. We need
>> that nbd
>> client is not ready when migration is running, and it is ready between
>> migration ends
>> and checkpoint begins. Using a monotir command add the nbd client will cause
>> larger
>> downtime. So if the nbd client has been added(only not connect to the nbd
>> server),
>> we can connect to nbd server automatically.
>
> I'm not sure I understand you.  Can you rephrase this?

We only use block replication after migration because we assume that the 
disk has been synced
before block replication is started, so we only forward new write 
requests via nbd.

>
> The NBD connect should not be performed during downtime, regardless of
> whether a monitor command is used or not.

Why?

Thanks
Wen Congyang

>
> Stefan
>
Wen Congyang June 23, 2015, 6:44 a.m. UTC | #9
On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
> On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
>> At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
>>> On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
>>>> +void bdrv_connect(BlockDriverState *bs, Error **errp)
>>>> +{
>>>> +    BlockDriver *drv = bs->drv;
>>>> +
>>>> +    if (drv && drv->bdrv_connect) {
>>>> +        drv->bdrv_connect(bs, errp);
>>>> +    } else if (bs->file) {
>>>> +        bdrv_connect(bs->file, errp);
>>>> +    } else {
>>>> +        error_setg(errp, "this feature or command is not currently supported");
>>>> +    }
>>>> +}
>>>> +
>>>> +void bdrv_disconnect(BlockDriverState *bs)
>>>> +{
>>>> +    BlockDriver *drv = bs->drv;
>>>> +
>>>> +    if (drv && drv->bdrv_disconnect) {
>>>> +        drv->bdrv_disconnect(bs);
>>>> +    } else if (bs->file) {
>>>> +        bdrv_disconnect(bs->file);
>>>> +    }
>>>> +}
>>>
>>> Please add doc comments describing the semantics of these commands.
>>
>> Where should it be documented? In the header file?
> 
> block.h doesn't document prototypes in the header file, please document
> the function definition in block.c.  (QEMU is not consistent here, some
> places do it the other way around.)
> 
>>> Why are these operations needed when there is already a bs->drv == NULL
>>> case which means the BDS is not ready for read/write?
>>>
>>
>> The purpos is that: don't connect to nbd server when opening a nbd client.
>> connect/disconnect
>> to nbd server when we need to do it.
>>
>> IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
>> connect/disconnect
>> means that connect/disconnect to remote target(The target may be in another
>> host).
> 
> Connect/disconnect puts something on the QEMU command-line that isn't
> ready at startup time.
> 
> How about using monitor commands to add objects when needed instead?

We have decieded use this way here:
http://lists.gnu.org/archive/html/qemu-devel/2015-04/msg02975.html

Thanks
Wen Congyang

> 
> That is cleaner because it doesn't introduce a new state (which is only
> implemented for nbd).
>
Stefan Hajnoczi June 23, 2015, 1:42 p.m. UTC | #10
On Mon, Jun 22, 2015 at 09:56:24PM +0800, Wen Congyang wrote:
> >The NBD connect should not be performed during downtime, regardless of
> >whether a monitor command is used or not.
> 
> Why?

NBD connection establishment takes unbounded time.  It can extend the
downtime period and cause users to notice that the guest is unavailable.

The point of COLO is fault tolerance and high availability.  In order to
achieve that, COLO must minimize downtime.  If an operation can be
performed outside the migration downtime phase, then it should be done
outside.

Stefan
Wen Congyang June 23, 2015, 1:54 p.m. UTC | #11
At 2015/6/23 21:42, Stefan Hajnoczi Wrote:
> On Mon, Jun 22, 2015 at 09:56:24PM +0800, Wen Congyang wrote:
>>> The NBD connect should not be performed during downtime, regardless of
>>> whether a monitor command is used or not.
>>
>> Why?
>
> NBD connection establishment takes unbounded time.  It can extend the
> downtime period and cause users to notice that the guest is unavailable.
>
> The point of COLO is fault tolerance and high availability.  In order to
> achieve that, COLO must minimize downtime.  If an operation can be
> performed outside the migration downtime phase, then it should be done
> outside.

Yes, you are right. I will think about it.

Thanks
Wen Congyang

>
> Stefan
>
Wen Congyang June 24, 2015, 1:11 a.m. UTC | #12
On 06/19/2015 06:49 PM, Stefan Hajnoczi wrote:
> On Fri, Jun 19, 2015 at 08:54:56AM +0800, Wen Congyang wrote:
>> On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
>>> On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
>>>> At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
>>>>> On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
>>>>>> +void bdrv_connect(BlockDriverState *bs, Error **errp)
>>>>>> +{
>>>>>> +    BlockDriver *drv = bs->drv;
>>>>>> +
>>>>>> +    if (drv && drv->bdrv_connect) {
>>>>>> +        drv->bdrv_connect(bs, errp);
>>>>>> +    } else if (bs->file) {
>>>>>> +        bdrv_connect(bs->file, errp);
>>>>>> +    } else {
>>>>>> +        error_setg(errp, "this feature or command is not currently supported");
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +void bdrv_disconnect(BlockDriverState *bs)
>>>>>> +{
>>>>>> +    BlockDriver *drv = bs->drv;
>>>>>> +
>>>>>> +    if (drv && drv->bdrv_disconnect) {
>>>>>> +        drv->bdrv_disconnect(bs);
>>>>>> +    } else if (bs->file) {
>>>>>> +        bdrv_disconnect(bs->file);
>>>>>> +    }
>>>>>> +}
>>>>>
>>>>> Please add doc comments describing the semantics of these commands.
>>>>
>>>> Where should it be documented? In the header file?
>>>
>>> block.h doesn't document prototypes in the header file, please document
>>> the function definition in block.c.  (QEMU is not consistent here, some
>>> places do it the other way around.)
>>>
>>>>> Why are these operations needed when there is already a bs->drv == NULL
>>>>> case which means the BDS is not ready for read/write?
>>>>>
>>>>
>>>> The purpos is that: don't connect to nbd server when opening a nbd client.
>>>> connect/disconnect
>>>> to nbd server when we need to do it.
>>>>
>>>> IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
>>>> connect/disconnect
>>>> means that connect/disconnect to remote target(The target may be in another
>>>> host).
>>>
>>> Connect/disconnect puts something on the QEMU command-line that isn't
>>> ready at startup time.
>>>
>>> How about using monitor commands to add objects when needed instead?
>>>
>>> That is cleaner because it doesn't introduce a new state (which is only
>>> implemented for nbd).
>>>
>>
>> The problem is that, nbd client is one child of quorum, and quorum must have more
>> than one child. The nbd server is not ready until colo is running.
> 
> A monitor command to hot add/remove quorum children solves this problem
> and could also be used in other scenarios (e.g. user decides to take a
> quorum child offline).
> 


CCing Alberto Garcia for the quorum block driver.
Dr. David Alan Gilbert June 24, 2015, 2:07 p.m. UTC | #13
* Wen Congyang (ghostwcy@gmail.com) wrote:
> At 2015/6/19 18:49, Stefan Hajnoczi Wrote:
> >On Fri, Jun 19, 2015 at 08:54:56AM +0800, Wen Congyang wrote:
> >>On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
> >>>On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
> >>>>At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
> >>>>>On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
> >>>>>>+void bdrv_connect(BlockDriverState *bs, Error **errp)
> >>>>>>+{
> >>>>>>+    BlockDriver *drv = bs->drv;
> >>>>>>+
> >>>>>>+    if (drv && drv->bdrv_connect) {
> >>>>>>+        drv->bdrv_connect(bs, errp);
> >>>>>>+    } else if (bs->file) {
> >>>>>>+        bdrv_connect(bs->file, errp);
> >>>>>>+    } else {
> >>>>>>+        error_setg(errp, "this feature or command is not currently supported");
> >>>>>>+    }
> >>>>>>+}
> >>>>>>+
> >>>>>>+void bdrv_disconnect(BlockDriverState *bs)
> >>>>>>+{
> >>>>>>+    BlockDriver *drv = bs->drv;
> >>>>>>+
> >>>>>>+    if (drv && drv->bdrv_disconnect) {
> >>>>>>+        drv->bdrv_disconnect(bs);
> >>>>>>+    } else if (bs->file) {
> >>>>>>+        bdrv_disconnect(bs->file);
> >>>>>>+    }
> >>>>>>+}
> >>>>>
> >>>>>Please add doc comments describing the semantics of these commands.
> >>>>
> >>>>Where should it be documented? In the header file?
> >>>
> >>>block.h doesn't document prototypes in the header file, please document
> >>>the function definition in block.c.  (QEMU is not consistent here, some
> >>>places do it the other way around.)
> >>>
> >>>>>Why are these operations needed when there is already a bs->drv == NULL
> >>>>>case which means the BDS is not ready for read/write?
> >>>>>
> >>>>
> >>>>The purpos is that: don't connect to nbd server when opening a nbd client.
> >>>>connect/disconnect
> >>>>to nbd server when we need to do it.
> >>>>
> >>>>IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
> >>>>connect/disconnect
> >>>>means that connect/disconnect to remote target(The target may be in another
> >>>>host).
> >>>
> >>>Connect/disconnect puts something on the QEMU command-line that isn't
> >>>ready at startup time.
> >>>
> >>>How about using monitor commands to add objects when needed instead?
> >>>
> >>>That is cleaner because it doesn't introduce a new state (which is only
> >>>implemented for nbd).
> >>>
> >>
> >>The problem is that, nbd client is one child of quorum, and quorum must have more
> >>than one child. The nbd server is not ready until colo is running.
> >
> >A monitor command to hot add/remove quorum children solves this problem
> >and could also be used in other scenarios (e.g. user decides to take a
> >quorum child offline).
> >
> 
> For replication case, we always do checkpoint again and again after
> migration. If the disk is not synced before migration, we will use disk mirgation or mirror
> job to sync it.

Can you document the way that you use disk migration or mirror, together
with your COLO setup?   I think it would make it easier to understand this
restriction.
At the moment I don't understand how you can switch from doing a disk migration
into COLO mode - there seems to be a gap between the end of disk migration and the start
of COLO.

> So we cannot start block replication when migration is running. We need
> that nbd
> client is not ready when migration is running, and it is ready between
> migration ends
> and checkpoint begins. Using a monotir command add the nbd client will cause
> larger
> downtime. So if the nbd client has been added(only not connect to the nbd
> server),
> we can connect to nbd server automatically.

Without the disk migration or mirror, I can't see the need for the delayed bdrv_connect.
I can see that you want to do a disconnect at failover, however you need
to take care because if the network is broken at the point of failover
you need to make sure nothing blocks.

Dave

> 
> Thanks
> Wen Congyang
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Wen Congyang June 25, 2015, 1:01 a.m. UTC | #14
On 06/24/2015 10:07 PM, Dr. David Alan Gilbert wrote:
> * Wen Congyang (ghostwcy@gmail.com) wrote:
>> At 2015/6/19 18:49, Stefan Hajnoczi Wrote:
>>> On Fri, Jun 19, 2015 at 08:54:56AM +0800, Wen Congyang wrote:
>>>> On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
>>>>> On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
>>>>>> At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
>>>>>>> On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
>>>>>>>> +void bdrv_connect(BlockDriverState *bs, Error **errp)
>>>>>>>> +{
>>>>>>>> +    BlockDriver *drv = bs->drv;
>>>>>>>> +
>>>>>>>> +    if (drv && drv->bdrv_connect) {
>>>>>>>> +        drv->bdrv_connect(bs, errp);
>>>>>>>> +    } else if (bs->file) {
>>>>>>>> +        bdrv_connect(bs->file, errp);
>>>>>>>> +    } else {
>>>>>>>> +        error_setg(errp, "this feature or command is not currently supported");
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +void bdrv_disconnect(BlockDriverState *bs)
>>>>>>>> +{
>>>>>>>> +    BlockDriver *drv = bs->drv;
>>>>>>>> +
>>>>>>>> +    if (drv && drv->bdrv_disconnect) {
>>>>>>>> +        drv->bdrv_disconnect(bs);
>>>>>>>> +    } else if (bs->file) {
>>>>>>>> +        bdrv_disconnect(bs->file);
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>>
>>>>>>> Please add doc comments describing the semantics of these commands.
>>>>>>
>>>>>> Where should it be documented? In the header file?
>>>>>
>>>>> block.h doesn't document prototypes in the header file, please document
>>>>> the function definition in block.c.  (QEMU is not consistent here, some
>>>>> places do it the other way around.)
>>>>>
>>>>>>> Why are these operations needed when there is already a bs->drv == NULL
>>>>>>> case which means the BDS is not ready for read/write?
>>>>>>>
>>>>>>
>>>>>> The purpos is that: don't connect to nbd server when opening a nbd client.
>>>>>> connect/disconnect
>>>>>> to nbd server when we need to do it.
>>>>>>
>>>>>> IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
>>>>>> connect/disconnect
>>>>>> means that connect/disconnect to remote target(The target may be in another
>>>>>> host).
>>>>>
>>>>> Connect/disconnect puts something on the QEMU command-line that isn't
>>>>> ready at startup time.
>>>>>
>>>>> How about using monitor commands to add objects when needed instead?
>>>>>
>>>>> That is cleaner because it doesn't introduce a new state (which is only
>>>>> implemented for nbd).
>>>>>
>>>>
>>>> The problem is that, nbd client is one child of quorum, and quorum must have more
>>>> than one child. The nbd server is not ready until colo is running.
>>>
>>> A monitor command to hot add/remove quorum children solves this problem
>>> and could also be used in other scenarios (e.g. user decides to take a
>>> quorum child offline).
>>>
>>
>> For replication case, we always do checkpoint again and again after
>> migration. If the disk is not synced before migration, we will use disk mirgation or mirror
>> job to sync it.
> 
> Can you document the way that you use disk migration or mirror, together
> with your COLO setup?   I think it would make it easier to understand this
> restriction.
> At the moment I don't understand how you can switch from doing a disk migration
> into COLO mode - there seems to be a gap between the end of disk migration and the start
> of COLO.

In my test, I use disk migration.

After migration and before starting COLO, we will disable disk migration:
+    /* Disable block migration */
+    s->params.blk = 0;
+    s->params.shared = 0;
+    qemu_savevm_state_begin(trans, &s->params);

If the user uses mirror job, we don't cancel the mirror job now.

> 
>> So we cannot start block replication when migration is running. We need
>> that nbd
>> client is not ready when migration is running, and it is ready between
>> migration ends
>> and checkpoint begins. Using a monotir command add the nbd client will cause
>> larger
>> downtime. So if the nbd client has been added(only not connect to the nbd
>> server),
>> we can connect to nbd server automatically.
> 
> Without the disk migration or mirror, I can't see the need for the delayed bdrv_connect.
> I can see that you want to do a disconnect at failover, however you need
> to take care because if the network is broken at the point of failover
> you need to make sure nothing blocks.

disk migration is needed if the disk is not synced before migration.

Thanks
Wen Congyang

> 
> Dave
> 
>>
>> Thanks
>> Wen Congyang
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> .
>
Dr. David Alan Gilbert June 26, 2015, 7:03 p.m. UTC | #15
* Wen Congyang (wency@cn.fujitsu.com) wrote:
> On 06/24/2015 10:07 PM, Dr. David Alan Gilbert wrote:
> > * Wen Congyang (ghostwcy@gmail.com) wrote:
> >> At 2015/6/19 18:49, Stefan Hajnoczi Wrote:
> >>> On Fri, Jun 19, 2015 at 08:54:56AM +0800, Wen Congyang wrote:
> >>>> On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
> >>>>> On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
> >>>>>> At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
> >>>>>>> On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
> >>>>>>>> +void bdrv_connect(BlockDriverState *bs, Error **errp)
> >>>>>>>> +{
> >>>>>>>> +    BlockDriver *drv = bs->drv;
> >>>>>>>> +
> >>>>>>>> +    if (drv && drv->bdrv_connect) {
> >>>>>>>> +        drv->bdrv_connect(bs, errp);
> >>>>>>>> +    } else if (bs->file) {
> >>>>>>>> +        bdrv_connect(bs->file, errp);
> >>>>>>>> +    } else {
> >>>>>>>> +        error_setg(errp, "this feature or command is not currently supported");
> >>>>>>>> +    }
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +void bdrv_disconnect(BlockDriverState *bs)
> >>>>>>>> +{
> >>>>>>>> +    BlockDriver *drv = bs->drv;
> >>>>>>>> +
> >>>>>>>> +    if (drv && drv->bdrv_disconnect) {
> >>>>>>>> +        drv->bdrv_disconnect(bs);
> >>>>>>>> +    } else if (bs->file) {
> >>>>>>>> +        bdrv_disconnect(bs->file);
> >>>>>>>> +    }
> >>>>>>>> +}
> >>>>>>>
> >>>>>>> Please add doc comments describing the semantics of these commands.
> >>>>>>
> >>>>>> Where should it be documented? In the header file?
> >>>>>
> >>>>> block.h doesn't document prototypes in the header file, please document
> >>>>> the function definition in block.c.  (QEMU is not consistent here, some
> >>>>> places do it the other way around.)
> >>>>>
> >>>>>>> Why are these operations needed when there is already a bs->drv == NULL
> >>>>>>> case which means the BDS is not ready for read/write?
> >>>>>>>
> >>>>>>
> >>>>>> The purpos is that: don't connect to nbd server when opening a nbd client.
> >>>>>> connect/disconnect
> >>>>>> to nbd server when we need to do it.
> >>>>>>
> >>>>>> IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
> >>>>>> connect/disconnect
> >>>>>> means that connect/disconnect to remote target(The target may be in another
> >>>>>> host).
> >>>>>
> >>>>> Connect/disconnect puts something on the QEMU command-line that isn't
> >>>>> ready at startup time.
> >>>>>
> >>>>> How about using monitor commands to add objects when needed instead?
> >>>>>
> >>>>> That is cleaner because it doesn't introduce a new state (which is only
> >>>>> implemented for nbd).
> >>>>>
> >>>>
> >>>> The problem is that, nbd client is one child of quorum, and quorum must have more
> >>>> than one child. The nbd server is not ready until colo is running.
> >>>
> >>> A monitor command to hot add/remove quorum children solves this problem
> >>> and could also be used in other scenarios (e.g. user decides to take a
> >>> quorum child offline).
> >>>
> >>
> >> For replication case, we always do checkpoint again and again after
> >> migration. If the disk is not synced before migration, we will use disk mirgation or mirror
> >> job to sync it.
> > 
> > Can you document the way that you use disk migration or mirror, together
> > with your COLO setup?   I think it would make it easier to understand this
> > restriction.
> > At the moment I don't understand how you can switch from doing a disk migration
> > into COLO mode - there seems to be a gap between the end of disk migration and the start
> > of COLO.
> 
> In my test, I use disk migration.
> 
> After migration and before starting COLO, we will disable disk migration:
> +    /* Disable block migration */
> +    s->params.blk = 0;
> +    s->params.shared = 0;
> +    qemu_savevm_state_begin(trans, &s->params);

Ah, I hadn't realised you could do that; so do you just do:

migrate_set_parameter colo on
migrate -d -b tcp:otherhhost:port

How does the secondary know to feed that data straight into the disk without
recording all the old data into the hidden-disk ?

> If the user uses mirror job, we don't cancel the mirror job now.

It would be good to get it to work with mirror, that seems preferred these
days to the old block migration.

With block migration or mirror, then that pretty much allows you to replace
a failed secondary doesn't it without restarting?  The only thing stopping
you is the NBD client needs to point to the address of the new secondary.
Stefan's suggestion of being able to modify the quorum on the fly would seem
to fix that problem.
(The other thing I can see is that the secondary wouldn't have the conntrack
data for open connections; but that's a separate problem).

A related topic was that a colleague was asking about starting the qemu
up and only then setting the NBD target (he's trying to wire it into OpenStack);
I suggested that instead of specifying the drive on the command line, it
might work to use a drive_add to add the whole quorum/drive stack before starting
the guest; however again something to modify the quorum would be easier.

Finally the last good reason I can see for being able to modify the quorum on the fly
is that you'll need it for continuous ft to be able to transmute a secondary into
a new primary.

Dave

> 
> > 
> >> So we cannot start block replication when migration is running. We need
> >> that nbd
> >> client is not ready when migration is running, and it is ready between
> >> migration ends
> >> and checkpoint begins. Using a monotir command add the nbd client will cause
> >> larger
> >> downtime. So if the nbd client has been added(only not connect to the nbd
> >> server),
> >> we can connect to nbd server automatically.
> > 
> > Without the disk migration or mirror, I can't see the need for the delayed bdrv_connect.
> > I can see that you want to do a disconnect at failover, however you need
> > to take care because if the network is broken at the point of failover
> > you need to make sure nothing blocks.
> 
> disk migration is needed if the disk is not synced before migration.
> 
> Thanks
> Wen Congyang
> 
> > 
> > Dave
> > 
> >>
> >> Thanks
> >> Wen Congyang
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > .
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Wen Congyang June 29, 2015, 1:05 a.m. UTC | #16
On 06/27/2015 03:03 AM, Dr. David Alan Gilbert wrote:
> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>> On 06/24/2015 10:07 PM, Dr. David Alan Gilbert wrote:
>>> * Wen Congyang (ghostwcy@gmail.com) wrote:
>>>> At 2015/6/19 18:49, Stefan Hajnoczi Wrote:
>>>>> On Fri, Jun 19, 2015 at 08:54:56AM +0800, Wen Congyang wrote:
>>>>>> On 06/19/2015 12:06 AM, Stefan Hajnoczi wrote:
>>>>>>> On Thu, Jun 18, 2015 at 10:36:39PM +0800, Wen Congyang wrote:
>>>>>>>> At 2015/6/18 20:55, Stefan Hajnoczi Wrote:
>>>>>>>>> On Thu, Jun 18, 2015 at 04:49:12PM +0800, Wen Congyang wrote:
>>>>>>>>>> +void bdrv_connect(BlockDriverState *bs, Error **errp)
>>>>>>>>>> +{
>>>>>>>>>> +    BlockDriver *drv = bs->drv;
>>>>>>>>>> +
>>>>>>>>>> +    if (drv && drv->bdrv_connect) {
>>>>>>>>>> +        drv->bdrv_connect(bs, errp);
>>>>>>>>>> +    } else if (bs->file) {
>>>>>>>>>> +        bdrv_connect(bs->file, errp);
>>>>>>>>>> +    } else {
>>>>>>>>>> +        error_setg(errp, "this feature or command is not currently supported");
>>>>>>>>>> +    }
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +void bdrv_disconnect(BlockDriverState *bs)
>>>>>>>>>> +{
>>>>>>>>>> +    BlockDriver *drv = bs->drv;
>>>>>>>>>> +
>>>>>>>>>> +    if (drv && drv->bdrv_disconnect) {
>>>>>>>>>> +        drv->bdrv_disconnect(bs);
>>>>>>>>>> +    } else if (bs->file) {
>>>>>>>>>> +        bdrv_disconnect(bs->file);
>>>>>>>>>> +    }
>>>>>>>>>> +}
>>>>>>>>>
>>>>>>>>> Please add doc comments describing the semantics of these commands.
>>>>>>>>
>>>>>>>> Where should it be documented? In the header file?
>>>>>>>
>>>>>>> block.h doesn't document prototypes in the header file, please document
>>>>>>> the function definition in block.c.  (QEMU is not consistent here, some
>>>>>>> places do it the other way around.)
>>>>>>>
>>>>>>>>> Why are these operations needed when there is already a bs->drv == NULL
>>>>>>>>> case which means the BDS is not ready for read/write?
>>>>>>>>>
>>>>>>>>
>>>>>>>> The purpos is that: don't connect to nbd server when opening a nbd client.
>>>>>>>> connect/disconnect
>>>>>>>> to nbd server when we need to do it.
>>>>>>>>
>>>>>>>> IIUC, if bs->drv is NULL, it means that the driver is ejected? Here,
>>>>>>>> connect/disconnect
>>>>>>>> means that connect/disconnect to remote target(The target may be in another
>>>>>>>> host).
>>>>>>>
>>>>>>> Connect/disconnect puts something on the QEMU command-line that isn't
>>>>>>> ready at startup time.
>>>>>>>
>>>>>>> How about using monitor commands to add objects when needed instead?
>>>>>>>
>>>>>>> That is cleaner because it doesn't introduce a new state (which is only
>>>>>>> implemented for nbd).
>>>>>>>
>>>>>>
>>>>>> The problem is that, nbd client is one child of quorum, and quorum must have more
>>>>>> than one child. The nbd server is not ready until colo is running.
>>>>>
>>>>> A monitor command to hot add/remove quorum children solves this problem
>>>>> and could also be used in other scenarios (e.g. user decides to take a
>>>>> quorum child offline).
>>>>>
>>>>
>>>> For replication case, we always do checkpoint again and again after
>>>> migration. If the disk is not synced before migration, we will use disk mirgation or mirror
>>>> job to sync it.
>>>
>>> Can you document the way that you use disk migration or mirror, together
>>> with your COLO setup?   I think it would make it easier to understand this
>>> restriction.
>>> At the moment I don't understand how you can switch from doing a disk migration
>>> into COLO mode - there seems to be a gap between the end of disk migration and the start
>>> of COLO.
>>
>> In my test, I use disk migration.
>>
>> After migration and before starting COLO, we will disable disk migration:
>> +    /* Disable block migration */
>> +    s->params.blk = 0;
>> +    s->params.shared = 0;
>> +    qemu_savevm_state_begin(trans, &s->params);
> 
> Ah, I hadn't realised you could do that; so do you just do:
> 
> migrate_set_parameter colo on
> migrate -d -b tcp:otherhhost:port
> 
> How does the secondary know to feed that data straight into the disk without
> recording all the old data into the hidden-disk ?

hidden disk and active disk will be made empty when starting block replication.

> 
>> If the user uses mirror job, we don't cancel the mirror job now.
> 
> It would be good to get it to work with mirror, that seems preferred these
> days to the old block migration.

In normal migration, is mirror job created and cancelled by libvirt?

> 
> With block migration or mirror, then that pretty much allows you to replace
> a failed secondary doesn't it without restarting?  The only thing stopping
> you is the NBD client needs to point to the address of the new secondary.
> Stefan's suggestion of being able to modify the quorum on the fly would seem
> to fix that problem.

Yes, I am implementing adding/removing quorum child now.

> (The other thing I can see is that the secondary wouldn't have the conntrack
> data for open connections; but that's a separate problem).
> 
> A related topic was that a colleague was asking about starting the qemu
> up and only then setting the NBD target (he's trying to wire it into OpenStack);
> I suggested that instead of specifying the drive on the command line, it
> might work to use a drive_add to add the whole quorum/drive stack before starting
> the guest; however again something to modify the quorum would be easier.
> 
> Finally the last good reason I can see for being able to modify the quorum on the fly
> is that you'll need it for continuous ft to be able to transmute a secondary into
> a new primary.

Yes, I think adding a filter on the top is also needed.

Thanks
Wen Congyang

> 
> Dave
> 
>>
>>>
>>>> So we cannot start block replication when migration is running. We need
>>>> that nbd
>>>> client is not ready when migration is running, and it is ready between
>>>> migration ends
>>>> and checkpoint begins. Using a monotir command add the nbd client will cause
>>>> larger
>>>> downtime. So if the nbd client has been added(only not connect to the nbd
>>>> server),
>>>> we can connect to nbd server automatically.
>>>
>>> Without the disk migration or mirror, I can't see the need for the delayed bdrv_connect.
>>> I can see that you want to do a disconnect at failover, however you need
>>> to take care because if the network is broken at the point of failover
>>> you need to make sure nothing blocks.
>>
>> disk migration is needed if the disk is not synced before migration.
>>
>> Thanks
>> Wen Congyang
>>
>>>
>>> Dave
>>>
>>>>
>>>> Thanks
>>>> Wen Congyang
>>> --
>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>> .
>>>
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> .
>
Dr. David Alan Gilbert June 30, 2015, 7:01 p.m. UTC | #17
* Wen Congyang (wency@cn.fujitsu.com) wrote:
> On 06/27/2015 03:03 AM, Dr. David Alan Gilbert wrote:

<snip>

> > Ah, I hadn't realised you could do that; so do you just do:
> > 
> > migrate_set_parameter colo on
> > migrate -d -b tcp:otherhhost:port
> > 
> > How does the secondary know to feed that data straight into the disk without
> > recording all the old data into the hidden-disk ?
> 
> hidden disk and active disk will be made empty when starting block replication.

Hmm, yes - I think I need to update to your current world; in the version
from the end of May, I get a 'error while loading state for instance 0x0 of device 'block''
if I try to use migrate -d -b 
(the bdrv_write fails)

> >> If the user uses mirror job, we don't cancel the mirror job now.
> > 
> > It would be good to get it to work with mirror, that seems preferred these
> > days to the old block migration.
> 
> In normal migration, is mirror job created and cancelled by libvirt?

Yes, I think so; you should be able to turn on full logging on libvirt and
watch the qmp commands it sends.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Wen Congyang July 1, 2015, 1:01 a.m. UTC | #18
On 07/01/2015 03:01 AM, Dr. David Alan Gilbert wrote:
> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>> On 06/27/2015 03:03 AM, Dr. David Alan Gilbert wrote:
> 
> <snip>
> 
>>> Ah, I hadn't realised you could do that; so do you just do:
>>>
>>> migrate_set_parameter colo on
>>> migrate -d -b tcp:otherhhost:port
>>>
>>> How does the secondary know to feed that data straight into the disk without
>>> recording all the old data into the hidden-disk ?
>>
>> hidden disk and active disk will be made empty when starting block replication.
> 
> Hmm, yes - I think I need to update to your current world; in the version
> from the end of May, I get a 'error while loading state for instance 0x0 of device 'block''
> if I try to use migrate -d -b 
> (the bdrv_write fails)

Can you give me both primary and secondary qemu's command? I think the command line is wrong,
and disk migration fails.

> 
>>>> If the user uses mirror job, we don't cancel the mirror job now.
>>>
>>> It would be good to get it to work with mirror, that seems preferred these
>>> days to the old block migration.
>>
>> In normal migration, is mirror job created and cancelled by libvirt?
> 
> Yes, I think so; you should be able to turn on full logging on libvirt and
> watch the qmp commands it sends.

Supporting mirror job in my TODO list now. But I think we should focus the basci function now.

Thanks
Wen Congyang

> 
> Dave
> 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> .
>
Dr. David Alan Gilbert July 1, 2015, 8:11 a.m. UTC | #19
* Wen Congyang (wency@cn.fujitsu.com) wrote:
> On 07/01/2015 03:01 AM, Dr. David Alan Gilbert wrote:
> > * Wen Congyang (wency@cn.fujitsu.com) wrote:
> >> On 06/27/2015 03:03 AM, Dr. David Alan Gilbert wrote:
> > 
> > <snip>
> > 
> >>> Ah, I hadn't realised you could do that; so do you just do:
> >>>
> >>> migrate_set_parameter colo on
> >>> migrate -d -b tcp:otherhhost:port
> >>>
> >>> How does the secondary know to feed that data straight into the disk without
> >>> recording all the old data into the hidden-disk ?
> >>
> >> hidden disk and active disk will be made empty when starting block replication.
> > 
> > Hmm, yes - I think I need to update to your current world; in the version
> > from the end of May, I get a 'error while loading state for instance 0x0 of device 'block''
> > if I try to use migrate -d -b 
> > (the bdrv_write fails)
> 
> Can you give me both primary and secondary qemu's command? I think the command line is wrong,
> and disk migration fails.
> 

Primary:

./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
     -boot c -m 4096 -smp 4 -S \
     -name debug-threads=on -trace events=trace-file \
     -netdev tap,id=hn0,script=$PWD/ifup-prim,\
downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
     -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
     -drive if=virtio,driver=quorum,read-pattern=fifo,no-connect=on,\
cache=none,aio=native,\
children.0.file.filename=./bugzilla.raw,\
children.0.driver=raw,\
children.1.file.driver=nbd,\
children.1.file.host=ibpair,\
children.1.file.port=8889,\
children.1.file.export=colo1,\
children.1.driver=replication,\
children.1.mode=primary,\
children.1.ignore-errors=on


Secondary:

./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
     -boot c -m 4096 -smp 4 -S \
     -name debug-threads=on -trace events=trace-file \
     -netdev tap,id=hn0,script=$PWD/ifup-slave,\
downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
     -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
     -drive if=none,driver=raw,file=bugzilla.raw,id=nbd_target1,cache=none,aio=native \
     -drive if=virtio,driver=replication,mode=secondary,export=colo1,throttling.bps-total-max=70000000,\
file.file.filename=/run/colo-active-disk.qcow2,\
file.driver=qcow2,\
file.backing_reference.drive_id=nbd_target1,\
file.backing_reference.hidden-disk.file.filename=/run/colo-hidden-disk.qcow2,\
file.backing_reference.hidden-disk.driver=qcow2,\
file.backing_reference.hidden-disk.allow-write-backing-file=on \
     -incoming tcp:0:8888


Thanks,

Dave

> >>>> If the user uses mirror job, we don't cancel the mirror job now.
> >>>
> >>> It would be good to get it to work with mirror, that seems preferred these
> >>> days to the old block migration.
> >>
> >> In normal migration, is mirror job created and cancelled by libvirt?
> > 
> > Yes, I think so; you should be able to turn on full logging on libvirt and
> > watch the qmp commands it sends.
> 
> Supporting mirror job in my TODO list now. But I think we should focus the basci function now.
> 
> Thanks
> Wen Congyang
> 
> > 
> > Dave
> > 
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > .
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Wen Congyang July 1, 2015, 8:18 a.m. UTC | #20
On 07/01/2015 04:11 PM, Dr. David Alan Gilbert wrote:
> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>> On 07/01/2015 03:01 AM, Dr. David Alan Gilbert wrote:
>>> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>>>> On 06/27/2015 03:03 AM, Dr. David Alan Gilbert wrote:
>>>
>>> <snip>
>>>
>>>>> Ah, I hadn't realised you could do that; so do you just do:
>>>>>
>>>>> migrate_set_parameter colo on
>>>>> migrate -d -b tcp:otherhhost:port
>>>>>
>>>>> How does the secondary know to feed that data straight into the disk without
>>>>> recording all the old data into the hidden-disk ?
>>>>
>>>> hidden disk and active disk will be made empty when starting block replication.
>>>
>>> Hmm, yes - I think I need to update to your current world; in the version
>>> from the end of May, I get a 'error while loading state for instance 0x0 of device 'block''
>>> if I try to use migrate -d -b 
>>> (the bdrv_write fails)
>>
>> Can you give me both primary and secondary qemu's command? I think the command line is wrong,
>> and disk migration fails.
>>
> 
> Primary:
> 
> ./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
>      -boot c -m 4096 -smp 4 -S \
>      -name debug-threads=on -trace events=trace-file \
>      -netdev tap,id=hn0,script=$PWD/ifup-prim,\
> downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
>      -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
>      -drive if=virtio,driver=quorum,read-pattern=fifo,no-connect=on,\
> cache=none,aio=native,\
> children.0.file.filename=./bugzilla.raw,\
> children.0.driver=raw,\
> children.1.file.driver=nbd,\
> children.1.file.host=ibpair,\
> children.1.file.port=8889,\
> children.1.file.export=colo1,\
> children.1.driver=replication,\
> children.1.mode=primary,\
> children.1.ignore-errors=on

Add id=nbd_target1 to primary disk option, and try it. Disk migration needs the same id to sync
the disk.

Thanks
Wen Congyang

> 
> 
> Secondary:
> 
> ./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
>      -boot c -m 4096 -smp 4 -S \
>      -name debug-threads=on -trace events=trace-file \
>      -netdev tap,id=hn0,script=$PWD/ifup-slave,\
> downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
>      -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
>      -drive if=none,driver=raw,file=bugzilla.raw,id=nbd_target1,cache=none,aio=native \
>      -drive if=virtio,driver=replication,mode=secondary,export=colo1,throttling.bps-total-max=70000000,\
> file.file.filename=/run/colo-active-disk.qcow2,\
> file.driver=qcow2,\
> file.backing_reference.drive_id=nbd_target1,\
> file.backing_reference.hidden-disk.file.filename=/run/colo-hidden-disk.qcow2,\
> file.backing_reference.hidden-disk.driver=qcow2,\
> file.backing_reference.hidden-disk.allow-write-backing-file=on \
>      -incoming tcp:0:8888
> 
> 
> Thanks,
> 
> Dave
> 
>>>>>> If the user uses mirror job, we don't cancel the mirror job now.
>>>>>
>>>>> It would be good to get it to work with mirror, that seems preferred these
>>>>> days to the old block migration.
>>>>
>>>> In normal migration, is mirror job created and cancelled by libvirt?
>>>
>>> Yes, I think so; you should be able to turn on full logging on libvirt and
>>> watch the qmp commands it sends.
>>
>> Supporting mirror job in my TODO list now. But I think we should focus the basci function now.
>>
>> Thanks
>> Wen Congyang
>>
>>>
>>> Dave
>>>
>>> --
>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>> .
>>>
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> .
>
Dr. David Alan Gilbert July 1, 2015, 6:42 p.m. UTC | #21
* Wen Congyang (wency@cn.fujitsu.com) wrote:
> On 07/01/2015 04:11 PM, Dr. David Alan Gilbert wrote:
> > * Wen Congyang (wency@cn.fujitsu.com) wrote:
> >> On 07/01/2015 03:01 AM, Dr. David Alan Gilbert wrote:
> >>> * Wen Congyang (wency@cn.fujitsu.com) wrote:
> >>>> On 06/27/2015 03:03 AM, Dr. David Alan Gilbert wrote:
> >>>
> >>> <snip>
> >>>
> >>>>> Ah, I hadn't realised you could do that; so do you just do:
> >>>>>
> >>>>> migrate_set_parameter colo on
> >>>>> migrate -d -b tcp:otherhhost:port
> >>>>>
> >>>>> How does the secondary know to feed that data straight into the disk without
> >>>>> recording all the old data into the hidden-disk ?
> >>>>
> >>>> hidden disk and active disk will be made empty when starting block replication.
> >>>
> >>> Hmm, yes - I think I need to update to your current world; in the version
> >>> from the end of May, I get a 'error while loading state for instance 0x0 of device 'block''
> >>> if I try to use migrate -d -b 
> >>> (the bdrv_write fails)
> >>
> >> Can you give me both primary and secondary qemu's command? I think the command line is wrong,
> >> and disk migration fails.
> >>
> > 
> > Primary:
> > 
> > ./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
> >      -boot c -m 4096 -smp 4 -S \
> >      -name debug-threads=on -trace events=trace-file \
> >      -netdev tap,id=hn0,script=$PWD/ifup-prim,\
> > downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
> >      -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
> >      -drive if=virtio,driver=quorum,read-pattern=fifo,no-connect=on,\
> > cache=none,aio=native,\
> > children.0.file.filename=./bugzilla.raw,\
> > children.0.driver=raw,\
> > children.1.file.driver=nbd,\
> > children.1.file.host=ibpair,\
> > children.1.file.port=8889,\
> > children.1.file.export=colo1,\
> > children.1.driver=replication,\
> > children.1.mode=primary,\
> > children.1.ignore-errors=on
> 
> Add id=nbd_target1 to primary disk option, and try it. Disk migration needs the same id to sync
> the disk.

Thank you! That worked nicely.
The only odd thing was that the hidden disk on the secondary went upto ~2GB in size during
the disk copy; (This is still on the version you posted at the end of may).
I don't really understand why it was 2GB - the disk was 40GB, and qemu-img tells me that
2.6GB of it were used.   Still, it would be good to avoid the overhead of going through
the hidden disk on the secondary for the initial replication.

Dave

> Thanks
> Wen Congyang
> 
> > 
> > 
> > Secondary:
> > 
> > ./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
> >      -boot c -m 4096 -smp 4 -S \
> >      -name debug-threads=on -trace events=trace-file \
> >      -netdev tap,id=hn0,script=$PWD/ifup-slave,\
> > downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
> >      -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
> >      -drive if=none,driver=raw,file=bugzilla.raw,id=nbd_target1,cache=none,aio=native \
> >      -drive if=virtio,driver=replication,mode=secondary,export=colo1,throttling.bps-total-max=70000000,\
> > file.file.filename=/run/colo-active-disk.qcow2,\
> > file.driver=qcow2,\
> > file.backing_reference.drive_id=nbd_target1,\
> > file.backing_reference.hidden-disk.file.filename=/run/colo-hidden-disk.qcow2,\
> > file.backing_reference.hidden-disk.driver=qcow2,\
> > file.backing_reference.hidden-disk.allow-write-backing-file=on \
> >      -incoming tcp:0:8888
> > 
> > 
> > Thanks,
> > 
> > Dave
> > 
> >>>>>> If the user uses mirror job, we don't cancel the mirror job now.
> >>>>>
> >>>>> It would be good to get it to work with mirror, that seems preferred these
> >>>>> days to the old block migration.
> >>>>
> >>>> In normal migration, is mirror job created and cancelled by libvirt?
> >>>
> >>> Yes, I think so; you should be able to turn on full logging on libvirt and
> >>> watch the qmp commands it sends.
> >>
> >> Supporting mirror job in my TODO list now. But I think we should focus the basci function now.
> >>
> >> Thanks
> >> Wen Congyang
> >>
> >>>
> >>> Dave
> >>>
> >>> --
> >>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> >>> .
> >>>
> >>
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > .
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Wen Congyang July 2, 2015, 12:55 a.m. UTC | #22
On 07/02/2015 02:42 AM, Dr. David Alan Gilbert wrote:
> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>> On 07/01/2015 04:11 PM, Dr. David Alan Gilbert wrote:
>>> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>>>> On 07/01/2015 03:01 AM, Dr. David Alan Gilbert wrote:
>>>>> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>>>>>> On 06/27/2015 03:03 AM, Dr. David Alan Gilbert wrote:
>>>>>
>>>>> <snip>
>>>>>
>>>>>>> Ah, I hadn't realised you could do that; so do you just do:
>>>>>>>
>>>>>>> migrate_set_parameter colo on
>>>>>>> migrate -d -b tcp:otherhhost:port
>>>>>>>
>>>>>>> How does the secondary know to feed that data straight into the disk without
>>>>>>> recording all the old data into the hidden-disk ?
>>>>>>
>>>>>> hidden disk and active disk will be made empty when starting block replication.
>>>>>
>>>>> Hmm, yes - I think I need to update to your current world; in the version
>>>>> from the end of May, I get a 'error while loading state for instance 0x0 of device 'block''
>>>>> if I try to use migrate -d -b 
>>>>> (the bdrv_write fails)
>>>>
>>>> Can you give me both primary and secondary qemu's command? I think the command line is wrong,
>>>> and disk migration fails.
>>>>
>>>
>>> Primary:
>>>
>>> ./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
>>>      -boot c -m 4096 -smp 4 -S \
>>>      -name debug-threads=on -trace events=trace-file \
>>>      -netdev tap,id=hn0,script=$PWD/ifup-prim,\
>>> downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
>>>      -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
>>>      -drive if=virtio,driver=quorum,read-pattern=fifo,no-connect=on,\
>>> cache=none,aio=native,\
>>> children.0.file.filename=./bugzilla.raw,\
>>> children.0.driver=raw,\
>>> children.1.file.driver=nbd,\
>>> children.1.file.host=ibpair,\
>>> children.1.file.port=8889,\
>>> children.1.file.export=colo1,\
>>> children.1.driver=replication,\
>>> children.1.mode=primary,\
>>> children.1.ignore-errors=on
>>
>> Add id=nbd_target1 to primary disk option, and try it. Disk migration needs the same id to sync
>> the disk.
> 
> Thank you! That worked nicely.
> The only odd thing was that the hidden disk on the secondary went upto ~2GB in size during
> the disk copy; (This is still on the version you posted at the end of may).
> I don't really understand why it was 2GB - the disk was 40GB, and qemu-img tells me that
> 2.6GB of it were used.   Still, it would be good to avoid the overhead of going through
> the hidden disk on the secondary for the initial replication.

Yes, I have fixed it in v7: starting backup job later.

Thanks
Wen Congyang

> 
> Dave
> 
>> Thanks
>> Wen Congyang
>>
>>>
>>>
>>> Secondary:
>>>
>>> ./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
>>>      -boot c -m 4096 -smp 4 -S \
>>>      -name debug-threads=on -trace events=trace-file \
>>>      -netdev tap,id=hn0,script=$PWD/ifup-slave,\
>>> downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
>>>      -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
>>>      -drive if=none,driver=raw,file=bugzilla.raw,id=nbd_target1,cache=none,aio=native \
>>>      -drive if=virtio,driver=replication,mode=secondary,export=colo1,throttling.bps-total-max=70000000,\
>>> file.file.filename=/run/colo-active-disk.qcow2,\
>>> file.driver=qcow2,\
>>> file.backing_reference.drive_id=nbd_target1,\
>>> file.backing_reference.hidden-disk.file.filename=/run/colo-hidden-disk.qcow2,\
>>> file.backing_reference.hidden-disk.driver=qcow2,\
>>> file.backing_reference.hidden-disk.allow-write-backing-file=on \
>>>      -incoming tcp:0:8888
>>>
>>>
>>> Thanks,
>>>
>>> Dave
>>>
>>>>>>>> If the user uses mirror job, we don't cancel the mirror job now.
>>>>>>>
>>>>>>> It would be good to get it to work with mirror, that seems preferred these
>>>>>>> days to the old block migration.
>>>>>>
>>>>>> In normal migration, is mirror job created and cancelled by libvirt?
>>>>>
>>>>> Yes, I think so; you should be able to turn on full logging on libvirt and
>>>>> watch the qmp commands it sends.
>>>>
>>>> Supporting mirror job in my TODO list now. But I think we should focus the basci function now.
>>>>
>>>> Thanks
>>>> Wen Congyang
>>>>
>>>>>
>>>>> Dave
>>>>>
>>>>> --
>>>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>>>> .
>>>>>
>>>>
>>> --
>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>> .
>>>
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> .
>
Dr. David Alan Gilbert July 2, 2015, 7:54 a.m. UTC | #23
* Wen Congyang (wency@cn.fujitsu.com) wrote:
> On 07/02/2015 02:42 AM, Dr. David Alan Gilbert wrote:
> > * Wen Congyang (wency@cn.fujitsu.com) wrote:
> >> On 07/01/2015 04:11 PM, Dr. David Alan Gilbert wrote:
> >>> * Wen Congyang (wency@cn.fujitsu.com) wrote:
> >>>> On 07/01/2015 03:01 AM, Dr. David Alan Gilbert wrote:
> >>>>> * Wen Congyang (wency@cn.fujitsu.com) wrote:
> >>>>>> On 06/27/2015 03:03 AM, Dr. David Alan Gilbert wrote:
> >>>>>
> >>>>> <snip>
> >>>>>
> >>>>>>> Ah, I hadn't realised you could do that; so do you just do:
> >>>>>>>
> >>>>>>> migrate_set_parameter colo on
> >>>>>>> migrate -d -b tcp:otherhhost:port
> >>>>>>>
> >>>>>>> How does the secondary know to feed that data straight into the disk without
> >>>>>>> recording all the old data into the hidden-disk ?
> >>>>>>
> >>>>>> hidden disk and active disk will be made empty when starting block replication.
> >>>>>
> >>>>> Hmm, yes - I think I need to update to your current world; in the version
> >>>>> from the end of May, I get a 'error while loading state for instance 0x0 of device 'block''
> >>>>> if I try to use migrate -d -b 
> >>>>> (the bdrv_write fails)
> >>>>
> >>>> Can you give me both primary and secondary qemu's command? I think the command line is wrong,
> >>>> and disk migration fails.
> >>>>
> >>>
> >>> Primary:
> >>>
> >>> ./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
> >>>      -boot c -m 4096 -smp 4 -S \
> >>>      -name debug-threads=on -trace events=trace-file \
> >>>      -netdev tap,id=hn0,script=$PWD/ifup-prim,\
> >>> downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
> >>>      -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
> >>>      -drive if=virtio,driver=quorum,read-pattern=fifo,no-connect=on,\
> >>> cache=none,aio=native,\
> >>> children.0.file.filename=./bugzilla.raw,\
> >>> children.0.driver=raw,\
> >>> children.1.file.driver=nbd,\
> >>> children.1.file.host=ibpair,\
> >>> children.1.file.port=8889,\
> >>> children.1.file.export=colo1,\
> >>> children.1.driver=replication,\
> >>> children.1.mode=primary,\
> >>> children.1.ignore-errors=on
> >>
> >> Add id=nbd_target1 to primary disk option, and try it. Disk migration needs the same id to sync
> >> the disk.
> > 
> > Thank you! That worked nicely.
> > The only odd thing was that the hidden disk on the secondary went upto ~2GB in size during
> > the disk copy; (This is still on the version you posted at the end of may).
> > I don't really understand why it was 2GB - the disk was 40GB, and qemu-img tells me that
> > 2.6GB of it were used.   Still, it would be good to avoid the overhead of going through
> > the hidden disk on the secondary for the initial replication.
> 
> Yes, I have fixed it in v7: starting backup job later.

Thanks, I'll try it.

Dave

> 
> Thanks
> Wen Congyang
> 
> > 
> > Dave
> > 
> >> Thanks
> >> Wen Congyang
> >>
> >>>
> >>>
> >>> Secondary:
> >>>
> >>> ./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
> >>>      -boot c -m 4096 -smp 4 -S \
> >>>      -name debug-threads=on -trace events=trace-file \
> >>>      -netdev tap,id=hn0,script=$PWD/ifup-slave,\
> >>> downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
> >>>      -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
> >>>      -drive if=none,driver=raw,file=bugzilla.raw,id=nbd_target1,cache=none,aio=native \
> >>>      -drive if=virtio,driver=replication,mode=secondary,export=colo1,throttling.bps-total-max=70000000,\
> >>> file.file.filename=/run/colo-active-disk.qcow2,\
> >>> file.driver=qcow2,\
> >>> file.backing_reference.drive_id=nbd_target1,\
> >>> file.backing_reference.hidden-disk.file.filename=/run/colo-hidden-disk.qcow2,\
> >>> file.backing_reference.hidden-disk.driver=qcow2,\
> >>> file.backing_reference.hidden-disk.allow-write-backing-file=on \
> >>>      -incoming tcp:0:8888
> >>>
> >>>
> >>> Thanks,
> >>>
> >>> Dave
> >>>
> >>>>>>>> If the user uses mirror job, we don't cancel the mirror job now.
> >>>>>>>
> >>>>>>> It would be good to get it to work with mirror, that seems preferred these
> >>>>>>> days to the old block migration.
> >>>>>>
> >>>>>> In normal migration, is mirror job created and cancelled by libvirt?
> >>>>>
> >>>>> Yes, I think so; you should be able to turn on full logging on libvirt and
> >>>>> watch the qmp commands it sends.
> >>>>
> >>>> Supporting mirror job in my TODO list now. But I think we should focus the basci function now.
> >>>>
> >>>> Thanks
> >>>> Wen Congyang
> >>>>
> >>>>>
> >>>>> Dave
> >>>>>
> >>>>> --
> >>>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> >>>>> .
> >>>>>
> >>>>
> >>> --
> >>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> >>> .
> >>>
> >>
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > .
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Wen Congyang July 2, 2015, 8:05 a.m. UTC | #24
On 07/02/2015 03:54 PM, Dr. David Alan Gilbert wrote:
> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>> On 07/02/2015 02:42 AM, Dr. David Alan Gilbert wrote:
>>> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>>>> On 07/01/2015 04:11 PM, Dr. David Alan Gilbert wrote:
>>>>> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>>>>>> On 07/01/2015 03:01 AM, Dr. David Alan Gilbert wrote:
>>>>>>> * Wen Congyang (wency@cn.fujitsu.com) wrote:
>>>>>>>> On 06/27/2015 03:03 AM, Dr. David Alan Gilbert wrote:
>>>>>>>
>>>>>>> <snip>
>>>>>>>
>>>>>>>>> Ah, I hadn't realised you could do that; so do you just do:
>>>>>>>>>
>>>>>>>>> migrate_set_parameter colo on
>>>>>>>>> migrate -d -b tcp:otherhhost:port
>>>>>>>>>
>>>>>>>>> How does the secondary know to feed that data straight into the disk without
>>>>>>>>> recording all the old data into the hidden-disk ?
>>>>>>>>
>>>>>>>> hidden disk and active disk will be made empty when starting block replication.
>>>>>>>
>>>>>>> Hmm, yes - I think I need to update to your current world; in the version
>>>>>>> from the end of May, I get a 'error while loading state for instance 0x0 of device 'block''
>>>>>>> if I try to use migrate -d -b 
>>>>>>> (the bdrv_write fails)
>>>>>>
>>>>>> Can you give me both primary and secondary qemu's command? I think the command line is wrong,
>>>>>> and disk migration fails.
>>>>>>
>>>>>
>>>>> Primary:
>>>>>
>>>>> ./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
>>>>>      -boot c -m 4096 -smp 4 -S \
>>>>>      -name debug-threads=on -trace events=trace-file \
>>>>>      -netdev tap,id=hn0,script=$PWD/ifup-prim,\
>>>>> downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
>>>>>      -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
>>>>>      -drive if=virtio,driver=quorum,read-pattern=fifo,no-connect=on,\
>>>>> cache=none,aio=native,\
>>>>> children.0.file.filename=./bugzilla.raw,\
>>>>> children.0.driver=raw,\
>>>>> children.1.file.driver=nbd,\
>>>>> children.1.file.host=ibpair,\
>>>>> children.1.file.port=8889,\
>>>>> children.1.file.export=colo1,\
>>>>> children.1.driver=replication,\
>>>>> children.1.mode=primary,\
>>>>> children.1.ignore-errors=on
>>>>
>>>> Add id=nbd_target1 to primary disk option, and try it. Disk migration needs the same id to sync
>>>> the disk.
>>>
>>> Thank you! That worked nicely.
>>> The only odd thing was that the hidden disk on the secondary went upto ~2GB in size during
>>> the disk copy; (This is still on the version you posted at the end of may).
>>> I don't really understand why it was 2GB - the disk was 40GB, and qemu-img tells me that
>>> 2.6GB of it were used.   Still, it would be good to avoid the overhead of going through
>>> the hidden disk on the secondary for the initial replication.
>>
>> Yes, I have fixed it in v7: starting backup job later.
> 
> Thanks, I'll try it.

Note: the usage is changed, and you can get the usage from the wiki:
http://wiki.qemu.org/Features/COLO

Thanks
Wen Congyang

> 
> Dave
> 
>>
>> Thanks
>> Wen Congyang
>>
>>>
>>> Dave
>>>
>>>> Thanks
>>>> Wen Congyang
>>>>
>>>>>
>>>>>
>>>>> Secondary:
>>>>>
>>>>> ./try/bin/qemu-system-x86_64 -enable-kvm -nographic \
>>>>>      -boot c -m 4096 -smp 4 -S \
>>>>>      -name debug-threads=on -trace events=trace-file \
>>>>>      -netdev tap,id=hn0,script=$PWD/ifup-slave,\
>>>>> downscript=no,colo_script=$PWD/qemu/scripts/colo-proxy-script.sh,colo_nicname=em4 \
>>>>>      -device e1000,mac=9c:da:4d:1c:b5:89,id=net-pci0,netdev=hn0 \
>>>>>      -drive if=none,driver=raw,file=bugzilla.raw,id=nbd_target1,cache=none,aio=native \
>>>>>      -drive if=virtio,driver=replication,mode=secondary,export=colo1,throttling.bps-total-max=70000000,\
>>>>> file.file.filename=/run/colo-active-disk.qcow2,\
>>>>> file.driver=qcow2,\
>>>>> file.backing_reference.drive_id=nbd_target1,\
>>>>> file.backing_reference.hidden-disk.file.filename=/run/colo-hidden-disk.qcow2,\
>>>>> file.backing_reference.hidden-disk.driver=qcow2,\
>>>>> file.backing_reference.hidden-disk.allow-write-backing-file=on \
>>>>>      -incoming tcp:0:8888
>>>>>
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Dave
>>>>>
>>>>>>>>>> If the user uses mirror job, we don't cancel the mirror job now.
>>>>>>>>>
>>>>>>>>> It would be good to get it to work with mirror, that seems preferred these
>>>>>>>>> days to the old block migration.
>>>>>>>>
>>>>>>>> In normal migration, is mirror job created and cancelled by libvirt?
>>>>>>>
>>>>>>> Yes, I think so; you should be able to turn on full logging on libvirt and
>>>>>>> watch the qmp commands it sends.
>>>>>>
>>>>>> Supporting mirror job in my TODO list now. But I think we should focus the basci function now.
>>>>>>
>>>>>> Thanks
>>>>>> Wen Congyang
>>>>>>
>>>>>>>
>>>>>>> Dave
>>>>>>>
>>>>>>> --
>>>>>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>>>>>> .
>>>>>>>
>>>>>>
>>>>> --
>>>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>>>> .
>>>>>
>>>>
>>> --
>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>> .
>>>
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> .
>
diff mbox

Patch

diff --git a/block.c b/block.c
index 0b41af4..59071d4 100644
--- a/block.c
+++ b/block.c
@@ -4400,3 +4400,27 @@  BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
 {
     return &bs->stats;
 }
+
+void bdrv_connect(BlockDriverState *bs, Error **errp)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (drv && drv->bdrv_connect) {
+        drv->bdrv_connect(bs, errp);
+    } else if (bs->file) {
+        bdrv_connect(bs->file, errp);
+    } else {
+        error_setg(errp, "this feature or command is not currently supported");
+    }
+}
+
+void bdrv_disconnect(BlockDriverState *bs)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (drv && drv->bdrv_disconnect) {
+        drv->bdrv_disconnect(bs);
+    } else if (bs->file) {
+        bdrv_disconnect(bs->file);
+    }
+}
diff --git a/include/block/block.h b/include/block/block.h
index 7cdb569..2c2a0cc 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -606,4 +606,7 @@  void bdrv_flush_io_queue(BlockDriverState *bs);
 
 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs);
 
+void bdrv_connect(BlockDriverState *bs, Error **errp);
+void bdrv_disconnect(BlockDriverState *bs);
+
 #endif
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 87fe89a..a3e5372 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -290,6 +290,9 @@  struct BlockDriver {
      */
     int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
 
+    void (*bdrv_connect)(BlockDriverState *bs, Error **errp);
+    void (*bdrv_disconnect)(BlockDriverState *bs);
+
     QLIST_ENTRY(BlockDriver) list;
 };