
[v5,08/10] migration: create a dedicated thread to release rdma resource

Message ID 1528212489-19137-9-git-send-email-lidongchen@tencent.com
State New
Series Enable postcopy RDMA live migration

Commit Message

858585 jemmy June 5, 2018, 3:28 p.m. UTC
ibv_dereg_mr waits for a long time on virtual servers with large memory sizes.

The test results are:
  10GB  326ms
  20GB  699ms
  30GB  1021ms
  40GB  1387ms
  50GB  1712ms
  60GB  2034ms
  70GB  2457ms
  80GB  2807ms
  90GB  3107ms
  100GB 3474ms
  110GB 3735ms
  120GB 4064ms
  130GB 4567ms
  140GB 4886ms

This causes the guest OS to hang for a while when migration finishes,
so create a dedicated thread to release the RDMA resources.

Signed-off-by: Lidong Chen <lidongchen@tencent.com>
---
 migration/rdma.c | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

Comments

Dr. David Alan Gilbert June 27, 2018, 6:59 p.m. UTC | #1
* Lidong Chen (jemmy858585@gmail.com) wrote:
> ibv_dereg_mr waits for a long time on virtual servers with large memory sizes.
> 
> The test results are:
>   10GB  326ms
>   20GB  699ms
>   30GB  1021ms
>   40GB  1387ms
>   50GB  1712ms
>   60GB  2034ms
>   70GB  2457ms
>   80GB  2807ms
>   90GB  3107ms
>   100GB 3474ms
>   110GB 3735ms
>   120GB 4064ms
>   130GB 4567ms
>   140GB 4886ms
> 
> This causes the guest OS to hang for a while when migration finishes,
> so create a dedicated thread to release the RDMA resources.
> 
> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
> ---
>  migration/rdma.c | 43 +++++++++++++++++++++++++++----------------
>  1 file changed, 27 insertions(+), 16 deletions(-)
> 
> diff --git a/migration/rdma.c b/migration/rdma.c
> index dfa4f77..f12e8d5 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -2979,35 +2979,46 @@ static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
>      }
>  }
>  
> -static int qio_channel_rdma_close(QIOChannel *ioc,
> -                                  Error **errp)
> +static void *qio_channel_rdma_close_thread(void *arg)
>  {
> -    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
> -    RDMAContext *rdmain, *rdmaout;
> -    trace_qemu_rdma_close();
> +    RDMAContext **rdma = arg;
> +    RDMAContext *rdmain = rdma[0];
> +    RDMAContext *rdmaout = rdma[1];
>  
> -    rdmain = rioc->rdmain;
> -    if (rdmain) {
> -        atomic_rcu_set(&rioc->rdmain, NULL);
> -    }
> -
> -    rdmaout = rioc->rdmaout;
> -    if (rdmaout) {
> -        atomic_rcu_set(&rioc->rdmaout, NULL);
> -    }
> +    rcu_register_thread();
>  
>      synchronize_rcu();

* see below

> -
>      if (rdmain) {
>          qemu_rdma_cleanup(rdmain);
>      }
> -
>      if (rdmaout) {
>          qemu_rdma_cleanup(rdmaout);
>      }
>  
>      g_free(rdmain);
>      g_free(rdmaout);
> +    g_free(rdma);
> +
> +    rcu_unregister_thread();
> +    return NULL;
> +}
> +
> +static int qio_channel_rdma_close(QIOChannel *ioc,
> +                                  Error **errp)
> +{
> +    QemuThread t;
> +    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
> +    RDMAContext **rdma = g_new0(RDMAContext*, 2);
> +
> +    trace_qemu_rdma_close();
> +    if (rioc->rdmain || rioc->rdmaout) {
> +        rdma[0] =  rioc->rdmain;
> +        rdma[1] =  rioc->rdmaout;
> +        qemu_thread_create(&t, "rdma cleanup", qio_channel_rdma_close_thread,
> +                           rdma, QEMU_THREAD_DETACHED);
> +        atomic_rcu_set(&rioc->rdmain, NULL);
> +        atomic_rcu_set(&rioc->rdmaout, NULL);

I'm not sure this pair is ordered with the synchronize_rcu above;
Doesn't that mean, on a bad day, that you could get:


    main-thread          rdma_cleanup     another-thread
    qemu_thread_create
                      synchronize_rcu
                                        reads rioc->rdmain
                                        starts doing something with rdmain
    atomic_rcu_set
                      rdma_cleanup


so the another-thread is using it during the cleanup?
Would just moving the atomic_rcu_sets before the qemu_thread_create
fix that?
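
A minimal sketch of that suggested reordering, reusing the names from the
patch above (illustrative only, not necessarily what was finally merged):

    static int qio_channel_rdma_close(QIOChannel *ioc,
                                      Error **errp)
    {
        QemuThread t;
        QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
        RDMAContext **rdma = g_new0(RDMAContext*, 2);

        trace_qemu_rdma_close();
        if (rioc->rdmain || rioc->rdmaout) {
            rdma[0] = rioc->rdmain;
            rdma[1] = rioc->rdmaout;
            /* Clear the published pointers first: any reader that enters
             * an RCU read-side critical section after this point sees
             * NULL, and the synchronize_rcu() in the cleanup thread waits
             * out readers that fetched the pointers earlier. */
            atomic_rcu_set(&rioc->rdmain, NULL);
            atomic_rcu_set(&rioc->rdmaout, NULL);
            qemu_thread_create(&t, "rdma cleanup",
                               qio_channel_rdma_close_thread,
                               rdma, QEMU_THREAD_DETACHED);
        } else {
            g_free(rdma);  /* nothing to clean up; avoid leaking the array */
        }

        return 0;
    }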

However, I've got other worries as well:
   a) qemu_rdma_cleanup does:
       migrate_get_current()->state == MIGRATION_STATUS_CANCELLING

      which worries me a little if someone immediately tries to restart
      the migration.

   b) I don't understand what happens if someone does try and restart
      the migration after that, but in the ~5s it takes the ibv cleanup
      to happen.

Dave

    
> +    }
>  
>      return 0;
>  }
> -- 
> 1.8.3.1
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
858585 jemmy July 5, 2018, 2:26 p.m. UTC | #2
On Thu, Jun 28, 2018 at 2:59 AM, Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
> * Lidong Chen (jemmy858585@gmail.com) wrote:
>> ibv_dereg_mr waits for a long time on virtual servers with large memory sizes.
>>
>> The test results are:
>>   10GB  326ms
>>   20GB  699ms
>>   30GB  1021ms
>>   40GB  1387ms
>>   50GB  1712ms
>>   60GB  2034ms
>>   70GB  2457ms
>>   80GB  2807ms
>>   90GB  3107ms
>>   100GB 3474ms
>>   110GB 3735ms
>>   120GB 4064ms
>>   130GB 4567ms
>>   140GB 4886ms
>>
>> This causes the guest OS to hang for a while when migration finishes,
>> so create a dedicated thread to release the RDMA resources.
>>
>> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
>> ---
>>  migration/rdma.c | 43 +++++++++++++++++++++++++++----------------
>>  1 file changed, 27 insertions(+), 16 deletions(-)
>>
>> diff --git a/migration/rdma.c b/migration/rdma.c
>> index dfa4f77..f12e8d5 100644
>> --- a/migration/rdma.c
>> +++ b/migration/rdma.c
>> @@ -2979,35 +2979,46 @@ static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
>>      }
>>  }
>>
>> -static int qio_channel_rdma_close(QIOChannel *ioc,
>> -                                  Error **errp)
>> +static void *qio_channel_rdma_close_thread(void *arg)
>>  {
>> -    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
>> -    RDMAContext *rdmain, *rdmaout;
>> -    trace_qemu_rdma_close();
>> +    RDMAContext **rdma = arg;
>> +    RDMAContext *rdmain = rdma[0];
>> +    RDMAContext *rdmaout = rdma[1];
>>
>> -    rdmain = rioc->rdmain;
>> -    if (rdmain) {
>> -        atomic_rcu_set(&rioc->rdmain, NULL);
>> -    }
>> -
>> -    rdmaout = rioc->rdmaout;
>> -    if (rdmaout) {
>> -        atomic_rcu_set(&rioc->rdmaout, NULL);
>> -    }
>> +    rcu_register_thread();
>>
>>      synchronize_rcu();
>
> * see below
>
>> -
>>      if (rdmain) {
>>          qemu_rdma_cleanup(rdmain);
>>      }
>> -
>>      if (rdmaout) {
>>          qemu_rdma_cleanup(rdmaout);
>>      }
>>
>>      g_free(rdmain);
>>      g_free(rdmaout);
>> +    g_free(rdma);
>> +
>> +    rcu_unregister_thread();
>> +    return NULL;
>> +}
>> +
>> +static int qio_channel_rdma_close(QIOChannel *ioc,
>> +                                  Error **errp)
>> +{
>> +    QemuThread t;
>> +    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
>> +    RDMAContext **rdma = g_new0(RDMAContext*, 2);
>> +
>> +    trace_qemu_rdma_close();
>> +    if (rioc->rdmain || rioc->rdmaout) {
>> +        rdma[0] =  rioc->rdmain;
>> +        rdma[1] =  rioc->rdmaout;
>> +        qemu_thread_create(&t, "rdma cleanup", qio_channel_rdma_close_thread,
>> +                           rdma, QEMU_THREAD_DETACHED);
>> +        atomic_rcu_set(&rioc->rdmain, NULL);
>> +        atomic_rcu_set(&rioc->rdmaout, NULL);
>
> I'm not sure this pair is ordered with the synchronize_rcu above;
> Doesn't that mean, on a bad day, that you could get:
>
>
>     main-thread          rdma_cleanup     another-thread
>     qemu_thread_create
>                       synchronize_rcu
>                                         reads rioc->rdmain
>                                         starts doing something with rdmain
>     atomic_rcu_set
>                       rdma_cleanup
>
>
> so the another-thread is using it during the cleanup?
> Would just moving the atomic_rcu_sets before the qemu_thread_create
> fix that?
yes, I will fix it.

>
> However, I've got other worries as well:
>    a) qemu_rdma_cleanup does:
>        migrate_get_current()->state == MIGRATION_STATUS_CANCELLING
>
>       which worries me a little if someone immediately tries to restart
>       the migration.
>
>    b) I don't understand what happens if someone does try and restart
>       the migration after that, but in the ~5s it takes the ibv cleanup
>       to happen.

yes, I will try to fix it.

>
> Dave
>
>
>> +    }
>>
>>      return 0;
>>  }
>> --
>> 1.8.3.1
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
858585 jemmy July 19, 2018, 5:49 a.m. UTC | #3
On Thu, Jul 5, 2018 at 10:26 PM, 858585 jemmy <jemmy858585@gmail.com> wrote:
> On Thu, Jun 28, 2018 at 2:59 AM, Dr. David Alan Gilbert
> <dgilbert@redhat.com> wrote:
>> * Lidong Chen (jemmy858585@gmail.com) wrote:
>>> ibv_dereg_mr waits for a long time on virtual servers with large memory sizes.
>>>
>>> The test results are:
>>>   10GB  326ms
>>>   20GB  699ms
>>>   30GB  1021ms
>>>   40GB  1387ms
>>>   50GB  1712ms
>>>   60GB  2034ms
>>>   70GB  2457ms
>>>   80GB  2807ms
>>>   90GB  3107ms
>>>   100GB 3474ms
>>>   110GB 3735ms
>>>   120GB 4064ms
>>>   130GB 4567ms
>>>   140GB 4886ms
>>>
>>> This causes the guest OS to hang for a while when migration finishes,
>>> so create a dedicated thread to release the RDMA resources.
>>>
>>> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
>>> ---
>>>  migration/rdma.c | 43 +++++++++++++++++++++++++++----------------
>>>  1 file changed, 27 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/migration/rdma.c b/migration/rdma.c
>>> index dfa4f77..f12e8d5 100644
>>> --- a/migration/rdma.c
>>> +++ b/migration/rdma.c
>>> @@ -2979,35 +2979,46 @@ static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
>>>      }
>>>  }
>>>
>>> -static int qio_channel_rdma_close(QIOChannel *ioc,
>>> -                                  Error **errp)
>>> +static void *qio_channel_rdma_close_thread(void *arg)
>>>  {
>>> -    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
>>> -    RDMAContext *rdmain, *rdmaout;
>>> -    trace_qemu_rdma_close();
>>> +    RDMAContext **rdma = arg;
>>> +    RDMAContext *rdmain = rdma[0];
>>> +    RDMAContext *rdmaout = rdma[1];
>>>
>>> -    rdmain = rioc->rdmain;
>>> -    if (rdmain) {
>>> -        atomic_rcu_set(&rioc->rdmain, NULL);
>>> -    }
>>> -
>>> -    rdmaout = rioc->rdmaout;
>>> -    if (rdmaout) {
>>> -        atomic_rcu_set(&rioc->rdmaout, NULL);
>>> -    }
>>> +    rcu_register_thread();
>>>
>>>      synchronize_rcu();
>>
>> * see below
>>
>>> -
>>>      if (rdmain) {
>>>          qemu_rdma_cleanup(rdmain);
>>>      }
>>> -
>>>      if (rdmaout) {
>>>          qemu_rdma_cleanup(rdmaout);
>>>      }
>>>
>>>      g_free(rdmain);
>>>      g_free(rdmaout);
>>> +    g_free(rdma);
>>> +
>>> +    rcu_unregister_thread();
>>> +    return NULL;
>>> +}
>>> +
>>> +static int qio_channel_rdma_close(QIOChannel *ioc,
>>> +                                  Error **errp)
>>> +{
>>> +    QemuThread t;
>>> +    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
>>> +    RDMAContext **rdma = g_new0(RDMAContext*, 2);
>>> +
>>> +    trace_qemu_rdma_close();
>>> +    if (rioc->rdmain || rioc->rdmaout) {
>>> +        rdma[0] =  rioc->rdmain;
>>> +        rdma[1] =  rioc->rdmaout;
>>> +        qemu_thread_create(&t, "rdma cleanup", qio_channel_rdma_close_thread,
>>> +                           rdma, QEMU_THREAD_DETACHED);
>>> +        atomic_rcu_set(&rioc->rdmain, NULL);
>>> +        atomic_rcu_set(&rioc->rdmaout, NULL);
>>
>> I'm not sure this pair is ordered with the synchronize_rcu above;
>> Doesn't that mean, on a bad day, that you could get:
>>
>>
>>     main-thread          rdma_cleanup     another-thread
>>     qemu_thread_create
>>                       synchronize_rcu
>>                                         reads rioc->rdmain
>>                                         starts doing something with rdmain
>>     atomic_rcu_set
>>                       rdma_cleanup
>>
>>
>> so the another-thread is using it during the cleanup?
>> Would just moving the atomic_rcu_sets before the qemu_thread_create
>> fix that?
> yes, I will fix it.
>
>>
>> However, I've got other worries as well:
>>    a) qemu_rdma_cleanup does:
>>        migrate_get_current()->state == MIGRATION_STATUS_CANCELLING
>>
>>       which worries me a little if someone immediately tries to restart
>>       the migration.

Because the current QEMU version doesn't wait for the
RDMA_CM_EVENT_DISCONNECTED event after rdma_disconnect, I think it's
not necessary to send RDMA_CONTROL_ERROR.

Compared to sending RDMA_CONTROL_ERROR, I think using a CM event to
notify the peer QEMU is better: the RDMA connection may already be in
error_state, in which case RDMA_CONTROL_ERROR cannot be sent
successfully.

For the peer QEMU, the qemu_rdma_wait_comp_channel function should
poll not only rdma->comp_channel->fd but also rdma->channel->fd, as in
the sketch below.
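
A minimal sketch of such a combined wait using plain poll(2); the
comp_channel and channel fd fields match RDMAContext in migration/rdma.c,
but the helper itself is an illustrative assumption, not the actual fix:

    #include <errno.h>
    #include <poll.h>

    /* Illustrative helper: wait until either a completion-queue event or
     * a connection-manager event is pending, so a peer's rdma_disconnect
     * (delivered as a CM event) is noticed even if no CQ event arrives. */
    static int rdma_poll_cq_and_cm(RDMAContext *rdma, int timeout_ms)
    {
        struct pollfd pfds[2] = {
            { .fd = rdma->comp_channel->fd, .events = POLLIN }, /* CQ events */
            { .fd = rdma->channel->fd,      .events = POLLIN }, /* CM events */
        };

        if (poll(pfds, 2, timeout_ms) < 0) {
            return -errno;
        }
        if (pfds[1].revents & POLLIN) {
            return 1;   /* caller should drain it with rdma_get_cm_event() */
        }
        return 0;       /* CQ event ready, or timeout: continue as before */
    }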

For the source QEMU, this is fixed by the patch
"migration: poll the cm event while wait RDMA work request completion".

For the destination QEMU, a new patch is needed to fix it.

So I prefer to remove the MIGRATION_STATUS_CANCELLING handling from the
qemu_rdma_cleanup function.

>>
>>    b) I don't understand what happens if someone does try and restart
>>       the migration after that, but in the ~5s it takes the ibv cleanup
>>       to happen.

I prefer to add a new variable in current_migration: if the rdma
cleanup thread has not exited, a new migration should not be started.
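
A sketch of that guard; the rdma_cleanup_running field name and the exact
check site are assumptions for illustration only:

    /* Hypothetical new field in MigrationState: */
    bool rdma_cleanup_running;

    /* Set before spawning the detached cleanup thread, cleared by
     * qio_channel_rdma_close_thread() just before it returns: */
    atomic_set(&migrate_get_current()->rdma_cleanup_running, true);

    /* Then refuse to start a new migration while cleanup still runs: */
    if (atomic_read(&migrate_get_current()->rdma_cleanup_running)) {
        error_setg(errp, "previous RDMA cleanup is still in progress");
        return;
    }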

>
> yes, I will try to fix it.
>
>>
>> Dave
>>
>>
>>> +    }
>>>
>>>      return 0;
>>>  }
>>> --
>>> 1.8.3.1
>>>
>> --
>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Gal Shachaf July 23, 2018, 2:54 p.m. UTC | #4
On Thu, Jul 5, 2018 at 10:26 PM, 858585 jemmy <jemmy858585@gmail.com> wrote:
> On Thu, Jun 28, 2018 at 2:59 AM, Dr. David Alan Gilbert 
> <dgilbert@redhat.com> wrote:
>> * Lidong Chen (jemmy858585@gmail.com) wrote:
>>> ibv_dereg_mr waits for a long time on virtual servers with large memory sizes.
>>>
>>> The test results are:
>>>   10GB  326ms
>>>   20GB  699ms
>>>   30GB  1021ms
>>>   40GB  1387ms
>>>   50GB  1712ms
>>>   60GB  2034ms
>>>   70GB  2457ms
>>>   80GB  2807ms
>>>   90GB  3107ms
>>>   100GB 3474ms
>>>   110GB 3735ms
>>>   120GB 4064ms
>>>   130GB 4567ms
>>>   140GB 4886ms
>>>
>>> This causes the guest OS to hang for a while when migration finishes,
>>> so create a dedicated thread to release the RDMA resources.
>>>
>>> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
>>> ---
>>>  migration/rdma.c | 43 +++++++++++++++++++++++++++----------------
>>>  1 file changed, 27 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/migration/rdma.c b/migration/rdma.c index 
>>> dfa4f77..f12e8d5 100644
>>> --- a/migration/rdma.c
>>> +++ b/migration/rdma.c
>>> @@ -2979,35 +2979,46 @@ static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
>>>      }
>>>  }
>>>
>>> -static int qio_channel_rdma_close(QIOChannel *ioc,
>>> -                                  Error **errp)
>>> +static void *qio_channel_rdma_close_thread(void *arg)
>>>  {
>>> -    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
>>> -    RDMAContext *rdmain, *rdmaout;
>>> -    trace_qemu_rdma_close();
>>> +    RDMAContext **rdma = arg;
>>> +    RDMAContext *rdmain = rdma[0];
>>> +    RDMAContext *rdmaout = rdma[1];
>>>
>>> -    rdmain = rioc->rdmain;
>>> -    if (rdmain) {
>>> -        atomic_rcu_set(&rioc->rdmain, NULL);
>>> -    }
>>> -
>>> -    rdmaout = rioc->rdmaout;
>>> -    if (rdmaout) {
>>> -        atomic_rcu_set(&rioc->rdmaout, NULL);
>>> -    }
>>> +    rcu_register_thread();
>>>
>>>      synchronize_rcu();
>>
>> * see below
>>
>>> -
>>>      if (rdmain) {
>>>          qemu_rdma_cleanup(rdmain);
>>>      }
>>> -
>>>      if (rdmaout) {
>>>          qemu_rdma_cleanup(rdmaout);
>>>      }
>>>
>>>      g_free(rdmain);
>>>      g_free(rdmaout);
>>> +    g_free(rdma);
>>> +
>>> +    rcu_unregister_thread();
>>> +    return NULL;
>>> +}
>>> +
>>> +static int qio_channel_rdma_close(QIOChannel *ioc,
>>> +                                  Error **errp) {
>>> +    QemuThread t;
>>> +    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
>>> +    RDMAContext **rdma = g_new0(RDMAContext*, 2);
>>> +
>>> +    trace_qemu_rdma_close();
>>> +    if (rioc->rdmain || rioc->rdmaout) {
>>> +        rdma[0] =  rioc->rdmain;
>>> +        rdma[1] =  rioc->rdmaout;
>>> +        qemu_thread_create(&t, "rdma cleanup", qio_channel_rdma_close_thread,
>>> +                           rdma, QEMU_THREAD_DETACHED);
>>> +        atomic_rcu_set(&rioc->rdmain, NULL);
>>> +        atomic_rcu_set(&rioc->rdmaout, NULL);
>>
>> I'm not sure this pair is ordered with the synchronize_rcu above; 
>> Doesn't that mean, on a bad day, that you could get:
>>
>>
>>     main-thread          rdma_cleanup     another-thread
>>     qemu_thread_create
>>                       synchronize_rcu
>>                                         reads rioc->rdmain
>>                                         starts doing something with rdmain
>>     atomic_rcu_set
>>                       rdma_cleanup
>>
>>
>> so the another-thread is using it during the cleanup?
>> Would just moving the atomic_rcu_sets before the qemu_thread_create 
>> fix that?
> yes, I will fix it.
>
>>
>> However, I've got other worries as well:
>>    a) qemu_rdma_cleanup does:
>>        migrate_get_current()->state == MIGRATION_STATUS_CANCELLING
>>
>>       which worries me a little if someone immediately tries to restart
>>       the migration.

Because the current QEMU version doesn't wait for the RDMA_CM_EVENT_DISCONNECTED event after rdma_disconnect, I think it's not necessary to send RDMA_CONTROL_ERROR.

Compared to sending RDMA_CONTROL_ERROR, I think using a CM event to notify the peer QEMU is better: the RDMA connection may already be in error_state, in which case RDMA_CONTROL_ERROR cannot be sent successfully.

For the peer QEMU, the qemu_rdma_wait_comp_channel function should poll not only rdma->comp_channel->fd but also rdma->channel->fd.
[GS] I agree that we should poll on CM events in addition to CQ events.

For the source QEMU, this is fixed by the patch
"migration: poll the cm event while wait RDMA work request completion".

For the destination QEMU, a new patch is needed to fix it.

So I prefer to remove the MIGRATION_STATUS_CANCELLING handling from the qemu_rdma_cleanup function.

[GS] The intention of this patch is to move the costly ibv_dereg_mr to a separate thread, so we do not actually need to also move the RDMA_CONTROL_ERROR and rdma_disconnect, which come beforehand in qemu_rdma_cleanup, to a thread. I suggest that we move them out of qemu_rdma_cleanup to a new function "qemu_rdma_disconnect" and call it on each RDMAContext before the qemu_thread_create.
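
A minimal sketch of that split; the cm_id and connected fields match
RDMAContext in migration/rdma.c, the rest is illustrative:

    /* Do the cheap disconnect synchronously on close... */
    static void qemu_rdma_disconnect(RDMAContext *rdma)
    {
        if (rdma->connected && rdma->cm_id) {
            rdma_disconnect(rdma->cm_id);
            rdma->connected = false;
        }
    }

    /* ...in qio_channel_rdma_close(), before qemu_thread_create(), so
     * that only the slow ibv_dereg_mr work runs in the detached thread: */
    if (rioc->rdmain) {
        qemu_rdma_disconnect(rioc->rdmain);
    }
    if (rioc->rdmaout) {
        qemu_rdma_disconnect(rioc->rdmaout);
    }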
>>
>>    b) I don't understand what happens if someone does try and restart
>>       the migration after that, but in the ~5s it takes the ibv cleanup
>>       to happen.

I prefer to add a new variable in current_migration: if the rdma cleanup thread has not exited, a new migration should not be started.
[GS] I support this suggestion. 

>
> yes, I will try to fix it.
>
>>
>> Dave
>>
>>
>>> +    }
>>>
>>>      return 0;
>>>  }
>>> --
>>> 1.8.3.1
>>>
>> --
>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
858585 jemmy July 27, 2018, 5:34 a.m. UTC | #5
On Mon, Jul 23, 2018 at 10:54 PM, Gal Shachaf <galsha@mellanox.com> wrote:
> On Thu, Jul 5, 2018 at 10:26 PM, 858585 jemmy <jemmy858585@gmail.com> wrote:
>> On Thu, Jun 28, 2018 at 2:59 AM, Dr. David Alan Gilbert
>> <dgilbert@redhat.com> wrote:
>>> * Lidong Chen (jemmy858585@gmail.com) wrote:
>>>> ibv_dereg_mr waits for a long time on virtual servers with large memory sizes.
>>>>
>>>> The test results are:
>>>>   10GB  326ms
>>>>   20GB  699ms
>>>>   30GB  1021ms
>>>>   40GB  1387ms
>>>>   50GB  1712ms
>>>>   60GB  2034ms
>>>>   70GB  2457ms
>>>>   80GB  2807ms
>>>>   90GB  3107ms
>>>>   100GB 3474ms
>>>>   110GB 3735ms
>>>>   120GB 4064ms
>>>>   130GB 4567ms
>>>>   140GB 4886ms
>>>>
>>>> This causes the guest OS to hang for a while when migration finishes,
>>>> so create a dedicated thread to release the RDMA resources.
>>>>
>>>> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
>>>> ---
>>>>  migration/rdma.c | 43 +++++++++++++++++++++++++++----------------
>>>>  1 file changed, 27 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/migration/rdma.c b/migration/rdma.c index
>>>> dfa4f77..f12e8d5 100644
>>>> --- a/migration/rdma.c
>>>> +++ b/migration/rdma.c
>>>> @@ -2979,35 +2979,46 @@ static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
>>>>      }
>>>>  }
>>>>
>>>> -static int qio_channel_rdma_close(QIOChannel *ioc,
>>>> -                                  Error **errp)
>>>> +static void *qio_channel_rdma_close_thread(void *arg)
>>>>  {
>>>> -    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
>>>> -    RDMAContext *rdmain, *rdmaout;
>>>> -    trace_qemu_rdma_close();
>>>> +    RDMAContext **rdma = arg;
>>>> +    RDMAContext *rdmain = rdma[0];
>>>> +    RDMAContext *rdmaout = rdma[1];
>>>>
>>>> -    rdmain = rioc->rdmain;
>>>> -    if (rdmain) {
>>>> -        atomic_rcu_set(&rioc->rdmain, NULL);
>>>> -    }
>>>> -
>>>> -    rdmaout = rioc->rdmaout;
>>>> -    if (rdmaout) {
>>>> -        atomic_rcu_set(&rioc->rdmaout, NULL);
>>>> -    }
>>>> +    rcu_register_thread();
>>>>
>>>>      synchronize_rcu();
>>>
>>> * see below
>>>
>>>> -
>>>>      if (rdmain) {
>>>>          qemu_rdma_cleanup(rdmain);
>>>>      }
>>>> -
>>>>      if (rdmaout) {
>>>>          qemu_rdma_cleanup(rdmaout);
>>>>      }
>>>>
>>>>      g_free(rdmain);
>>>>      g_free(rdmaout);
>>>> +    g_free(rdma);
>>>> +
>>>> +    rcu_unregister_thread();
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +static int qio_channel_rdma_close(QIOChannel *ioc,
>>>> +                                  Error **errp) {
>>>> +    QemuThread t;
>>>> +    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
>>>> +    RDMAContext **rdma = g_new0(RDMAContext*, 2);
>>>> +
>>>> +    trace_qemu_rdma_close();
>>>> +    if (rioc->rdmain || rioc->rdmaout) {
>>>> +        rdma[0] =  rioc->rdmain;
>>>> +        rdma[1] =  rioc->rdmaout;
>>>> +        qemu_thread_create(&t, "rdma cleanup", qio_channel_rdma_close_thread,
>>>> +                           rdma, QEMU_THREAD_DETACHED);
>>>> +        atomic_rcu_set(&rioc->rdmain, NULL);
>>>> +        atomic_rcu_set(&rioc->rdmaout, NULL);
>>>
>>> I'm not sure this pair is ordered with the synchronize_rcu above;
>>> Doesn't that mean, on a bad day, that you could get:
>>>
>>>
>>>     main-thread          rdma_cleanup     another-thread
>>>     qemu_thread_create
>>>                       synchronize_rcu
>>>                                         reads rioc->rdmain
>>>                                         starts doing something with rdmain
>>>     atomic_rcu_set
>>>                       rdma_cleanup
>>>
>>>
>>> so the another-thread is using it during the cleanup?
>>> Would just moving the atomic_rcu_sets before the qemu_thread_create
>>> fix that?
>> yes, I will fix it.
>>
>>>
>>> However, I've got other worries as well:
>>>    a) qemu_rdma_cleanup does:
>>>        migrate_get_current()->state == MIGRATION_STATUS_CANCELLING
>>>
>>>       which worries me a little if someone immediately tries to restart
>>>       the migration.
>
> Because the current QEMU version doesn't wait for the RDMA_CM_EVENT_DISCONNECTED event after rdma_disconnect, I think it's not necessary to send RDMA_CONTROL_ERROR.
>
> Compared to sending RDMA_CONTROL_ERROR, I think using a CM event to notify the peer QEMU is better: the RDMA connection may already be in error_state, in which case RDMA_CONTROL_ERROR cannot be sent successfully.
>
> For the peer QEMU, the qemu_rdma_wait_comp_channel function should poll not only rdma->comp_channel->fd but also rdma->channel->fd.
> [GS] I agree that we should poll on CM events in addition to CQ events.
>
> For the source QEMU, this is fixed by the patch
> "migration: poll the cm event while wait RDMA work request completion".
>
> For the destination QEMU, a new patch is needed to fix it.
>
> So I prefer to remove the MIGRATION_STATUS_CANCELLING handling from the qemu_rdma_cleanup function.
>
> [GS] The intention of this patch is to move the costly ibv_dereg_mr to a separate thread, so we do not actually need to also move the RDMA_CONTROL_ERROR and rdma_disconnect, which come beforehand in qemu_rdma_cleanup, to a thread. I suggest that we move them out of qemu_rdma_cleanup to a new function "qemu_rdma_disconnect" and call it on each RDMAContext before the qemu_thread_create.

Hi Dave, Gal:
I found that the reason we send RDMA_CONTROL_ERROR while in
MIGRATION_STATUS_CANCELLING is to fix this issue:
https://git.qemu.org/?p=qemu.git;a=commit;h=32bce196344772df8d68ab40e2dd1dd57be1aa7c

The current QEMU version doesn't wait for the RDMA_CM_EVENT_DISCONNECTED
event after rdma_disconnect, so I think it's not necessary to send
RDMA_CONTROL_ERROR while in MIGRATION_STATUS_CANCELLING.

I was also confused about the purpose of RDMA_CONTROL_ERROR. It seems
to notify one side that something went wrong, but it is only invoked
in qemu_rdma_cleanup, and rdma_disconnect is enough to notify the peer
QEMU. Gal told me that a DREQ can be dropped just like a DREP, so we
cannot guarantee that the destination will receive the
RDMA_CM_EVENT_DISCONNECTED event. But we also cannot guarantee that
the destination will receive RDMA_CONTROL_ERROR when the RDMA
connection is in error state.

So I prefer to also remove sending RDMA_CONTROL_ERROR in qemu_rdma_cleanup.

Any suggestion?

Thanks.

>>>
>>>    b) I don't understand what happens if someone does try and restart
>>>       the migration after that, but in the ~5s it takes the ibv cleanup
>>>       to happen.
>
> I prefer to add a new variable in current_migration: if the rdma cleanup thread has not exited, a new migration should not be started.
> [GS] I support this suggestion.
>
>>
>> yes, I will try to fix it.
>>
>>>
>>> Dave
>>>
>>>
>>>> +    }
>>>>
>>>>      return 0;
>>>>  }
>>>> --
>>>> 1.8.3.1
>>>>
>>> --
>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

Patch

diff --git a/migration/rdma.c b/migration/rdma.c
index dfa4f77..f12e8d5 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2979,35 +2979,46 @@  static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
     }
 }
 
-static int qio_channel_rdma_close(QIOChannel *ioc,
-                                  Error **errp)
+static void *qio_channel_rdma_close_thread(void *arg)
 {
-    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
-    RDMAContext *rdmain, *rdmaout;
-    trace_qemu_rdma_close();
+    RDMAContext **rdma = arg;
+    RDMAContext *rdmain = rdma[0];
+    RDMAContext *rdmaout = rdma[1];
 
-    rdmain = rioc->rdmain;
-    if (rdmain) {
-        atomic_rcu_set(&rioc->rdmain, NULL);
-    }
-
-    rdmaout = rioc->rdmaout;
-    if (rdmaout) {
-        atomic_rcu_set(&rioc->rdmaout, NULL);
-    }
+    rcu_register_thread();
 
     synchronize_rcu();
-
     if (rdmain) {
         qemu_rdma_cleanup(rdmain);
     }
-
     if (rdmaout) {
         qemu_rdma_cleanup(rdmaout);
     }
 
     g_free(rdmain);
     g_free(rdmaout);
+    g_free(rdma);
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static int qio_channel_rdma_close(QIOChannel *ioc,
+                                  Error **errp)
+{
+    QemuThread t;
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
+    RDMAContext **rdma = g_new0(RDMAContext*, 2);
+
+    trace_qemu_rdma_close();
+    if (rioc->rdmain || rioc->rdmaout) {
+        rdma[0] =  rioc->rdmain;
+        rdma[1] =  rioc->rdmaout;
+        qemu_thread_create(&t, "rdma cleanup", qio_channel_rdma_close_thread,
+                           rdma, QEMU_THREAD_DETACHED);
+        atomic_rcu_set(&rioc->rdmain, NULL);
+        atomic_rcu_set(&rioc->rdmaout, NULL);
+    }
 
     return 0;
 }