Patchwork fixing qemu-0.1X endless loop in qcow2_alloc_cluster_offset

login
register
mail settings
Submitter Zhouyi Zhou
Date May 13, 2012, 8:03 a.m.
Message ID <20120513160331.5b774c93.yizhouzhou@ict.ac.cn>
Download mbox | patch
Permalink /patch/158844/
State New
Headers show

Comments

Zhouyi Zhou - May 13, 2012, 8:03 a.m.
hi all
  
  sometimes, qemu/kvm-0.1x will hang in endless loop in qcow2_alloc_cluster_offset.
  after some investigation, I found that:
  in function posix_aio_process_queue(void *opaque)
440             ret = qemu_paio_error(acb);
441             if (ret == ECANCELED) {
442                 /* remove the request */
443                 *pacb = acb->next;
444                 qemu_aio_release(acb);
445                 result = 1;
446             } else if (ret != EINPROGRESS) {
  in line 444 acb got released but acb->common.opaque does not.
which will be released via guest OS via ide_dma_cancel which 
will in term call qcow_aio_cancel which does not check its argument
is in flight list or not.
  The fix is as follows: (debian 6's qemu-kvm-0.12.5)
#######################################

 Thanks for investigation
Zhouyi
Kevin Wolf - May 14, 2012, 12:20 p.m.
Am 13.05.2012 10:03, schrieb Zhouyi Zhou:
> hi all
>   
>   sometimes, qemu/kvm-0.1x will hang in endless loop in qcow2_alloc_cluster_offset.
>   after some investigation, I found that:
>   in function posix_aio_process_queue(void *opaque)
> 440             ret = qemu_paio_error(acb);
> 441             if (ret == ECANCELED) {
> 442                 /* remove the request */
> 443                 *pacb = acb->next;
> 444                 qemu_aio_release(acb);
> 445                 result = 1;
> 446             } else if (ret != EINPROGRESS) {
>   in line 444 acb got released but acb->common.opaque does not.
> which will be released via guest OS via ide_dma_cancel which 
> will in term call qcow_aio_cancel which does not check its argument
> is in flight list or not.
>   The fix is as follows: (debian 6's qemu-kvm-0.12.5)
> #######################################
> --- block/qcow2.h~      2010-07-27 08:43:53.000000000 +0800
> +++ block/qcow2.h       2012-05-13 15:51:39.000000000 +0800
> @@ -143,6 +143,7 @@
>      QLIST_HEAD(QCowAioDependencies, QCowAIOCB) dependent_requests;
>  
>      QLIST_ENTRY(QCowL2Meta) next_in_flight;
> +    int inflight;       
>  } QCowL2Meta;
> --- block/qcow2.c~  2012-05-13 15:57:09.000000000 +0800
> +++ block/qcow2.c       2012-05-13 15:57:24.000000000 +0800
> @@ -349,6 +349,10 @@
>      QCowAIOCB *acb = (QCowAIOCB *)blockacb;
>      if (acb->hd_aiocb)
>          bdrv_aio_cancel(acb->hd_aiocb);
> +    if (acb->l2meta.inflight) {
> +        QLIST_REMOVE(&acb->l2meta, next_in_flight);
> +       acb->l2meta.inflight = 0;
> +    }
>      qemu_aio_release(acb);
>  }
>  
> @@ -506,6 +510,7 @@
>      acb->n = 0;
>      acb->cluster_offset = 0;
>      acb->l2meta.nb_clusters = 0;
> +    acb->l2meta.inflight = 0;
>      QLIST_INIT(&acb->l2meta.dependent_requests);
>      return acb;
>  }
> @@ -534,6 +539,7 @@
>      /* Take the request off the list of running requests */
>      if (m->nb_clusters != 0) {
>          QLIST_REMOVE(m, next_in_flight);
> +       m->inflight = 0;
>      }
>  
>      /*
> @@ -632,6 +638,7 @@
>  fail:
>      if (acb->l2meta.nb_clusters != 0) {
>          QLIST_REMOVE(&acb->l2meta, next_in_flight);
> +       acb->l2meta.inflight  = 0;
>      }
>  done:
>      if (acb->qiov->niov > 1)
> --- block/qcow2-cluster.c~      2010-07-27 08:43:53.000000000 +0800
> +++ block/qcow2-cluster.c       2012-05-13 15:53:53.000000000 +0800
> @@ -827,6 +827,7 @@
>      m->offset = offset;
>      m->n_start = n_start;
>      m->nb_clusters = nb_clusters;
> +    m->inflight = 1;
>  
>  out:
>      m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
> 
>  Thanks for investigation
> Zhouyi

The patch looks reasonable to me. Note however that while it fixes the
hang, it still causes cluster leaks. I'm not sure if someone is
interested in picking these up for old stable releases. Andreas, I think
you were going to take 0.15? The first version that doesn't have the
problem is 1.0.

Kevin
Andreas Färber - June 12, 2012, 1:33 p.m.
Am 14.05.2012 14:20, schrieb Kevin Wolf:
> Am 13.05.2012 10:03, schrieb Zhouyi Zhou:
>> hi all
>>   
>>   sometimes, qemu/kvm-0.1x will hang in endless loop in qcow2_alloc_cluster_offset.
>>   after some investigation, I found that:
>>   in function posix_aio_process_queue(void *opaque)
>> 440             ret = qemu_paio_error(acb);
>> 441             if (ret == ECANCELED) {
>> 442                 /* remove the request */
>> 443                 *pacb = acb->next;
>> 444                 qemu_aio_release(acb);
>> 445                 result = 1;
>> 446             } else if (ret != EINPROGRESS) {
>>   in line 444 acb got released but acb->common.opaque does not.
>> which will be released via guest OS via ide_dma_cancel which 
>> will in term call qcow_aio_cancel which does not check its argument
>> is in flight list or not.
>>   The fix is as follows: (debian 6's qemu-kvm-0.12.5)
>> #######################################
>> --- block/qcow2.h~      2010-07-27 08:43:53.000000000 +0800
>> +++ block/qcow2.h       2012-05-13 15:51:39.000000000 +0800
>> @@ -143,6 +143,7 @@
>>      QLIST_HEAD(QCowAioDependencies, QCowAIOCB) dependent_requests;
>>  
>>      QLIST_ENTRY(QCowL2Meta) next_in_flight;
>> +    int inflight;       
>>  } QCowL2Meta;
>> --- block/qcow2.c~  2012-05-13 15:57:09.000000000 +0800
>> +++ block/qcow2.c       2012-05-13 15:57:24.000000000 +0800
>> @@ -349,6 +349,10 @@
>>      QCowAIOCB *acb = (QCowAIOCB *)blockacb;
>>      if (acb->hd_aiocb)
>>          bdrv_aio_cancel(acb->hd_aiocb);
>> +    if (acb->l2meta.inflight) {
>> +        QLIST_REMOVE(&acb->l2meta, next_in_flight);
>> +       acb->l2meta.inflight = 0;
>> +    }
>>      qemu_aio_release(acb);
>>  }
>>  
>> @@ -506,6 +510,7 @@
>>      acb->n = 0;
>>      acb->cluster_offset = 0;
>>      acb->l2meta.nb_clusters = 0;
>> +    acb->l2meta.inflight = 0;
>>      QLIST_INIT(&acb->l2meta.dependent_requests);
>>      return acb;
>>  }
>> @@ -534,6 +539,7 @@
>>      /* Take the request off the list of running requests */
>>      if (m->nb_clusters != 0) {
>>          QLIST_REMOVE(m, next_in_flight);
>> +       m->inflight = 0;
>>      }
>>  
>>      /*
>> @@ -632,6 +638,7 @@
>>  fail:
>>      if (acb->l2meta.nb_clusters != 0) {
>>          QLIST_REMOVE(&acb->l2meta, next_in_flight);
>> +       acb->l2meta.inflight  = 0;
>>      }
>>  done:
>>      if (acb->qiov->niov > 1)
>> --- block/qcow2-cluster.c~      2010-07-27 08:43:53.000000000 +0800
>> +++ block/qcow2-cluster.c       2012-05-13 15:53:53.000000000 +0800
>> @@ -827,6 +827,7 @@
>>      m->offset = offset;
>>      m->n_start = n_start;
>>      m->nb_clusters = nb_clusters;
>> +    m->inflight = 1;
>>  
>>  out:
>>      m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
>>
>>  Thanks for investigation
>> Zhouyi
> 
> The patch looks reasonable to me. Note however that while it fixes the
> hang, it still causes cluster leaks. I'm not sure if someone is
> interested in picking these up for old stable releases. Andreas, I think
> you were going to take 0.15? The first version that doesn't have the
> problem is 1.0.

Kevin, the policy as I understood it is to cherry-pick patches from
qemu.git into qemu-stable-x.y.git. So I don't think me applying this
patch to stable-0.15 would be right. I don't spot a particular qcow2 fix
among our 0.15 backports that I have now pushed. Do you have a pointer
which one(s) would fix this issue so that I can recheck?

Zhouyi, could you test git://git.qemu.org/qemu-stable-0.15.git and check
if the problem is still there?

Thanks,
Andreas
Kevin Wolf - June 12, 2012, 1:44 p.m.
Am 12.06.2012 15:33, schrieb Andreas Färber:
> Am 14.05.2012 14:20, schrieb Kevin Wolf:
>> Am 13.05.2012 10:03, schrieb Zhouyi Zhou:
>>> hi all
>>>   
>>>   sometimes, qemu/kvm-0.1x will hang in endless loop in qcow2_alloc_cluster_offset.
>>>   after some investigation, I found that:
>>>   in function posix_aio_process_queue(void *opaque)
>>> 440             ret = qemu_paio_error(acb);
>>> 441             if (ret == ECANCELED) {
>>> 442                 /* remove the request */
>>> 443                 *pacb = acb->next;
>>> 444                 qemu_aio_release(acb);
>>> 445                 result = 1;
>>> 446             } else if (ret != EINPROGRESS) {
>>>   in line 444 acb got released but acb->common.opaque does not.
>>> which will be released via guest OS via ide_dma_cancel which 
>>> will in term call qcow_aio_cancel which does not check its argument
>>> is in flight list or not.
>>>   The fix is as follows: (debian 6's qemu-kvm-0.12.5)
>>> #######################################
>>> --- block/qcow2.h~      2010-07-27 08:43:53.000000000 +0800
>>> +++ block/qcow2.h       2012-05-13 15:51:39.000000000 +0800
>>> @@ -143,6 +143,7 @@
>>>      QLIST_HEAD(QCowAioDependencies, QCowAIOCB) dependent_requests;
>>>  
>>>      QLIST_ENTRY(QCowL2Meta) next_in_flight;
>>> +    int inflight;       
>>>  } QCowL2Meta;
>>> --- block/qcow2.c~  2012-05-13 15:57:09.000000000 +0800
>>> +++ block/qcow2.c       2012-05-13 15:57:24.000000000 +0800
>>> @@ -349,6 +349,10 @@
>>>      QCowAIOCB *acb = (QCowAIOCB *)blockacb;
>>>      if (acb->hd_aiocb)
>>>          bdrv_aio_cancel(acb->hd_aiocb);
>>> +    if (acb->l2meta.inflight) {
>>> +        QLIST_REMOVE(&acb->l2meta, next_in_flight);
>>> +       acb->l2meta.inflight = 0;
>>> +    }
>>>      qemu_aio_release(acb);
>>>  }
>>>  
>>> @@ -506,6 +510,7 @@
>>>      acb->n = 0;
>>>      acb->cluster_offset = 0;
>>>      acb->l2meta.nb_clusters = 0;
>>> +    acb->l2meta.inflight = 0;
>>>      QLIST_INIT(&acb->l2meta.dependent_requests);
>>>      return acb;
>>>  }
>>> @@ -534,6 +539,7 @@
>>>      /* Take the request off the list of running requests */
>>>      if (m->nb_clusters != 0) {
>>>          QLIST_REMOVE(m, next_in_flight);
>>> +       m->inflight = 0;
>>>      }
>>>  
>>>      /*
>>> @@ -632,6 +638,7 @@
>>>  fail:
>>>      if (acb->l2meta.nb_clusters != 0) {
>>>          QLIST_REMOVE(&acb->l2meta, next_in_flight);
>>> +       acb->l2meta.inflight  = 0;
>>>      }
>>>  done:
>>>      if (acb->qiov->niov > 1)
>>> --- block/qcow2-cluster.c~      2010-07-27 08:43:53.000000000 +0800
>>> +++ block/qcow2-cluster.c       2012-05-13 15:53:53.000000000 +0800
>>> @@ -827,6 +827,7 @@
>>>      m->offset = offset;
>>>      m->n_start = n_start;
>>>      m->nb_clusters = nb_clusters;
>>> +    m->inflight = 1;
>>>  
>>>  out:
>>>      m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
>>>
>>>  Thanks for investigation
>>> Zhouyi
>>
>> The patch looks reasonable to me. Note however that while it fixes the
>> hang, it still causes cluster leaks. I'm not sure if someone is
>> interested in picking these up for old stable releases. Andreas, I think
>> you were going to take 0.15? The first version that doesn't have the
>> problem is 1.0.
> 
> Kevin, the policy as I understood it is to cherry-pick patches from
> qemu.git into qemu-stable-x.y.git. So I don't think me applying this
> patch to stable-0.15 would be right. I don't spot a particular qcow2 fix
> among our 0.15 backports that I have now pushed. Do you have a pointer
> which one(s) would fix this issue so that I can recheck?

It's "fixed" as a side effect of the block layer conversion to
coroutines. Not exactly the kind of patches you'd want to cherry-pick
for stable-0.15.

The better fix for 0.15 could be to backport the new behaviour of
coroutine based requests with bdrv_aio_cancel:

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}

Using that as the implementation for qcow2_aio_cancel should be safe and
fix this problem.

Kevin
Andreas Färber - Oct. 12, 2012, 3:52 p.m.
Am 12.06.2012 15:44, schrieb Kevin Wolf:
> Am 12.06.2012 15:33, schrieb Andreas Färber:
>> Am 14.05.2012 14:20, schrieb Kevin Wolf:
>>> Am 13.05.2012 10:03, schrieb Zhouyi Zhou:
>>>> hi all
>>>>   
>>>>   sometimes, qemu/kvm-0.1x will hang in endless loop in qcow2_alloc_cluster_offset.
>>>>   after some investigation, I found that:
>>>>   in function posix_aio_process_queue(void *opaque)
>>>> 440             ret = qemu_paio_error(acb);
>>>> 441             if (ret == ECANCELED) {
>>>> 442                 /* remove the request */
>>>> 443                 *pacb = acb->next;
>>>> 444                 qemu_aio_release(acb);
>>>> 445                 result = 1;
>>>> 446             } else if (ret != EINPROGRESS) {
>>>>   in line 444 acb got released but acb->common.opaque does not.
>>>> which will be released via guest OS via ide_dma_cancel which 
>>>> will in term call qcow_aio_cancel which does not check its argument
>>>> is in flight list or not.
>>>>   The fix is as follows: (debian 6's qemu-kvm-0.12.5)
>>>> #######################################
>>>> --- block/qcow2.h~      2010-07-27 08:43:53.000000000 +0800
>>>> +++ block/qcow2.h       2012-05-13 15:51:39.000000000 +0800
>>>> @@ -143,6 +143,7 @@
>>>>      QLIST_HEAD(QCowAioDependencies, QCowAIOCB) dependent_requests;
>>>>  
>>>>      QLIST_ENTRY(QCowL2Meta) next_in_flight;
>>>> +    int inflight;       
>>>>  } QCowL2Meta;
>>>> --- block/qcow2.c~  2012-05-13 15:57:09.000000000 +0800
>>>> +++ block/qcow2.c       2012-05-13 15:57:24.000000000 +0800
>>>> @@ -349,6 +349,10 @@
>>>>      QCowAIOCB *acb = (QCowAIOCB *)blockacb;
>>>>      if (acb->hd_aiocb)
>>>>          bdrv_aio_cancel(acb->hd_aiocb);
>>>> +    if (acb->l2meta.inflight) {
>>>> +        QLIST_REMOVE(&acb->l2meta, next_in_flight);
>>>> +       acb->l2meta.inflight = 0;
>>>> +    }
>>>>      qemu_aio_release(acb);
>>>>  }
>>>>  
>>>> @@ -506,6 +510,7 @@
>>>>      acb->n = 0;
>>>>      acb->cluster_offset = 0;
>>>>      acb->l2meta.nb_clusters = 0;
>>>> +    acb->l2meta.inflight = 0;
>>>>      QLIST_INIT(&acb->l2meta.dependent_requests);
>>>>      return acb;
>>>>  }
>>>> @@ -534,6 +539,7 @@
>>>>      /* Take the request off the list of running requests */
>>>>      if (m->nb_clusters != 0) {
>>>>          QLIST_REMOVE(m, next_in_flight);
>>>> +       m->inflight = 0;
>>>>      }
>>>>  
>>>>      /*
>>>> @@ -632,6 +638,7 @@
>>>>  fail:
>>>>      if (acb->l2meta.nb_clusters != 0) {
>>>>          QLIST_REMOVE(&acb->l2meta, next_in_flight);
>>>> +       acb->l2meta.inflight  = 0;
>>>>      }
>>>>  done:
>>>>      if (acb->qiov->niov > 1)
>>>> --- block/qcow2-cluster.c~      2010-07-27 08:43:53.000000000 +0800
>>>> +++ block/qcow2-cluster.c       2012-05-13 15:53:53.000000000 +0800
>>>> @@ -827,6 +827,7 @@
>>>>      m->offset = offset;
>>>>      m->n_start = n_start;
>>>>      m->nb_clusters = nb_clusters;
>>>> +    m->inflight = 1;
>>>>  
>>>>  out:
>>>>      m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
>>>>
>>>>  Thanks for investigation
>>>> Zhouyi
>>>
>>> The patch looks reasonable to me. Note however that while it fixes the
>>> hang, it still causes cluster leaks. I'm not sure if someone is
>>> interested in picking these up for old stable releases. Andreas, I think
>>> you were going to take 0.15? The first version that doesn't have the
>>> problem is 1.0.
>>
>> Kevin, the policy as I understood it is to cherry-pick patches from
>> qemu.git into qemu-stable-x.y.git. So I don't think me applying this
>> patch to stable-0.15 would be right. I don't spot a particular qcow2 fix
>> among our 0.15 backports that I have now pushed. Do you have a pointer
>> which one(s) would fix this issue so that I can recheck?
> 
> It's "fixed" as a side effect of the block layer conversion to
> coroutines. Not exactly the kind of patches you'd want to cherry-pick
> for stable-0.15.
> 
> The better fix for 0.15 could be to backport the new behaviour of
> coroutine based requests with bdrv_aio_cancel:
> 
> static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
> {
>     qemu_aio_flush();
> }
> 
> Using that as the implementation for qcow2_aio_cancel should be safe and
> fix this problem.

Kevin, I have stable-0.15 in a state where I'm about to tag 0.15.2 now.
The original patch does not have a Signed-off-by nor your Acked-by, so I
can't apply it as-is. stable-0.15 does not have coroutines, so I don't
understand what exactly you're suggesting as alternative here: Backport
the whole coroutine feature including coroutine function above? Or just
call qemu_aio_flush() in place of what? This is old qcow2_aio_cancel():

static BlockDriverAIOCB *qcow2_aio_flush(BlockDriverState *bs,
                                         BlockDriverCompletionFunc *cb,
                                         void *opaque)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return NULL;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        return NULL;
    }

    return bdrv_aio_flush(bs->file, cb, opaque);
}


Andreas

Patch

--- block/qcow2.h~      2010-07-27 08:43:53.000000000 +0800
+++ block/qcow2.h       2012-05-13 15:51:39.000000000 +0800
@@ -143,6 +143,7 @@ 
     QLIST_HEAD(QCowAioDependencies, QCowAIOCB) dependent_requests;
 
     QLIST_ENTRY(QCowL2Meta) next_in_flight;
+    int inflight;       
 } QCowL2Meta;
--- block/qcow2.c~  2012-05-13 15:57:09.000000000 +0800
+++ block/qcow2.c       2012-05-13 15:57:24.000000000 +0800
@@ -349,6 +349,10 @@ 
     QCowAIOCB *acb = (QCowAIOCB *)blockacb;
     if (acb->hd_aiocb)
         bdrv_aio_cancel(acb->hd_aiocb);
+    if (acb->l2meta.inflight) {
+        QLIST_REMOVE(&acb->l2meta, next_in_flight);
+       acb->l2meta.inflight = 0;
+    }
     qemu_aio_release(acb);
 }
 
@@ -506,6 +510,7 @@ 
     acb->n = 0;
     acb->cluster_offset = 0;
     acb->l2meta.nb_clusters = 0;
+    acb->l2meta.inflight = 0;
     QLIST_INIT(&acb->l2meta.dependent_requests);
     return acb;
 }
@@ -534,6 +539,7 @@ 
     /* Take the request off the list of running requests */
     if (m->nb_clusters != 0) {
         QLIST_REMOVE(m, next_in_flight);
+       m->inflight = 0;
     }
 
     /*
@@ -632,6 +638,7 @@ 
 fail:
     if (acb->l2meta.nb_clusters != 0) {
         QLIST_REMOVE(&acb->l2meta, next_in_flight);
+       acb->l2meta.inflight  = 0;
     }
 done:
     if (acb->qiov->niov > 1)
--- block/qcow2-cluster.c~      2010-07-27 08:43:53.000000000 +0800
+++ block/qcow2-cluster.c       2012-05-13 15:53:53.000000000 +0800
@@ -827,6 +827,7 @@ 
     m->offset = offset;
     m->n_start = n_start;
     m->nb_clusters = nb_clusters;
+    m->inflight = 1;
 
 out:
     m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);