diff mbox

[v3] Improve error handling in do_snapshot_blkdev()

Message ID 1299511653-11357-1-git-send-email-Jes.Sorensen@redhat.com
State New
Headers show

Commit Message

Jes Sorensen March 7, 2011, 3:27 p.m. UTC
From: Jes Sorensen <Jes.Sorensen@redhat.com>

In case we cannot open the newly created snapshot image, try to fall
back to the original image file and continue running on that, which
should prevent the guest from aborting.

This is a corner case which can happen if the admin by mistake
specifies the snapshot file on a virtual file system which does not
support O_DIRECT. bdrv_create() does not use O_DIRECT, but the
following open in bdrv_open() does and will then fail.

Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
---
 blockdev.c |   29 +++++++++++++++++++++++------
 1 files changed, 23 insertions(+), 6 deletions(-)

Comments

Kevin Wolf March 7, 2011, 3:44 p.m. UTC | #1
Am 07.03.2011 16:27, schrieb Jes.Sorensen@redhat.com:
> From: Jes Sorensen <Jes.Sorensen@redhat.com>
> 
> In case we cannot open the newly created snapshot image, try to fall
> back to the original image file and continue running on that, which
> should prevent the guest from aborting.
> 
> This is a corner case which can happen if the admin by mistake
> specifies the snapshot file on a virtual file system which does not
> support O_DIRECT. bdrv_create() does not use O_DIRECT, but the
> following open in bdrv_open() does and will then fail.
> 
> Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>

Thanks, applied to the block branch.

Kevin
Anthony Liguori March 7, 2011, 4:34 p.m. UTC | #2
On 03/07/2011 09:27 AM, Jes.Sorensen@redhat.com wrote:
> From: Jes Sorensen<Jes.Sorensen@redhat.com>
>
> In case we cannot open the newly created snapshot image, try to fall
> back to the original image file and continue running on that, which
> should prevent the guest from aborting.
>
> This is a corner case which can happen if the admin by mistake
> specifies the snapshot file on a virtual file system which does not
> support O_DIRECT. bdrv_create() does not use O_DIRECT, but the
> following open in bdrv_open() does and will then fail.
>
> Signed-off-by: Jes Sorensen<Jes.Sorensen@redhat.com>
> ---
>   blockdev.c |   29 +++++++++++++++++++++++------
>   1 files changed, 23 insertions(+), 6 deletions(-)
>
> diff --git a/blockdev.c b/blockdev.c
> index 0690cc8..d52eef0 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -574,9 +574,10 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
>       const char *filename = qdict_get_try_str(qdict, "snapshot_file");
>       const char *format = qdict_get_try_str(qdict, "format");
>       BlockDriverState *bs;
> -    BlockDriver *drv, *proto_drv;
> +    BlockDriver *drv, *old_drv, *proto_drv;
>       int ret = 0;
>       int flags;
> +    char old_filename[1024];
>
>       if (!filename) {
>           qerror_report(QERR_MISSING_PARAMETER, "snapshot_file");
> @@ -591,6 +592,11 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
>           goto out;
>       }
>
> +    pstrcpy(old_filename, sizeof(old_filename), bs->filename);
> +
> +    old_drv = bs->drv;
> +    flags = bs->open_flags;
> +
>       if (!format) {
>           format = "qcow2";
>       }
> @@ -610,7 +616,7 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
>       }
>
>       ret = bdrv_img_create(filename, format, bs->filename,
> -                          bs->drv->format_name, NULL, -1, bs->open_flags);
> +                          bs->drv->format_name, NULL, -1, flags);
>       if (ret) {
>           goto out;
>       }
> @@ -618,15 +624,26 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
>       qemu_aio_flush();
>       bdrv_flush(bs);
>
> -    flags = bs->open_flags;
>       bdrv_close(bs);
>       ret = bdrv_open(bs, filename, flags, drv);
>       /*
> -     * If reopening the image file we just created fails, we really
> -     * are in trouble :(
> +     * If reopening the image file we just created fails, fall back
> +     * and try to re-open the original image. If that fails too, we
> +     * are in serious trouble.
>        */
>       if (ret != 0) {
> -        abort();
> +        qerror_report(QERR_OPEN_FILE_FAILED, filename);
> +        error_printf("do_snapshot_blkdev(): Unable to open newly created "
> +                     "snapshot file: \n");
> +        error_printf("%s. Attempting to revert to original image: %s\n",
> +                     filename, old_filename);

You can't combine qerror_report with continued action.  qerror_report() 
should be a terminal action.  You also shouldn't combine error_printf() 
with qerror_report().

You should restore the original image file before doing qerror_report() 
and just drop the error_printf()s as it's all redundant information.

Regards,

Anthony Liguori

> +        ret = bdrv_open(bs, old_filename, flags, old_drv);
> +        if (ret != 0) {
> +            error_printf("do_snapshot_blkdev(): Unable to re-open "
> +                         "original image - aborting!\n");
> +            qerror_report(QERR_OPEN_FILE_FAILED, old_filename);
> +            abort();
> +        }
>       }
>   out:
>       if (ret) {
Jes Sorensen March 7, 2011, 4:39 p.m. UTC | #3
On 03/07/11 17:34, Anthony Liguori wrote:
> On 03/07/2011 09:27 AM, Jes.Sorensen@redhat.com wrote:
>>       if (ret != 0) {
>> -        abort();
>> +        qerror_report(QERR_OPEN_FILE_FAILED, filename);
>> +        error_printf("do_snapshot_blkdev(): Unable to open newly
>> created "
>> +                     "snapshot file: \n");
>> +        error_printf("%s. Attempting to revert to original image: %s\n",
>> +                     filename, old_filename);
> 
> You can't combine qerror_report with continued action.  qerror_report()
> should be a terminal action.  You also shouldn't combine error_printf()
> with qerror_report().
> 
> You should restore the original image file before doing qerror_report()
> and just drop the error_printf()s as it's all redundant information.

I would hardly consider it redundant information that it failed and we
try to go back to the original image. That is an error in itself, even
though rolling back is better than abort()ing.

If qerror_report() is a fatal situation, that is problematic. Then we
need qerror_warn() or something as well, which can return non fatal
information.

The printfs are very valuable for the human monitor, but it isn't really
clear to me what is the ideal return value.

Cheers,
Jes
Anthony Liguori March 7, 2011, 5:47 p.m. UTC | #4
On 03/07/2011 10:39 AM, Jes Sorensen wrote:
> On 03/07/11 17:34, Anthony Liguori wrote:
>> On 03/07/2011 09:27 AM, Jes.Sorensen@redhat.com wrote:
>>>        if (ret != 0) {
>>> -        abort();
>>> +        qerror_report(QERR_OPEN_FILE_FAILED, filename);
>>> +        error_printf("do_snapshot_blkdev(): Unable to open newly
>>> created "
>>> +                     "snapshot file: \n");
>>> +        error_printf("%s. Attempting to revert to original image: %s\n",
>>> +                     filename, old_filename);
>> You can't combine qerror_report with continued action.  qerror_report()
>> should be a terminal action.  You also shouldn't combine error_printf()
>> with qerror_report().
>>
>> You should restore the original image file before doing qerror_report()
>> and just drop the error_printf()s as it's all redundant information.
> I would hardly consider it redundant information that it failed and we
> try to go back to the original image. That is an error in itself, even
> though rolling back is better than abort()ing.
>
> If qerror_report() is a fatal situation, that is problematic.

It's fatal for the command, yes.  You should do qerror_report() in the 
exit path.

>   Then we
> need qerror_warn() or something as well, which can return non fatal
> information.

In your case, it's definitely a fatal error for the command.  The 
command is failing and you're just printing out information about the 
rollback action you're taking.

> The printfs are very valuable for the human monitor, but it isn't really
> clear to me what is the ideal return value.

But error_printf() is meaningless in the context of QMP.  You can 
reproduce these printfs in HMP based on the errors returned by QMP.

But if you're just doing an HMP command (and don't care about QMP) then 
you shouldn't use qerror_report().  But you need to care about QMP so 
you should focus on making it a well behaved QMP command.

BTW, there shouldn't be an abort() in any of these paths.  If you fail 
to reopen, just let the failure propagate.

Regards,

Anthony Liguori

> Cheers,
> Jes
>
Jes Sorensen March 8, 2011, 8:24 a.m. UTC | #5
On 03/07/11 18:47, Anthony Liguori wrote:
> On 03/07/2011 10:39 AM, Jes Sorensen wrote:
>> On 03/07/11 17:34, Anthony Liguori wrote:
>>> You should restore the original image file before doing qerror_report()
>>> and just drop the error_printf()s as it's all redundant information.
>> I would hardly consider it redundant information that it failed and we
>> try to go back to the original image. That is an error in itself, even
>> though rolling back is better than abort()ing.
>>
>> If qerror_report() is a fatal situation, that is problematic.
> 
> It's fatal for the command, yes.  You should do qerror_report() in the
> exit path.
> 
>>   Then we
>> need qerror_warn() or something as well, which can return non fatal
>> information.
> 
> In your case, it's definitely a fatal error for the command.  The
> command is failing and you're just printing out information about the
> rollback information you're taking.

I guess the disconnect here is the definition of fatal. Fatal in my book
means we're dead, toast, gone ..... hardly the case if we manage to fall
back to the previous image.

>> The printfs are very valuable for the human monitor, but it isn't really
>> clear to me what is the ideal return value.
> 
> But error_printf() is meaningless in the context of QMP.  You can
> reproduce these printfs in HMP based on the errors returned by QMP.
> 
> But if you're just doing an HMP command (and don't care about QMP) then
> you shouldn't use qerror_report().  But you need to care about QMP so
> you should focus on making it a well behaved QMP command.

The question here is then how to propagate the message back that we
failed to switch to the new image, but stayed on the old one, as opposed
to both of them failing? This part of QMP is really black magic and
there doesn't seem to be a good error for this. Time for a new QMP error?

> BTW, there shouldn't be an abort() in any of these paths.  If you fail
> to reopen, just let the failure propagate.

In this particular case it can be argued that the situation is so fatal
that it is better to fail than to let the guest continue.

Jes
Anthony Liguori March 8, 2011, 1:42 p.m. UTC | #6
On 03/08/2011 02:24 AM, Jes Sorensen wrote:
> On 03/07/11 18:47, Anthony Liguori wrote:
>> On 03/07/2011 10:39 AM, Jes Sorensen wrote:
>>> On 03/07/11 17:34, Anthony Liguori wrote:
>>>> You should restore the original image file before doing qerror_report()
>>>> and just drop the error_printf()s as it's all redundant information.
>>> I would hardly consider it redundant information that it failed and we
>>> try to go back to the original image. That is an error in itself, even
>>> though rolling back is better than abort()ing.
>>>
>>> If qerror_report() is a fatal situation, that is problematic.
>> It's fatal for the command, yes.  You should do qerror_report() in the
>> exit path.
>>
>>>    Then we
>>> need qerror_warn() or something as well, which can return non fatal
>>> information.
>> In your case, it's definitely a fatal error for the command.  The
>> command is failing and you're just printing out information about the
>> rollback information you're taking.
> I guess the disconnect here is the definition of fatal. Fatal in my book
> means we're dead, toast, gone ..... hardly the case if we manage to fail
> back to the previous image.

Let me put it another way, you can't call qerror_report twice because 
there is only one QMP error object sent in the protocol.  You 
potentially call qerror_report twice which breaks QMP.

The way you ought to structure things is to return to the old image, and 
then throw an error saying that you couldn't open the new image.

>>> The printfs are very valuable for the human monitor, but it isn't really
>>> clear to me what is the ideal return value.
>> But error_printf() is meaningless in the context of QMP.  You can
>> reproduce these printfs in HMP based on the errors returned by QMP.
>>
>> But if you're just doing an HMP command (and don't care about QMP) then
>> you shouldn't use qerror_report().  But you need to care about QMP so
>> you should focus on making it a well behaved QMP command.
> The question here is then how to propagate the message back that we
> failed to switch to the new image, but stayed on the old one, as opposed
> to both of them failing? This part of QMP is really black magic and
> there doesn't seem to be a good error for this. Time for a new QMP error?

If FileOpenFailed has the filename of the new image, then opening the 
file failed and we're using the old image.  If FileOpenFailed has the 
filename of the old image, we're toast.

That basically covers it, no?

Regards,

Anthony Liguori

>> BTW, there shouldn't be an abort() in any of these paths.  If you fail
>> to reopen, just let the failure propagate.
> In this particular case it can be argued that the situation is so fatal
> that it is better to fail than to let the guest continue.
>
> Jes
>
Jes Sorensen March 8, 2011, 4:44 p.m. UTC | #7
On 03/08/11 14:42, Anthony Liguori wrote:
> On 03/08/2011 02:24 AM, Jes Sorensen wrote:
>> On 03/07/11 18:47, Anthony Liguori wrote:
>>> In your case, it's definitely a fatal error for the command.  The
>>> command is failing and you're just printing out information about the
>>> rollback information you're taking.
>> I guess the disconnect here is the definition of fatal. Fatal in my book
>> means we're dead, toast, gone ..... hardly the case if we manage to fail
>> back to the previous image.
> 
> Let me put it another way, you can't call qerror_report twice because
> there is only one QMP error object sent in the protocol.  You
> potentially call qerror_report twice which breaks QMP.
> 
> The way you ought to structure things is to return to the old image, and
> then throw an error saying that you couldn't open the new image.

I see, I had the impression QMP would create multiple objects and pass
them along. Guess not.

Thanks for the explanation.

>>>> The printfs are very valuable for the human monitor, but it isn't
>>>> really
>>>> clear to me what is the ideal return value.
>>> But error_printf() is meaningless in the context of QMP.  You can
>>> reproduce these printfs in HMP based on the errors returned by QMP.
>>>
>>> But if you're just doing an HMP command (and don't care about QMP) then
>>> you shouldn't use qerror_report().  But you need to care about QMP so
>>> you should focus on making it a well behaved QMP command.
>> The question here is then how to propagate the message back that we
>> failed to switch to the new image, but stayed on the old one, as opposed
>> to both of them failing? This part of QMP is really black magic and
>> there doesn't seem to be a good error for this. Time for a new QMP error?
> 
> If FileOpenFailed has the filename of the new image, then opening the
> file failed and we're using the old image.  If FileOpenFailed has the
> filename of the old image, we're toast.
> 
> That basically covers it, no?

It kinda sorta covers it. The problem with that is that you then have to
do a string match of the return values to determine which of the cases
happened, which isn't very nice. But I guess we can do that for now.

I'll have a look.

Cheers,
Jes
Anthony Liguori March 8, 2011, 5:46 p.m. UTC | #8
On 03/08/2011 10:44 AM, Jes Sorensen wrote:
> On 03/08/11 14:42, Anthony Liguori wrote:
>> On 03/08/2011 02:24 AM, Jes Sorensen wrote:
>>> On 03/07/11 18:47, Anthony Liguori wrote:
>>>> In your case, it's definitely a fatal error for the command.  The
>>>> command is failing and you're just printing out information about the
>>>> rollback information you're taking.
>>> I guess the disconnect here is the definition of fatal. Fatal in my book
>>> means we're dead, toast, gone ..... hardly the case if we manage to fail
>>> back to the previous image.
>> Let me put it another way, you can't call qerror_report twice because
>> there is only one QMP error object sent in the protocol.  You
>> potentially call qerror_report twice which breaks QMP.
>>
>> The way you ought to structure things is to return to the old image, and
>> then throw an error saying that you couldn't open the new image.
> I see, I had the impression QMP would create multiple objects and pass
> them along. Guess not.

No, this is made clearer in QAPI because an error pointer is passed and 
you explicitly set the object.

>> If FileOpenFailed has the filename of the new image, then opening the
>> file failed and we're using the old image.  If FileOpenFailed has the
>> filename of the old image, we're toast.
>>
>> That basically covers it, no?
> It kinda sorta covers it. The problem with that is that you then have to
> do a string match of the return values to determine which of the cases
> happened, which isn't very nice. But I guess we can do that for now.

Right, but this can be done in the HMP command such that the HMP command 
still prints out the warning message.

The key is to have well documented error semantics where the various 
cases can be distinguished because then we can ensure that we can not 
only print out a nice error message in HMP, but that a remote QMP client 
(like libvirt) can also generate a high quality error message.

Regards,

Anthony Liguori

> I'll have a look.
>
> Cheers,
> Jes
>
Jes Sorensen March 9, 2011, 10:20 a.m. UTC | #9
On 03/08/11 18:46, Anthony Liguori wrote:
> On 03/08/2011 10:44 AM, Jes Sorensen wrote:
>> On 03/08/11 14:42, Anthony Liguori wrote:
>> It kinda sorta covers it. The problem with that is that you then have to
>> do a string match of the return values to determine which of the cases
>> happened, which isn't very nice. But I guess we can do that for now.
> 
> Right, but this can be done in the HMP command such that the HMP command
> still prints out the warning message.
> 
> The key is to have well documented error semantics where the various
> cases can be distinguished because then we can ensure that we can not
> only print out a nice error message in HMP, but that a remote QMP client
> (like libvirt) can also generate a high quality error message.

Have a look at v4 then, I've changed it to report errors back according
to this.

Jes
diff mbox

Patch

diff --git a/blockdev.c b/blockdev.c
index 0690cc8..d52eef0 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -574,9 +574,10 @@  int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
     const char *filename = qdict_get_try_str(qdict, "snapshot_file");
     const char *format = qdict_get_try_str(qdict, "format");
     BlockDriverState *bs;
-    BlockDriver *drv, *proto_drv;
+    BlockDriver *drv, *old_drv, *proto_drv;
     int ret = 0;
     int flags;
+    char old_filename[1024];
 
     if (!filename) {
         qerror_report(QERR_MISSING_PARAMETER, "snapshot_file");
@@ -591,6 +592,11 @@  int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
         goto out;
     }
 
+    pstrcpy(old_filename, sizeof(old_filename), bs->filename);
+
+    old_drv = bs->drv;
+    flags = bs->open_flags;
+
     if (!format) {
         format = "qcow2";
     }
@@ -610,7 +616,7 @@  int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
     }
 
     ret = bdrv_img_create(filename, format, bs->filename,
-                          bs->drv->format_name, NULL, -1, bs->open_flags);
+                          bs->drv->format_name, NULL, -1, flags);
     if (ret) {
         goto out;
     }
@@ -618,15 +624,26 @@  int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
     qemu_aio_flush();
     bdrv_flush(bs);
 
-    flags = bs->open_flags;
     bdrv_close(bs);
     ret = bdrv_open(bs, filename, flags, drv);
     /*
-     * If reopening the image file we just created fails, we really
-     * are in trouble :(
+     * If reopening the image file we just created fails, fall back
+     * and try to re-open the original image. If that fails too, we
+     * are in serious trouble.
      */
     if (ret != 0) {
-        abort();
+        qerror_report(QERR_OPEN_FILE_FAILED, filename);
+        error_printf("do_snapshot_blkdev(): Unable to open newly created "
+                     "snapshot file: \n");
+        error_printf("%s. Attempting to revert to original image: %s\n",
+                     filename, old_filename);
+        ret = bdrv_open(bs, old_filename, flags, old_drv);
+        if (ret != 0) {
+            error_printf("do_snapshot_blkdev(): Unable to re-open "
+                         "original image - aborting!\n");
+            qerror_report(QERR_OPEN_FILE_FAILED, old_filename);
+            abort();
+        }
     }
 out:
     if (ret) {