diff mbox series

[2/3] block/file-posix: File locking during creation

Message ID 20180420220913.27000-3-mreitz@redhat.com
State New
Headers show
Series block/file-posix: File locking during creation | expand

Commit Message

Max Reitz April 20, 2018, 10:09 p.m. UTC
When creating a file, we should take the WRITE and RESIZE permissions.
We do not need either for the creation itself, but we do need them for
clearing and resizing it.  So we can take the proper permissions by
replacing O_TRUNC with an explicit truncation to 0, and by taking the
appropriate file locks between those two steps.

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/file-posix.c | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

Comments

Fam Zheng April 27, 2018, 6:22 a.m. UTC | #1
On Sat, 04/21 00:09, Max Reitz wrote:
> When creating a file, we should take the WRITE and RESIZE permissions.
> We do not need either for the creation itself, but we do need them for
> clearing and resizing it.  So we can take the proper permissions by
> replacing O_TRUNC with an explicit truncation to 0, and by taking the
> appropriate file locks between those two steps.
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/file-posix.c | 41 +++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index c98a4a1556..ed7932d6e8 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1992,6 +1992,7 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
>  {
>      BlockdevCreateOptionsFile *file_opts;
>      int fd;
> +    int perm, shared;
>      int result = 0;
>  
>      /* Validate options and set default values */
> @@ -2006,14 +2007,48 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
>      }
>  
>      /* Create file */
> -    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
> -                   0644);
> +    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
>      if (fd < 0) {
>          result = -errno;
>          error_setg_errno(errp, -result, "Could not create file");
>          goto out;
>      }
>  
> +    /* Take permissions: We want to discard everything, so we need
> +     * BLK_PERM_WRITE; and truncation to the desired size requires
> +     * BLK_PERM_RESIZE.
> +     * On the other hand, we cannot share the RESIZE permission
> +     * because we promise that after this function, the file has the
> +     * size given in the options.  If someone else were to resize it
> +     * concurrently, we could not guarantee that. */

Strictly speaking, we do close(fd) before this function returns so the lock is
lost and race can happen.

> +    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
> +    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
> +
> +    /* Step one: Take locks in shared mode */
> +    result = raw_apply_lock_bytes(fd, perm, shared, false, errp);
> +    if (result < 0) {
> +        goto out_close;
> +    }
> +
> +    /* Step two: Try to get them exclusively */
> +    result = raw_check_lock_bytes(fd, perm, shared, errp);
> +    if (result < 0) {
> +        goto out_close;
> +    }
> +
> +    /* Step three: Downgrade them to shared again, and keep
> +     *             them that way until we are done */
> +    result = raw_apply_lock_bytes(fd, perm, shared, true, errp);

You comment it as "downgrade to shared" but what this actually does in addition
to step one is to "unlock" unneeded bytes which we haven't locked anyway. So I'm
confused why we need this stop. IIUC no downgrading is necessary after step one
and step two: only necessary shared locks are acquired in step one, and step two
is read-only op (F_OFD_GETLK).

> +    if (result < 0) {
> +        goto out_close;
> +    }
> +
> +    /* Clear the file by truncating it to 0 */
> +    result = raw_regular_truncate(fd, 0, PREALLOC_MODE_OFF, errp);
> +    if (result < 0) {
> +        goto out_close;
> +    }
> +
>      if (file_opts->nocow) {
>  #ifdef __linux__
>          /* Set NOCOW flag to solve performance issue on fs like btrfs.
> @@ -2029,6 +2064,8 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
>  #endif
>      }
>  
> +    /* Resize and potentially preallocate the file to the desired
> +     * final size */
>      result = raw_regular_truncate(fd, file_opts->size, file_opts->preallocation,
>                                    errp);
>      if (result < 0) {
> -- 
> 2.14.3
> 
> 

Fam
Max Reitz April 28, 2018, 11:03 a.m. UTC | #2
On 2018-04-27 08:22, Fam Zheng wrote:
> On Sat, 04/21 00:09, Max Reitz wrote:
>> When creating a file, we should take the WRITE and RESIZE permissions.
>> We do not need either for the creation itself, but we do need them for
>> clearing and resizing it.  So we can take the proper permissions by
>> replacing O_TRUNC with an explicit truncation to 0, and by taking the
>> appropriate file locks between those two steps.
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>>  block/file-posix.c | 41 +++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 39 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/file-posix.c b/block/file-posix.c
>> index c98a4a1556..ed7932d6e8 100644
>> --- a/block/file-posix.c
>> +++ b/block/file-posix.c
>> @@ -1992,6 +1992,7 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
>>  {
>>      BlockdevCreateOptionsFile *file_opts;
>>      int fd;
>> +    int perm, shared;
>>      int result = 0;
>>  
>>      /* Validate options and set default values */
>> @@ -2006,14 +2007,48 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
>>      }
>>  
>>      /* Create file */
>> -    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
>> -                   0644);
>> +    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
>>      if (fd < 0) {
>>          result = -errno;
>>          error_setg_errno(errp, -result, "Could not create file");
>>          goto out;
>>      }
>>  
>> +    /* Take permissions: We want to discard everything, so we need
>> +     * BLK_PERM_WRITE; and truncation to the desired size requires
>> +     * BLK_PERM_RESIZE.
>> +     * On the other hand, we cannot share the RESIZE permission
>> +     * because we promise that after this function, the file has the
>> +     * size given in the options.  If someone else were to resize it
>> +     * concurrently, we could not guarantee that. */
> 
> Strictly speaking, we do close(fd) before this function returns so the lock is
> lost and race can happen.

Right, but then creation from the perspective of file-posix is over.  We
are going to reopen the file for formatting, but that is just a normal
bdrv_open() so it will automatically be locked, no?

>> +    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
>> +    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
>> +
>> +    /* Step one: Take locks in shared mode */
>> +    result = raw_apply_lock_bytes(fd, perm, shared, false, errp);
>> +    if (result < 0) {
>> +        goto out_close;
>> +    }
>> +
>> +    /* Step two: Try to get them exclusively */
>> +    result = raw_check_lock_bytes(fd, perm, shared, errp);
>> +    if (result < 0) {
>> +        goto out_close;
>> +    }
>> +
>> +    /* Step three: Downgrade them to shared again, and keep
>> +     *             them that way until we are done */
>> +    result = raw_apply_lock_bytes(fd, perm, shared, true, errp);
> 
> You comment it as "downgrade to shared" but what this actually does in addition
> to step one is to "unlock" unneeded bytes which we haven't locked anyway. So I'm
> confused why we need this stop. IIUC no downgrading is necessary after step one
> and step two: only necessary shared locks are acquired in step one, and step two
> is read-only op (F_OFD_GETLK).

Oops.  For some reason I thought raw_check_lock_bytes() would take the
locks exclusively.  Yes, then we don't need this third step.

Thanks for reviewing!

Max

>> +    if (result < 0) {
>> +        goto out_close;
>> +    }
>> +
>> +    /* Clear the file by truncating it to 0 */
>> +    result = raw_regular_truncate(fd, 0, PREALLOC_MODE_OFF, errp);
>> +    if (result < 0) {
>> +        goto out_close;
>> +    }
>> +
>>      if (file_opts->nocow) {
>>  #ifdef __linux__
>>          /* Set NOCOW flag to solve performance issue on fs like btrfs.
>> @@ -2029,6 +2064,8 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
>>  #endif
>>      }
>>  
>> +    /* Resize and potentially preallocate the file to the desired
>> +     * final size */
>>      result = raw_regular_truncate(fd, file_opts->size, file_opts->preallocation,
>>                                    errp);
>>      if (result < 0) {
>> -- 
>> 2.14.3
>>
>>
> 
> Fam
>
Fam Zheng May 3, 2018, 6:45 a.m. UTC | #3
On Sat, 04/28 13:03, Max Reitz wrote:
> On 2018-04-27 08:22, Fam Zheng wrote:
> > On Sat, 04/21 00:09, Max Reitz wrote:
> >> When creating a file, we should take the WRITE and RESIZE permissions.
> >> We do not need either for the creation itself, but we do need them for
> >> clearing and resizing it.  So we can take the proper permissions by
> >> replacing O_TRUNC with an explicit truncation to 0, and by taking the
> >> appropriate file locks between those two steps.
> >>
> >> Signed-off-by: Max Reitz <mreitz@redhat.com>
> >> ---
> >>  block/file-posix.c | 41 +++++++++++++++++++++++++++++++++++++++--
> >>  1 file changed, 39 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/block/file-posix.c b/block/file-posix.c
> >> index c98a4a1556..ed7932d6e8 100644
> >> --- a/block/file-posix.c
> >> +++ b/block/file-posix.c
> >> @@ -1992,6 +1992,7 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
> >>  {
> >>      BlockdevCreateOptionsFile *file_opts;
> >>      int fd;
> >> +    int perm, shared;
> >>      int result = 0;
> >>  
> >>      /* Validate options and set default values */
> >> @@ -2006,14 +2007,48 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
> >>      }
> >>  
> >>      /* Create file */
> >> -    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
> >> -                   0644);
> >> +    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
> >>      if (fd < 0) {
> >>          result = -errno;
> >>          error_setg_errno(errp, -result, "Could not create file");
> >>          goto out;
> >>      }
> >>  
> >> +    /* Take permissions: We want to discard everything, so we need
> >> +     * BLK_PERM_WRITE; and truncation to the desired size requires
> >> +     * BLK_PERM_RESIZE.
> >> +     * On the other hand, we cannot share the RESIZE permission
> >> +     * because we promise that after this function, the file has the
> >> +     * size given in the options.  If someone else were to resize it
> >> +     * concurrently, we could not guarantee that. */
> > 
> > Strictly speaking, we do close(fd) before this function returns so the lock is
> > lost and race can happen.
> 
> Right, but then creation from the perspective of file-posix is over.  We
> are going to reopen the file for formatting, but that is just a normal
> bdrv_open() so it will automatically be locked, no?

After this function close() but before the following bdrv_open(), another
process can sneak in and steal the lock. So we cannot guarantee the RESIZE lock
does what we really want.

Fam

> 
> >> +    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
> >> +    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
> >> +
> >> +    /* Step one: Take locks in shared mode */
> >> +    result = raw_apply_lock_bytes(fd, perm, shared, false, errp);
> >> +    if (result < 0) {
> >> +        goto out_close;
> >> +    }
> >> +
> >> +    /* Step two: Try to get them exclusively */
> >> +    result = raw_check_lock_bytes(fd, perm, shared, errp);
> >> +    if (result < 0) {
> >> +        goto out_close;
> >> +    }
> >> +
> >> +    /* Step three: Downgrade them to shared again, and keep
> >> +     *             them that way until we are done */
> >> +    result = raw_apply_lock_bytes(fd, perm, shared, true, errp);
> > 
> > You comment it as "downgrade to shared" but what this actually does in addition
> > to step one is to "unlock" unneeded bytes which we haven't locked anyway. So I'm
> > confused why we need this stop. IIUC no downgrading is necessary after step one
> > and step two: only necessary shared locks are acquired in step one, and step two
> > is read-only op (F_OFD_GETLK).
> 
> Oops.  For some reason I thought raw_check_lock_bytes() would take the
> locks exclusively.  Yes, then we don't need this third step.
> 
> Thanks for reviewing!
> 
> Max
> 
> >> +    if (result < 0) {
> >> +        goto out_close;
> >> +    }
> >> +
> >> +    /* Clear the file by truncating it to 0 */
> >> +    result = raw_regular_truncate(fd, 0, PREALLOC_MODE_OFF, errp);
> >> +    if (result < 0) {
> >> +        goto out_close;
> >> +    }
> >> +
> >>      if (file_opts->nocow) {
> >>  #ifdef __linux__
> >>          /* Set NOCOW flag to solve performance issue on fs like btrfs.
> >> @@ -2029,6 +2064,8 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
> >>  #endif
> >>      }
> >>  
> >> +    /* Resize and potentially preallocate the file to the desired
> >> +     * final size */
> >>      result = raw_regular_truncate(fd, file_opts->size, file_opts->preallocation,
> >>                                    errp);
> >>      if (result < 0) {
> >> -- 
> >> 2.14.3
> >>
> >>
> > 
> > Fam
> > 
> 
>
Max Reitz May 4, 2018, 1:45 p.m. UTC | #4
On 2018-05-03 08:45, Fam Zheng wrote:
> On Sat, 04/28 13:03, Max Reitz wrote:
>> On 2018-04-27 08:22, Fam Zheng wrote:
>>> On Sat, 04/21 00:09, Max Reitz wrote:
>>>> When creating a file, we should take the WRITE and RESIZE permissions.
>>>> We do not need either for the creation itself, but we do need them for
>>>> clearing and resizing it.  So we can take the proper permissions by
>>>> replacing O_TRUNC with an explicit truncation to 0, and by taking the
>>>> appropriate file locks between those two steps.
>>>>
>>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>>>> ---
>>>>  block/file-posix.c | 41 +++++++++++++++++++++++++++++++++++++++--
>>>>  1 file changed, 39 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>>> index c98a4a1556..ed7932d6e8 100644
>>>> --- a/block/file-posix.c
>>>> +++ b/block/file-posix.c
>>>> @@ -1992,6 +1992,7 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
>>>>  {
>>>>      BlockdevCreateOptionsFile *file_opts;
>>>>      int fd;
>>>> +    int perm, shared;
>>>>      int result = 0;
>>>>  
>>>>      /* Validate options and set default values */
>>>> @@ -2006,14 +2007,48 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
>>>>      }
>>>>  
>>>>      /* Create file */
>>>> -    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
>>>> -                   0644);
>>>> +    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
>>>>      if (fd < 0) {
>>>>          result = -errno;
>>>>          error_setg_errno(errp, -result, "Could not create file");
>>>>          goto out;
>>>>      }
>>>>  
>>>> +    /* Take permissions: We want to discard everything, so we need
>>>> +     * BLK_PERM_WRITE; and truncation to the desired size requires
>>>> +     * BLK_PERM_RESIZE.
>>>> +     * On the other hand, we cannot share the RESIZE permission
>>>> +     * because we promise that after this function, the file has the
>>>> +     * size given in the options.  If someone else were to resize it
>>>> +     * concurrently, we could not guarantee that. */
>>>
>>> Strictly speaking, we do close(fd) before this function returns so the lock is
>>> lost and race can happen.
>>
>> Right, but then creation from the perspective of file-posix is over.  We
>> are going to reopen the file for formatting, but that is just a normal
>> bdrv_open() so it will automatically be locked, no?
> 
> After this function close() but before the following bdrv_open(), another
> process can sneak in and steal the lock. So we cannot guarantee the RESIZE lock
> does what we really want.

Right, but I'd argue that is not the purpose of the lock.  If someone
wants the file (or then rather the node) to be a certain size, they'd
have to call bdrv_getlength() on it to check.

But indeed you're right in that not sharing the RESIZE is hypocritical
considering it actually doesn't promise anything.

Anyway, let's consider the issues.  For formatting the file, this
behavior is OK because since Kevin's x-blockdev-create series format
drivers always truncate the file during formatting (that is, once they
have opened the file), and not during creation, so that part is secured.

But we do have issues with e.g. drive-mirror, where we create an image
and then open it.  The opened image may have a different size than what
we wanted to create.  Now in this case I wouldn't consider this a big
problem because drive-mirror is basically deprecated anyway, so there is
no reason to waste resources here; and this applies to all of the QMP
commands that create images and then open them automatically.

In the future, we want users to use blockdev-create and then
blockdev-add in two separate steps.  Then it's up to the user to realize
that the blockdev-add'ed node may have a different length then what they
wanted to achieve in blockdev-create.  Actually, it may just differ
altogether, I don't think qemu is going to make any promises on the
state of the image in the interim.

So...  I suppose there aren't any real issues with not being able to
promise that the image has the intended length immediately after
creating it.  So I guess we can indeed share RESIZE.

But OTOH, it definitely does not feel right to me to share RESIZE.  We
definitely do not want other parties to resize the image while we create it.

So I guess what I would like to do is keep RESIZE not shared and add a
note after the comment:

> Note that after this function, we can no longer guarantee that the
> file is not touched by a third party, so it may be resized then.

Ideally, we'd want the lock to stay active until blockdev-create or
qemu-img create returns, but I don't think that is really worth it.  If
there is a race condition between raw_co_create() returning and those
commands returning (and we don't have a format layer to correct things),
then I can't imagine that it couldn't bite you after those commands have
returned.  (And after those commands, it isn't our problem anymore anyway.)

Max
Fam Zheng May 7, 2018, 7:23 a.m. UTC | #5
On Fri, 05/04 15:45, Max Reitz wrote:
> On 2018-05-03 08:45, Fam Zheng wrote:
> > On Sat, 04/28 13:03, Max Reitz wrote:
> >> On 2018-04-27 08:22, Fam Zheng wrote:
> >>> On Sat, 04/21 00:09, Max Reitz wrote:
> >>>> When creating a file, we should take the WRITE and RESIZE permissions.
> >>>> We do not need either for the creation itself, but we do need them for
> >>>> clearing and resizing it.  So we can take the proper permissions by
> >>>> replacing O_TRUNC with an explicit truncation to 0, and by taking the
> >>>> appropriate file locks between those two steps.
> >>>>
> >>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
> >>>> ---
> >>>>  block/file-posix.c | 41 +++++++++++++++++++++++++++++++++++++++--
> >>>>  1 file changed, 39 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/block/file-posix.c b/block/file-posix.c
> >>>> index c98a4a1556..ed7932d6e8 100644
> >>>> --- a/block/file-posix.c
> >>>> +++ b/block/file-posix.c
> >>>> @@ -1992,6 +1992,7 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
> >>>>  {
> >>>>      BlockdevCreateOptionsFile *file_opts;
> >>>>      int fd;
> >>>> +    int perm, shared;
> >>>>      int result = 0;
> >>>>  
> >>>>      /* Validate options and set default values */
> >>>> @@ -2006,14 +2007,48 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
> >>>>      }
> >>>>  
> >>>>      /* Create file */
> >>>> -    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
> >>>> -                   0644);
> >>>> +    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
> >>>>      if (fd < 0) {
> >>>>          result = -errno;
> >>>>          error_setg_errno(errp, -result, "Could not create file");
> >>>>          goto out;
> >>>>      }
> >>>>  
> >>>> +    /* Take permissions: We want to discard everything, so we need
> >>>> +     * BLK_PERM_WRITE; and truncation to the desired size requires
> >>>> +     * BLK_PERM_RESIZE.
> >>>> +     * On the other hand, we cannot share the RESIZE permission
> >>>> +     * because we promise that after this function, the file has the
> >>>> +     * size given in the options.  If someone else were to resize it
> >>>> +     * concurrently, we could not guarantee that. */
> >>>
> >>> Strictly speaking, we do close(fd) before this function returns so the lock is
> >>> lost and race can happen.
> >>
> >> Right, but then creation from the perspective of file-posix is over.  We
> >> are going to reopen the file for formatting, but that is just a normal
> >> bdrv_open() so it will automatically be locked, no?
> > 
> > After this function close() but before the following bdrv_open(), another
> > process can sneak in and steal the lock. So we cannot guarantee the RESIZE lock
> > does what we really want.
> 
> Right, but I'd argue that is not the purpose of the lock.  If someone
> wants the file (or then rather the node) to be a certain size, they'd
> have to call bdrv_getlength() on it to check.
> 
> But indeed you're right in that not sharing the RESIZE is hypocritical
> considering it actually doesn't promise anything.
> 
> Anyway, let's consider the issues.  For formatting the file, this
> behavior is OK because since Kevin's x-blockdev-create series format
> drivers always truncate the file during formatting (that is, once they
> have opened the file), and not during creation, so that part is secured.
> 
> But we do have issues with e.g. drive-mirror, where we create an image
> and then open it.  The opened image may have a different size than what
> we wanted to create.  Now in this case I wouldn't consider this a big
> problem because drive-mirror is basically deprecated anyway, so there is
> no reason to waste resources here; and this applies to all of the QMP
> commands that create images and then open them automatically.
> 
> In the future, we want users to use blockdev-create and then
> blockdev-add in two separate steps.  Then it's up to the user to realize
> that the blockdev-add'ed node may have a different length then what they
> wanted to achieve in blockdev-create.  Actually, it may just differ
> altogether, I don't think qemu is going to make any promises on the
> state of the image in the interim.
> 
> So...  I suppose there aren't any real issues with not being able to
> promise that the image has the intended length immediately after
> creating it.  So I guess we can indeed share RESIZE.
> 
> But OTOH, it definitely does not feel right to me to share RESIZE.  We
> definitely do not want other parties to resize the image while we create it.
> 
> So I guess what I would like to do is keep RESIZE not shared and add a
> note after the comment:
> 
> > Note that after this function, we can no longer guarantee that the
> > file is not touched by a third party, so it may be resized then.
> 
> Ideally, we'd want the lock to stay active until blockdev-create or
> qemu-img create returns, but I don't think that is really worth it.  If
> there is a race condition between raw_co_create() returning and those
> commands returning (and we don't have a format layer to correct things),
> then I can't imagine that it couldn't bite you after those commands have
> returned.  (And after those commands, it isn't our problem anymore anyway.)

Yes, what you said makes total sense to me. Having a comment and _not_ sharing
RESIZE seems the most reasonable.

Technically we could extend the API to be able to hold the fd after
blockdev-create returns by introducing an explicit blockdev-create-cleanup
command, but like you said it is probably not worth it.

Fam
diff mbox series

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index c98a4a1556..ed7932d6e8 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1992,6 +1992,7 @@  static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
 {
     BlockdevCreateOptionsFile *file_opts;
     int fd;
+    int perm, shared;
     int result = 0;
 
     /* Validate options and set default values */
@@ -2006,14 +2007,48 @@  static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
     }
 
     /* Create file */
-    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
-                   0644);
+    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
     if (fd < 0) {
         result = -errno;
         error_setg_errno(errp, -result, "Could not create file");
         goto out;
     }
 
+    /* Take permissions: We want to discard everything, so we need
+     * BLK_PERM_WRITE; and truncation to the desired size requires
+     * BLK_PERM_RESIZE.
+     * On the other hand, we cannot share the RESIZE permission
+     * because we promise that after this function, the file has the
+     * size given in the options.  If someone else were to resize it
+     * concurrently, we could not guarantee that. */
+    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
+    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
+
+    /* Step one: Take locks in shared mode */
+    result = raw_apply_lock_bytes(fd, perm, shared, false, errp);
+    if (result < 0) {
+        goto out_close;
+    }
+
+    /* Step two: Try to get them exclusively */
+    result = raw_check_lock_bytes(fd, perm, shared, errp);
+    if (result < 0) {
+        goto out_close;
+    }
+
+    /* Step three: Downgrade them to shared again, and keep
+     *             them that way until we are done */
+    result = raw_apply_lock_bytes(fd, perm, shared, true, errp);
+    if (result < 0) {
+        goto out_close;
+    }
+
+    /* Clear the file by truncating it to 0 */
+    result = raw_regular_truncate(fd, 0, PREALLOC_MODE_OFF, errp);
+    if (result < 0) {
+        goto out_close;
+    }
+
     if (file_opts->nocow) {
 #ifdef __linux__
         /* Set NOCOW flag to solve performance issue on fs like btrfs.
@@ -2029,6 +2064,8 @@  static int raw_co_create(BlockdevCreateOptions *options, Error **errp)
 #endif
     }
 
+    /* Resize and potentially preallocate the file to the desired
+     * final size */
     result = raw_regular_truncate(fd, file_opts->size, file_opts->preallocation,
                                   errp);
     if (result < 0) {