diff mbox series

[4/4] um: Add support for DISCARD in the UBD Driver

Message ID 20181113115947.19290-5-anton.ivanov@cambridgegreys.com
State Superseded
Headers show
Series [1/4] um: Switch to block-mq constants in the UML UBD driver | expand

Commit Message

Anton Ivanov Nov. 13, 2018, 11:59 a.m. UTC
From: Anton Ivanov <anton.ivanov@cambridgegreys.com>

Support for DISCARD and WRITE_ZEROES in the ubd driver using
fallocate. DISCARD capable devices are autodetected at open
time.

Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
---
 arch/um/drivers/Kconfig     | 21 +++++++++++++++
 arch/um/drivers/ubd_kern.c  | 66 ++++++++++++++++++++++++++++++++++++++-------
 arch/um/include/shared/os.h |  1 +
 arch/um/os-Linux/file.c     | 10 +++++++
 4 files changed, 89 insertions(+), 9 deletions(-)

Comments

Jens Axboe Nov. 13, 2018, 1:40 p.m. UTC | #1
On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote:
> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
> index 2b1aaf7755aa..e817fd4a4c30 100644
> --- a/arch/um/drivers/Kconfig
> +++ b/arch/um/drivers/Kconfig
> @@ -1,4 +1,25 @@
>  # SPDX-License-Identifier: GPL-2.0
> +menu "UML Block Devices"
> +
> +config BLK_DEV_UBD
> +	bool "UBD Block Device"
> +	default y
> +	help
> +          User Mode Linux virtual block device driver
> +
> +config BLK_DEV_UBD_SYNC
> +	bool "Use Synchronous mode for UBD"
> +	default n
> +	help
> +          Perform all disk operations synchronously (extremely slow).
> +
> +config BLK_DEV_UBD_DISCARD
> +	bool "Enable DISCARD/TRIM support in UBD"
> +	default y
> +	help
> +          Enable discard/trim support. Requires host kernel 3.x or above and
> +	  may not be supported on all host filesystems.
> +endmenu

kconfig entries are horrible ways to manage this. A module parameter
would be better. The latter also gets rid of the various ifdefs.

> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
> index e78f27f5ce88..4fd4cb68f033 100644
> --- a/arch/um/drivers/ubd_kern.c
> +++ b/arch/um/drivers/ubd_kern.c
> @@ -513,9 +513,13 @@ static void ubd_handler(void)
>  		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
>  			struct io_thread_req *io_req = (*irq_req_buffer)[count];
>  
> -			if (!blk_update_request(io_req->req, io_req->error, io_req->length))
> -				__blk_mq_end_request(io_req->req, io_req->error);
> -
> +			if ((io_req->error) || (io_req->buffer == NULL)) {
> +				blk_mq_end_request(io_req->req, io_req->error);
> +			} else {
> +				if (!blk_update_request(io_req->req, io_req->error, io_req->length)) {
> +					__blk_mq_end_request(io_req->req, io_req->error);
> +				}
> +			}
>  			kfree(io_req);

This looks odd, why is this change necessary? Also a lot of unneeded
parentheses.

>  		}
>  	}
> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>  	char **back_ptr;
>  	int err, create_cow, *create_ptr;
>  	int fd;
> +	int data;
>  
>  	ubd_dev->openflags = ubd_dev->boot_openflags;
>  	create_cow = 0;
> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>  		if(err < 0) goto error;
>  		ubd_dev->cow.fd = err;
>  	}
> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD
> +	if (ubd_dev->cow.file != NULL)
> +		fd = ubd_dev->cow.fd;
> +	else
> +		fd = ubd_dev->fd;
> +	err = os_pread_file(fd, &data, sizeof(int), 0);
> +	if (err == sizeof(int)) {
> +		err = os_falloc_punch(fd, 0, sizeof(int));
> +		if (err == 0) {

It's not clear to me how it's safe to punch a 4-byte hole in the file as
a test?
Anton Ivanov Nov. 13, 2018, 2:04 p.m. UTC | #2
On 13/11/2018 13:40, Jens Axboe wrote:
> On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote:
>> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
>> index 2b1aaf7755aa..e817fd4a4c30 100644
>> --- a/arch/um/drivers/Kconfig
>> +++ b/arch/um/drivers/Kconfig
>> @@ -1,4 +1,25 @@
>>   # SPDX-License-Identifier: GPL-2.0
>> +menu "UML Block Devices"
>> +
>> +config BLK_DEV_UBD
>> +	bool "UBD Block Device"
>> +	default y
>> +	help
>> +          User Mode Linux virtual block device driver
>> +
>> +config BLK_DEV_UBD_SYNC
>> +	bool "Use Synchronous mode for UBD"
>> +	default n
>> +	help
>> +          Perform all disk operations synchronously (extremely slow).
>> +
>> +config BLK_DEV_UBD_DISCARD
>> +	bool "Enable DISCARD/TRIM support in UBD"
>> +	default y
>> +	help
>> +          Enable discard/trim support. Requires host kernel 3.x or above and
>> +	  may not be supported on all host filesystems.
>> +endmenu
> kconfig entries are horrible ways to manage this. A module parameter
> would be better. The latter also gets rid of the various ifdefs.

I completely agree with you. Unfortunately, UBD for a number of reasons 
cannot be a module at present. It contains "host" side and loading 
something from the "inside" which breaks the hypervisor boundary is a 
definitive no go.

In order to implement the suggestion we need the concept of loadable 
host side modules and while that is possible the likelihood of someone 
doing all the work on this one at present is rather low.

>
>> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
>> index e78f27f5ce88..4fd4cb68f033 100644
>> --- a/arch/um/drivers/ubd_kern.c
>> +++ b/arch/um/drivers/ubd_kern.c
>> @@ -513,9 +513,13 @@ static void ubd_handler(void)
>>   		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
>>   			struct io_thread_req *io_req = (*irq_req_buffer)[count];
>>   
>> -			if (!blk_update_request(io_req->req, io_req->error, io_req->length))
>> -				__blk_mq_end_request(io_req->req, io_req->error);
>> -
>> +			if ((io_req->error) || (io_req->buffer == NULL)) {
>> +				blk_mq_end_request(io_req->req, io_req->error);
>> +			} else {
>> +				if (!blk_update_request(io_req->req, io_req->error, io_req->length)) {
>> +					__blk_mq_end_request(io_req->req, io_req->error);
>> +				}
>> +			}
>>   			kfree(io_req);
> This looks odd, why is this change necessary? Also a lot of unneeded
> parentheses.

Loop and NBD do it that way. I can retest with the original code now 
that the underlying culprit is known and resubmit if that works.

>
>>   		}
>>   	}
>> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>>   	char **back_ptr;
>>   	int err, create_cow, *create_ptr;
>>   	int fd;
>> +	int data;
>>   
>>   	ubd_dev->openflags = ubd_dev->boot_openflags;
>>   	create_cow = 0;
>> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>>   		if(err < 0) goto error;
>>   		ubd_dev->cow.fd = err;
>>   	}
>> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD
>> +	if (ubd_dev->cow.file != NULL)
>> +		fd = ubd_dev->cow.fd;
>> +	else
>> +		fd = ubd_dev->fd;
>> +	err = os_pread_file(fd, &data, sizeof(int), 0);
>> +	if (err == sizeof(int)) {
>> +		err = os_falloc_punch(fd, 0, sizeof(int));
>> +		if (err == 0) {
> It's not clear to me how it's safe to punch a 4-byte hole in the file as
> a test?

A sizeof(int) sized hole is always implemented by simply writing zeroes. 
There is no fs manipulation here.

The test reads the contents of the file, punches a hoile and writes the 
content back if the hole was punched successfully so the file is in its 
orginal state.

Life would have been much easier if fallocate did not return -EINVAL on 
a 0 sized request. Unfortunately it does so any test has no choice but 
to punch a hole for real and fix the damage after that.

The location is completely safe for most UMLs UBDs. The most common 
method of building images is to format a device directly with ext2/3/4 - 
there is no partition table. The first sector in ext2/3/4 if memory 
serves me right is unused. Other fs - less so. Partition table - totally 
safe as the boot code portion is unused.

In any case, the code restores the data to its pre-test state.
Jens Axboe Nov. 13, 2018, 3:18 p.m. UTC | #3
On 11/13/18 7:04 AM, Anton Ivanov wrote:
> On 13/11/2018 13:40, Jens Axboe wrote:
>> On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote:
>>> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
>>> index 2b1aaf7755aa..e817fd4a4c30 100644
>>> --- a/arch/um/drivers/Kconfig
>>> +++ b/arch/um/drivers/Kconfig
>>> @@ -1,4 +1,25 @@
>>>   # SPDX-License-Identifier: GPL-2.0
>>> +menu "UML Block Devices"
>>> +
>>> +config BLK_DEV_UBD
>>> +	bool "UBD Block Device"
>>> +	default y
>>> +	help
>>> +          User Mode Linux virtual block device driver
>>> +
>>> +config BLK_DEV_UBD_SYNC
>>> +	bool "Use Synchronous mode for UBD"
>>> +	default n
>>> +	help
>>> +          Perform all disk operations synchronously (extremely slow).
>>> +
>>> +config BLK_DEV_UBD_DISCARD
>>> +	bool "Enable DISCARD/TRIM support in UBD"
>>> +	default y
>>> +	help
>>> +          Enable discard/trim support. Requires host kernel 3.x or above and
>>> +	  may not be supported on all host filesystems.
>>> +endmenu
>> kconfig entries are horrible ways to manage this. A module parameter
>> would be better. The latter also gets rid of the various ifdefs.
> 
> I completely agree with you. Unfortunately, UBD for a number of reasons 
> cannot be a module at present. It contains "host" side and loading 
> something from the "inside" which breaks the hypervisor boundary is a 
> definitive no go.
> 
> In order to implement the suggestion we need the concept of loadable 
> host side modules and while that is possible the likelihood of someone 
> doing all the work on this one at present is rather low.

Module parameters work for builtin as well, then you just add
ubd.discard=true to the kernel command line.

>>> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
>>> index e78f27f5ce88..4fd4cb68f033 100644
>>> --- a/arch/um/drivers/ubd_kern.c
>>> +++ b/arch/um/drivers/ubd_kern.c
>>> @@ -513,9 +513,13 @@ static void ubd_handler(void)
>>>   		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
>>>   			struct io_thread_req *io_req = (*irq_req_buffer)[count];
>>>   
>>> -			if (!blk_update_request(io_req->req, io_req->error, io_req->length))
>>> -				__blk_mq_end_request(io_req->req, io_req->error);
>>> -
>>> +			if ((io_req->error) || (io_req->buffer == NULL)) {
>>> +				blk_mq_end_request(io_req->req, io_req->error);
>>> +			} else {
>>> +				if (!blk_update_request(io_req->req, io_req->error, io_req->length)) {
>>> +					__blk_mq_end_request(io_req->req, io_req->error);
>>> +				}
>>> +			}
>>>   			kfree(io_req);
>> This looks odd, why is this change necessary? Also a lot of unneeded
>> parentheses.
> 
> Loop and NBD do it that way. I can retest with the original code now 
> that the underlying culprit is known and resubmit if that works.

I missed the __ part, yes it actually looks fine. blk_mq_end_request() will
both end the bio parts and free the request, your latter part does incremental
completes.

>>> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>>>   	char **back_ptr;
>>>   	int err, create_cow, *create_ptr;
>>>   	int fd;
>>> +	int data;
>>>   
>>>   	ubd_dev->openflags = ubd_dev->boot_openflags;
>>>   	create_cow = 0;
>>> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>>>   		if(err < 0) goto error;
>>>   		ubd_dev->cow.fd = err;
>>>   	}
>>> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD
>>> +	if (ubd_dev->cow.file != NULL)
>>> +		fd = ubd_dev->cow.fd;
>>> +	else
>>> +		fd = ubd_dev->fd;
>>> +	err = os_pread_file(fd, &data, sizeof(int), 0);
>>> +	if (err == sizeof(int)) {
>>> +		err = os_falloc_punch(fd, 0, sizeof(int));
>>> +		if (err == 0) {
>> It's not clear to me how it's safe to punch a 4-byte hole in the file as
>> a test?
> 
> A sizeof(int) sized hole is always implemented by simply writing zeroes. 
> There is no fs manipulation here.
> 
> The test reads the contents of the file, punches a hoile and writes the 
> content back if the hole was punched successfully so the file is in its 
> orginal state.
> 
> Life would have been much easier if fallocate did not return -EINVAL on 
> a 0 sized request. Unfortunately it does so any test has no choice but 
> to punch a hole for real and fix the damage after that.
> 
> The location is completely safe for most UMLs UBDs. The most common 
> method of building images is to format a device directly with ext2/3/4 - 
> there is no partition table. The first sector in ext2/3/4 if memory 
> serves me right is unused. Other fs - less so. Partition table - totally 
> safe as the boot code portion is unused.
> 
> In any case, the code restores the data to its pre-test state.

It's a bit nasty... You'd probably want to do a full sector though, the
fs isn't going to be issuing a trim for anything that small.

What happens if the system crashes before you successfully write back
the data you zeroed? Doesn't seem like a really safe way to do this.
Anton Ivanov Nov. 13, 2018, 3:32 p.m. UTC | #4
On 11/13/18 3:18 PM, Jens Axboe wrote:
> On 11/13/18 7:04 AM, Anton Ivanov wrote:
>> On 13/11/2018 13:40, Jens Axboe wrote:
>>> On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote:
>>>> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
>>>> index 2b1aaf7755aa..e817fd4a4c30 100644
>>>> --- a/arch/um/drivers/Kconfig
>>>> +++ b/arch/um/drivers/Kconfig
>>>> @@ -1,4 +1,25 @@
>>>>    # SPDX-License-Identifier: GPL-2.0
>>>> +menu "UML Block Devices"
>>>> +
>>>> +config BLK_DEV_UBD
>>>> +	bool "UBD Block Device"
>>>> +	default y
>>>> +	help
>>>> +          User Mode Linux virtual block device driver
>>>> +
>>>> +config BLK_DEV_UBD_SYNC
>>>> +	bool "Use Synchronous mode for UBD"
>>>> +	default n
>>>> +	help
>>>> +          Perform all disk operations synchronously (extremely slow).
>>>> +
>>>> +config BLK_DEV_UBD_DISCARD
>>>> +	bool "Enable DISCARD/TRIM support in UBD"
>>>> +	default y
>>>> +	help
>>>> +          Enable discard/trim support. Requires host kernel 3.x or above and
>>>> +	  may not be supported on all host filesystems.
>>>> +endmenu
>>> kconfig entries are horrible ways to manage this. A module parameter
>>> would be better. The latter also gets rid of the various ifdefs.
>> I completely agree with you. Unfortunately, UBD for a number of reasons
>> cannot be a module at present. It contains "host" side and loading
>> something from the "inside" which breaks the hypervisor boundary is a
>> definitive no go.
>>
>> In order to implement the suggestion we need the concept of loadable
>> host side modules and while that is possible the likelihood of someone
>> doing all the work on this one at present is rather low.
> Module parameters work for builtin as well, then you just add
> ubd.discard=true to the kernel command line.
>
>>>> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
>>>> index e78f27f5ce88..4fd4cb68f033 100644
>>>> --- a/arch/um/drivers/ubd_kern.c
>>>> +++ b/arch/um/drivers/ubd_kern.c
>>>> @@ -513,9 +513,13 @@ static void ubd_handler(void)
>>>>    		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
>>>>    			struct io_thread_req *io_req = (*irq_req_buffer)[count];
>>>>    
>>>> -			if (!blk_update_request(io_req->req, io_req->error, io_req->length))
>>>> -				__blk_mq_end_request(io_req->req, io_req->error);
>>>> -
>>>> +			if ((io_req->error) || (io_req->buffer == NULL)) {
>>>> +				blk_mq_end_request(io_req->req, io_req->error);
>>>> +			} else {
>>>> +				if (!blk_update_request(io_req->req, io_req->error, io_req->length)) {
>>>> +					__blk_mq_end_request(io_req->req, io_req->error);
>>>> +				}
>>>> +			}
>>>>    			kfree(io_req);
>>> This looks odd, why is this change necessary? Also a lot of unneeded
>>> parentheses.
>> Loop and NBD do it that way. I can retest with the original code now
>> that the underlying culprit is known and resubmit if that works.
> I missed the __ part, yes it actually looks fine. blk_mq_end_request() will
> both end the bio parts and free the request, your latter part does incremental
> completes.
>
>>>> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>>>>    	char **back_ptr;
>>>>    	int err, create_cow, *create_ptr;
>>>>    	int fd;
>>>> +	int data;
>>>>    
>>>>    	ubd_dev->openflags = ubd_dev->boot_openflags;
>>>>    	create_cow = 0;
>>>> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>>>>    		if(err < 0) goto error;
>>>>    		ubd_dev->cow.fd = err;
>>>>    	}
>>>> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD
>>>> +	if (ubd_dev->cow.file != NULL)
>>>> +		fd = ubd_dev->cow.fd;
>>>> +	else
>>>> +		fd = ubd_dev->fd;
>>>> +	err = os_pread_file(fd, &data, sizeof(int), 0);
>>>> +	if (err == sizeof(int)) {
>>>> +		err = os_falloc_punch(fd, 0, sizeof(int));
>>>> +		if (err == 0) {
>>> It's not clear to me how it's safe to punch a 4-byte hole in the file as
>>> a test?
>> A sizeof(int) sized hole is always implemented by simply writing zeroes.
>> There is no fs manipulation here.
>>
>> The test reads the contents of the file, punches a hoile and writes the
>> content back if the hole was punched successfully so the file is in its
>> orginal state.
>>
>> Life would have been much easier if fallocate did not return -EINVAL on
>> a 0 sized request. Unfortunately it does so any test has no choice but
>> to punch a hole for real and fix the damage after that.
>>
>> The location is completely safe for most UMLs UBDs. The most common
>> method of building images is to format a device directly with ext2/3/4 -
>> there is no partition table. The first sector in ext2/3/4 if memory
>> serves me right is unused. Other fs - less so. Partition table - totally
>> safe as the boot code portion is unused.
>>
>> In any case, the code restores the data to its pre-test state.
> It's a bit nasty... You'd probably want to do a full sector though, the
> fs isn't going to be issuing a trim for anything that small.
>
> What happens if the system crashes before you successfully write back
> the data you zeroed? Doesn't seem like a really safe way to do this.

There is no safe way to do this as far as I can see :)

So if any probing is to be done it is inherently unsafe courtesy of 
fallocate returning EINVAL on zero sized requests.

IMHO there is no point to do probing to start off with. Issuing a trim 
if the underlying fs does not support it does not result in anything 
visible to the user. I tried to trim ext4 images sitting on a MSDOS 
filesystem - there is no issues as the error is eaten somewhere inside 
the kernel. Nothing showing up to the user and no visible damage :)

However, Richard asked for a probe. That is how a reliable probe would 
look like. I am not happy with it either and I think it is too risky for 
a lot of scenarios. If we are to probe, we will have to keep some piece 
of similar code in.
Jens Axboe Nov. 13, 2018, 3:49 p.m. UTC | #5
On 11/13/18 8:32 AM, Anton Ivanov wrote:
> 
> On 11/13/18 3:18 PM, Jens Axboe wrote:
>> On 11/13/18 7:04 AM, Anton Ivanov wrote:
>>> On 13/11/2018 13:40, Jens Axboe wrote:
>>>> On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote:
>>>>> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
>>>>> index 2b1aaf7755aa..e817fd4a4c30 100644
>>>>> --- a/arch/um/drivers/Kconfig
>>>>> +++ b/arch/um/drivers/Kconfig
>>>>> @@ -1,4 +1,25 @@
>>>>>    # SPDX-License-Identifier: GPL-2.0
>>>>> +menu "UML Block Devices"
>>>>> +
>>>>> +config BLK_DEV_UBD
>>>>> +	bool "UBD Block Device"
>>>>> +	default y
>>>>> +	help
>>>>> +          User Mode Linux virtual block device driver
>>>>> +
>>>>> +config BLK_DEV_UBD_SYNC
>>>>> +	bool "Use Synchronous mode for UBD"
>>>>> +	default n
>>>>> +	help
>>>>> +          Perform all disk operations synchronously (extremely slow).
>>>>> +
>>>>> +config BLK_DEV_UBD_DISCARD
>>>>> +	bool "Enable DISCARD/TRIM support in UBD"
>>>>> +	default y
>>>>> +	help
>>>>> +          Enable discard/trim support. Requires host kernel 3.x or above and
>>>>> +	  may not be supported on all host filesystems.
>>>>> +endmenu
>>>> kconfig entries are horrible ways to manage this. A module parameter
>>>> would be better. The latter also gets rid of the various ifdefs.
>>> I completely agree with you. Unfortunately, UBD for a number of reasons
>>> cannot be a module at present. It contains "host" side and loading
>>> something from the "inside" which breaks the hypervisor boundary is a
>>> definitive no go.
>>>
>>> In order to implement the suggestion we need the concept of loadable
>>> host side modules and while that is possible the likelihood of someone
>>> doing all the work on this one at present is rather low.
>> Module parameters work for builtin as well, then you just add
>> ubd.discard=true to the kernel command line.
>>
>>>>> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
>>>>> index e78f27f5ce88..4fd4cb68f033 100644
>>>>> --- a/arch/um/drivers/ubd_kern.c
>>>>> +++ b/arch/um/drivers/ubd_kern.c
>>>>> @@ -513,9 +513,13 @@ static void ubd_handler(void)
>>>>>    		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
>>>>>    			struct io_thread_req *io_req = (*irq_req_buffer)[count];
>>>>>    
>>>>> -			if (!blk_update_request(io_req->req, io_req->error, io_req->length))
>>>>> -				__blk_mq_end_request(io_req->req, io_req->error);
>>>>> -
>>>>> +			if ((io_req->error) || (io_req->buffer == NULL)) {
>>>>> +				blk_mq_end_request(io_req->req, io_req->error);
>>>>> +			} else {
>>>>> +				if (!blk_update_request(io_req->req, io_req->error, io_req->length)) {
>>>>> +					__blk_mq_end_request(io_req->req, io_req->error);
>>>>> +				}
>>>>> +			}
>>>>>    			kfree(io_req);
>>>> This looks odd, why is this change necessary? Also a lot of unneeded
>>>> parentheses.
>>> Loop and NBD do it that way. I can retest with the original code now
>>> that the underlying culprit is known and resubmit if that works.
>> I missed the __ part, yes it actually looks fine. blk_mq_end_request() will
>> both end the bio parts and free the request, your latter part does incremental
>> completes.
>>
>>>>> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>>>>>    	char **back_ptr;
>>>>>    	int err, create_cow, *create_ptr;
>>>>>    	int fd;
>>>>> +	int data;
>>>>>    
>>>>>    	ubd_dev->openflags = ubd_dev->boot_openflags;
>>>>>    	create_cow = 0;
>>>>> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>>>>>    		if(err < 0) goto error;
>>>>>    		ubd_dev->cow.fd = err;
>>>>>    	}
>>>>> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD
>>>>> +	if (ubd_dev->cow.file != NULL)
>>>>> +		fd = ubd_dev->cow.fd;
>>>>> +	else
>>>>> +		fd = ubd_dev->fd;
>>>>> +	err = os_pread_file(fd, &data, sizeof(int), 0);
>>>>> +	if (err == sizeof(int)) {
>>>>> +		err = os_falloc_punch(fd, 0, sizeof(int));
>>>>> +		if (err == 0) {
>>>> It's not clear to me how it's safe to punch a 4-byte hole in the file as
>>>> a test?
>>> A sizeof(int) sized hole is always implemented by simply writing zeroes.
>>> There is no fs manipulation here.
>>>
>>> The test reads the contents of the file, punches a hoile and writes the
>>> content back if the hole was punched successfully so the file is in its
>>> orginal state.
>>>
>>> Life would have been much easier if fallocate did not return -EINVAL on
>>> a 0 sized request. Unfortunately it does so any test has no choice but
>>> to punch a hole for real and fix the damage after that.
>>>
>>> The location is completely safe for most UMLs UBDs. The most common
>>> method of building images is to format a device directly with ext2/3/4 -
>>> there is no partition table. The first sector in ext2/3/4 if memory
>>> serves me right is unused. Other fs - less so. Partition table - totally
>>> safe as the boot code portion is unused.
>>>
>>> In any case, the code restores the data to its pre-test state.
>> It's a bit nasty... You'd probably want to do a full sector though, the
>> fs isn't going to be issuing a trim for anything that small.
>>
>> What happens if the system crashes before you successfully write back
>> the data you zeroed? Doesn't seem like a really safe way to do this.
> 
> There is no safe way to do this as far as I can see :)
> 
> So if any probing is to be done it is inherently unsafe courtesy of 
> fallocate returning EINVAL on zero sized requests.
> 
> IMHO there is no point to do probing to start off with. Issuing a trim 
> if the underlying fs does not support it does not result in anything 
> visible to the user. I tried to trim ext4 images sitting on a MSDOS 
> filesystem - there is no issues as the error is eaten somewhere inside 
> the kernel. Nothing showing up to the user and no visible damage :)
> 
> However, Richard asked for a probe. That is how a reliable probe would 
> look like. I am not happy with it either and I think it is too risky for 
> a lot of scenarios. If we are to probe, we will have to keep some piece 
> of similar code in.

So here's what I suggest - default to discard being available, don't
offer WRITE_ZEROES support. Don't set the flag that tells the block
layer that discards guarantees zeroes. Have an internal flag that is set
when discards are enabled. If a discard fails, then clear your internal
flag and the queue support. You must still expect discards to arrive, if
they do and your internal discard flag is not set, you just end them,
not in error.

That'll just treat discards as hints, and you don't have to have any
broken probe logic.
Anton Ivanov Nov. 13, 2018, 3:52 p.m. UTC | #6
On 13/11/2018 15:49, Jens Axboe wrote:
> On 11/13/18 8:32 AM, Anton Ivanov wrote:
>> On 11/13/18 3:18 PM, Jens Axboe wrote:
>>> On 11/13/18 7:04 AM, Anton Ivanov wrote:
>>>> On 13/11/2018 13:40, Jens Axboe wrote:
>>>>> On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote:
>>>>>> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
>>>>>> index 2b1aaf7755aa..e817fd4a4c30 100644
>>>>>> --- a/arch/um/drivers/Kconfig
>>>>>> +++ b/arch/um/drivers/Kconfig
>>>>>> @@ -1,4 +1,25 @@
>>>>>>     # SPDX-License-Identifier: GPL-2.0
>>>>>> +menu "UML Block Devices"
>>>>>> +
>>>>>> +config BLK_DEV_UBD
>>>>>> +	bool "UBD Block Device"
>>>>>> +	default y
>>>>>> +	help
>>>>>> +          User Mode Linux virtual block device driver
>>>>>> +
>>>>>> +config BLK_DEV_UBD_SYNC
>>>>>> +	bool "Use Synchronous mode for UBD"
>>>>>> +	default n
>>>>>> +	help
>>>>>> +          Perform all disk operations synchronously (extremely slow).
>>>>>> +
>>>>>> +config BLK_DEV_UBD_DISCARD
>>>>>> +	bool "Enable DISCARD/TRIM support in UBD"
>>>>>> +	default y
>>>>>> +	help
>>>>>> +          Enable discard/trim support. Requires host kernel 3.x or above and
>>>>>> +	  may not be supported on all host filesystems.
>>>>>> +endmenu
>>>>> kconfig entries are horrible ways to manage this. A module parameter
>>>>> would be better. The latter also gets rid of the various ifdefs.
>>>> I completely agree with you. Unfortunately, UBD for a number of reasons
>>>> cannot be a module at present. It contains "host" side and loading
>>>> something from the "inside" which breaks the hypervisor boundary is a
>>>> definitive no go.
>>>>
>>>> In order to implement the suggestion we need the concept of loadable
>>>> host side modules and while that is possible the likelihood of someone
>>>> doing all the work on this one at present is rather low.
>>> Module parameters work for builtin as well, then you just add
>>> ubd.discard=true to the kernel command line.
>>>
>>>>>> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
>>>>>> index e78f27f5ce88..4fd4cb68f033 100644
>>>>>> --- a/arch/um/drivers/ubd_kern.c
>>>>>> +++ b/arch/um/drivers/ubd_kern.c
>>>>>> @@ -513,9 +513,13 @@ static void ubd_handler(void)
>>>>>>     		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
>>>>>>     			struct io_thread_req *io_req = (*irq_req_buffer)[count];
>>>>>>     
>>>>>> -			if (!blk_update_request(io_req->req, io_req->error, io_req->length))
>>>>>> -				__blk_mq_end_request(io_req->req, io_req->error);
>>>>>> -
>>>>>> +			if ((io_req->error) || (io_req->buffer == NULL)) {
>>>>>> +				blk_mq_end_request(io_req->req, io_req->error);
>>>>>> +			} else {
>>>>>> +				if (!blk_update_request(io_req->req, io_req->error, io_req->length)) {
>>>>>> +					__blk_mq_end_request(io_req->req, io_req->error);
>>>>>> +				}
>>>>>> +			}
>>>>>>     			kfree(io_req);
>>>>> This looks odd, why is this change necessary? Also a lot of unneeded
>>>>> parentheses.
>>>> Loop and NBD do it that way. I can retest with the original code now
>>>> that the underlying culprit is known and resubmit if that works.
>>> I missed the __ part, yes it actually looks fine. blk_mq_end_request() will
>>> both end the bio parts and free the request, your latter part does incremental
>>> completes.
>>>
>>>>>> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>>>>>>     	char **back_ptr;
>>>>>>     	int err, create_cow, *create_ptr;
>>>>>>     	int fd;
>>>>>> +	int data;
>>>>>>     
>>>>>>     	ubd_dev->openflags = ubd_dev->boot_openflags;
>>>>>>     	create_cow = 0;
>>>>>> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev)
>>>>>>     		if(err < 0) goto error;
>>>>>>     		ubd_dev->cow.fd = err;
>>>>>>     	}
>>>>>> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD
>>>>>> +	if (ubd_dev->cow.file != NULL)
>>>>>> +		fd = ubd_dev->cow.fd;
>>>>>> +	else
>>>>>> +		fd = ubd_dev->fd;
>>>>>> +	err = os_pread_file(fd, &data, sizeof(int), 0);
>>>>>> +	if (err == sizeof(int)) {
>>>>>> +		err = os_falloc_punch(fd, 0, sizeof(int));
>>>>>> +		if (err == 0) {
>>>>> It's not clear to me how it's safe to punch a 4-byte hole in the file as
>>>>> a test?
>>>> A sizeof(int) sized hole is always implemented by simply writing zeroes.
>>>> There is no fs manipulation here.
>>>>
>>>> The test reads the contents of the file, punches a hoile and writes the
>>>> content back if the hole was punched successfully so the file is in its
>>>> orginal state.
>>>>
>>>> Life would have been much easier if fallocate did not return -EINVAL on
>>>> a 0 sized request. Unfortunately it does so any test has no choice but
>>>> to punch a hole for real and fix the damage after that.
>>>>
>>>> The location is completely safe for most UMLs UBDs. The most common
>>>> method of building images is to format a device directly with ext2/3/4 -
>>>> there is no partition table. The first sector in ext2/3/4 if memory
>>>> serves me right is unused. Other fs - less so. Partition table - totally
>>>> safe as the boot code portion is unused.
>>>>
>>>> In any case, the code restores the data to its pre-test state.
>>> It's a bit nasty... You'd probably want to do a full sector though, the
>>> fs isn't going to be issuing a trim for anything that small.
>>>
>>> What happens if the system crashes before you successfully write back
>>> the data you zeroed? Doesn't seem like a really safe way to do this.
>> There is no safe way to do this as far as I can see :)
>>
>> So if any probing is to be done it is inherently unsafe courtesy of
>> fallocate returning EINVAL on zero sized requests.
>>
>> IMHO there is no point to do probing to start off with. Issuing a trim
>> if the underlying fs does not support it does not result in anything
>> visible to the user. I tried to trim ext4 images sitting on a MSDOS
>> filesystem - there is no issues as the error is eaten somewhere inside
>> the kernel. Nothing showing up to the user and no visible damage :)
>>
>> However, Richard asked for a probe. That is how a reliable probe would
>> look like. I am not happy with it either and I think it is too risky for
>> a lot of scenarios. If we are to probe, we will have to keep some piece
>> of similar code in.
> So here's what I suggest - default to discard being available, don't
> offer WRITE_ZEROES support. Don't set the flag that tells the block
> layer that discards guarantees zeroes. Have an internal flag that is set
> when discards are enabled. If a discard fails, then clear your internal
> flag and the queue support. You must still expect discards to arrive, if
> they do and your internal discard flag is not set, you just end them,
> not in error.
>
> That'll just treat discards as hints, and you don't have to have any
> broken probe logic.
>
Sounds good. I will implement this round of comments, retest and 
resubmit the series tomorrow.
Christoph Hellwig Nov. 14, 2018, 3:30 p.m. UTC | #7
Discards are defined to be hints.  So if your fileystems return
-EOPTNOTSUPP you just stop passing them to the file system and
complete them from ->queue_rq.

Write Zeroes isn't.  So if you really want to support it you
should manually write zeroes to the file as a fallback.  But do you
have a real use case for Write Zeroes to start with?
Anton Ivanov Nov. 14, 2018, 3:35 p.m. UTC | #8
On 14/11/2018 15:30, Christoph Hellwig wrote:
> Discards are defined to be hints.  So if your fileystems return
> -EOPTNOTSUPP you just stop passing them to the file system and
> complete them from ->queue_rq.
Ack.
>
> Write Zeroes isn't.  So if you really want to support it you
> should manually write zeroes to the file as a fallback.

Hmm... Loop treats them as the same thing. I was looking at what they 
have done.

>   But do you
> have a real use case for Write Zeroes to start with?

I am not aware of.

There is a real case just for trim for now and I was using loop and nbd 
as a "guide" on what needs to be done. Loop has both DISCARD and ZERO.

Several other ops in the API looked interesting as well, but I have not 
seen them implemented anywhere so I skipped them for now.
diff mbox series

Patch

diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index 2b1aaf7755aa..e817fd4a4c30 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -1,4 +1,25 @@ 
 # SPDX-License-Identifier: GPL-2.0
+menu "UML Block Devices"
+
+config BLK_DEV_UBD
+	bool "UBD Block Device"
+	default y
+	help
+          User Mode Linux virtual block device driver
+
+config BLK_DEV_UBD_SYNC
+	bool "Use Synchronous mode for UBD"
+	default n
+	help
+          Perform all disk operations synchronously (extremely slow).
+
+config BLK_DEV_UBD_DISCARD
+	bool "Enable DISCARD/TRIM support in UBD"
+	default y
+	help
+          Enable discard/trim support. Requires host kernel 3.x or above and
+	  may not be supported on all host filesystems.
+endmenu
 
 menu "UML Character Devices"
 
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index e78f27f5ce88..4fd4cb68f033 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -513,9 +513,13 @@  static void ubd_handler(void)
 		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
 			struct io_thread_req *io_req = (*irq_req_buffer)[count];
 
-			if (!blk_update_request(io_req->req, io_req->error, io_req->length))
-				__blk_mq_end_request(io_req->req, io_req->error);
-
+			if ((io_req->error) || (io_req->buffer == NULL)) {
+				blk_mq_end_request(io_req->req, io_req->error);
+			} else {
+				if (!blk_update_request(io_req->req, io_req->error, io_req->length)) {
+					__blk_mq_end_request(io_req->req, io_req->error);
+				}
+			}
 			kfree(io_req);
 		}
 	}
@@ -775,6 +779,7 @@  static int ubd_open_dev(struct ubd *ubd_dev)
 	char **back_ptr;
 	int err, create_cow, *create_ptr;
 	int fd;
+	int data;
 
 	ubd_dev->openflags = ubd_dev->boot_openflags;
 	create_cow = 0;
@@ -829,6 +834,23 @@  static int ubd_open_dev(struct ubd *ubd_dev)
 		if(err < 0) goto error;
 		ubd_dev->cow.fd = err;
 	}
+#ifdef CONFIG_BLK_DEV_UBD_DISCARD
+	if (ubd_dev->cow.file != NULL)
+		fd = ubd_dev->cow.fd;
+	else
+		fd = ubd_dev->fd;
+	err = os_pread_file(fd, &data, sizeof(int), 0);
+	if (err == sizeof(int)) {
+		err = os_falloc_punch(fd, 0, sizeof(int));
+		if (err == 0) {
+			ubd_dev->queue->limits.discard_granularity = UBD_SECTOR_SIZE;
+			ubd_dev->queue->limits.discard_alignment = UBD_SECTOR_SIZE;
+			blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
+			blk_queue_flag_set(QUEUE_FLAG_DISCARD, ubd_dev->queue);
+			os_pwrite_file(fd, &data, sizeof(int), 0);
+		}
+	}
+#endif
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue);
 	return 0;
  error:
@@ -1368,6 +1390,12 @@  static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
 			}
 		}
 		break;
+#ifdef CONFIG_BLK_DEV_UBD_DISCARD
+	case REQ_OP_DISCARD:
+	case REQ_OP_WRITE_ZEROES:
+		ret = ubd_queue_one_vec(hctx, req, (u64)blk_rq_pos(req) << 9, NULL);
+		break;
+#endif
 	default:
 		WARN_ON_ONCE(1);
 		spin_unlock_irq(&ubd_dev->lock);
@@ -1447,7 +1475,7 @@  static int update_bitmap(struct io_thread_req *req)
 
 	n = os_pwrite_file(req->fds[1], &req->bitmap_words,
 			  sizeof(req->bitmap_words), req->cow_offset);
-	if(n != sizeof(req->bitmap_words))
+	if (n != sizeof(req->bitmap_words))
 		return map_error(-n);
 
 	return map_error(0);
@@ -1455,11 +1483,13 @@  static int update_bitmap(struct io_thread_req *req)
 
 static void do_io(struct io_thread_req *req)
 {
-	char *buf;
+	char *buf = NULL;
 	unsigned long len;
 	int n, nsectors, start, end, bit;
 	__u64 off;
 
+	/* FLUSH is really a special case, we cannot "case" it with others */
+
 	if (req_op(req->req) == REQ_OP_FLUSH) {
 		/* fds[0] is always either the rw image or our cow file */
 		req->error = map_error(-os_sync_file(req->fds[0]));
@@ -1479,26 +1509,44 @@  static void do_io(struct io_thread_req *req)
 		off = req->offset + req->offsets[bit] +
 			start * req->sectorsize;
 		len = (end - start) * req->sectorsize;
-		buf = &req->buffer[start * req->sectorsize];
+		if (req->buffer != NULL)
+			buf = &req->buffer[start * req->sectorsize];
 
-		if (req_op(req->req) == REQ_OP_READ) {
+		switch (req_op(req->req)) {
+		case REQ_OP_READ:
 			n = 0;
 			do {
 				buf = &buf[n];
 				len -= n;
 				n = os_pread_file(req->fds[bit], buf, len, off);
-				if(n < 0){
+				if (n < 0) {
 					req->error = map_error(-n);
 					return;
 				}
 			} while((n < len) && (n != 0));
 			if (n < len) memset(&buf[n], 0, len - n);
-		} else {
+			break;
+		case REQ_OP_WRITE:
 			n = os_pwrite_file(req->fds[bit], buf, len, off);
 			if(n != len){
 				req->error = map_error(-n);
 				return;
 			}
+			break;
+#ifdef CONFIG_BLK_DEV_UBD_DISCARD
+		case REQ_OP_DISCARD:
+		case REQ_OP_WRITE_ZEROES:
+			n = os_falloc_punch(req->fds[bit], off, len);
+			if (n) {
+				req->error = map_error(-n);
+				return;
+			}
+			break;
+#endif
+		default:
+			WARN_ON_ONCE(1);
+			req->error = BLK_STS_NOTSUPP;
+			return;
 		}
 
 		start = end;
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 048ae37eb5aa..ebf23012a59b 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -175,6 +175,7 @@  extern int os_fchange_dir(int fd);
 extern unsigned os_major(unsigned long long dev);
 extern unsigned os_minor(unsigned long long dev);
 extern unsigned long long os_makedev(unsigned major, unsigned minor);
+extern int os_falloc_punch(int fd, unsigned long long offset, int count);
 
 /* start_up.c */
 extern void os_early_checks(void);
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index c0197097c86e..f25b110d4e70 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -610,3 +610,13 @@  unsigned long long os_makedev(unsigned major, unsigned minor)
 {
 	return makedev(major, minor);
 }
+
+int os_falloc_punch(int fd, unsigned long long offset, int len)
+{
+	int n = fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, offset, len);
+
+	if (n < 0)
+		return -errno;
+	return n;
+}
+