Message ID | 20181113115947.19290-5-anton.ivanov@cambridgegreys.com |
---|---|
State | Superseded |
Headers | show |
Series | [1/4] um: Switch to block-mq constants in the UML UBD driver | expand |
On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote: > diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig > index 2b1aaf7755aa..e817fd4a4c30 100644 > --- a/arch/um/drivers/Kconfig > +++ b/arch/um/drivers/Kconfig > @@ -1,4 +1,25 @@ > # SPDX-License-Identifier: GPL-2.0 > +menu "UML Block Devices" > + > +config BLK_DEV_UBD > + bool "UBD Block Device" > + default y > + help > + User Mode Linux virtual block device driver > + > +config BLK_DEV_UBD_SYNC > + bool "Use Synchronous mode for UBD" > + default n > + help > + Perform all disk operations synchronously (extremely slow). > + > +config BLK_DEV_UBD_DISCARD > + bool "Enable DISCARD/TRIM support in UBD" > + default y > + help > + Enable discard/trim support. Requires host kernel 3.x or above and > + may not be supported on all host filesystems. > +endmenu kconfig entries are horrible ways to manage this. A module parameter would be better. The latter also gets rid of the various ifdefs. > diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c > index e78f27f5ce88..4fd4cb68f033 100644 > --- a/arch/um/drivers/ubd_kern.c > +++ b/arch/um/drivers/ubd_kern.c > @@ -513,9 +513,13 @@ static void ubd_handler(void) > for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { > struct io_thread_req *io_req = (*irq_req_buffer)[count]; > > - if (!blk_update_request(io_req->req, io_req->error, io_req->length)) > - __blk_mq_end_request(io_req->req, io_req->error); > - > + if ((io_req->error) || (io_req->buffer == NULL)) { > + blk_mq_end_request(io_req->req, io_req->error); > + } else { > + if (!blk_update_request(io_req->req, io_req->error, io_req->length)) { > + __blk_mq_end_request(io_req->req, io_req->error); > + } > + } > kfree(io_req); This looks odd, why is this change necessary? Also a lot of unneeded parentheses. > } > } > @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev) > char **back_ptr; > int err, create_cow, *create_ptr; > int fd; > + int data; > > ubd_dev->openflags = ubd_dev->boot_openflags; > create_cow = 0; > @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev) > if(err < 0) goto error; > ubd_dev->cow.fd = err; > } > +#ifdef CONFIG_BLK_DEV_UBD_DISCARD > + if (ubd_dev->cow.file != NULL) > + fd = ubd_dev->cow.fd; > + else > + fd = ubd_dev->fd; > + err = os_pread_file(fd, &data, sizeof(int), 0); > + if (err == sizeof(int)) { > + err = os_falloc_punch(fd, 0, sizeof(int)); > + if (err == 0) { It's not clear to me how it's safe to punch a 4-byte hole in the file as a test?
On 13/11/2018 13:40, Jens Axboe wrote: > On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote: >> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig >> index 2b1aaf7755aa..e817fd4a4c30 100644 >> --- a/arch/um/drivers/Kconfig >> +++ b/arch/um/drivers/Kconfig >> @@ -1,4 +1,25 @@ >> # SPDX-License-Identifier: GPL-2.0 >> +menu "UML Block Devices" >> + >> +config BLK_DEV_UBD >> + bool "UBD Block Device" >> + default y >> + help >> + User Mode Linux virtual block device driver >> + >> +config BLK_DEV_UBD_SYNC >> + bool "Use Synchronous mode for UBD" >> + default n >> + help >> + Perform all disk operations synchronously (extremely slow). >> + >> +config BLK_DEV_UBD_DISCARD >> + bool "Enable DISCARD/TRIM support in UBD" >> + default y >> + help >> + Enable discard/trim support. Requires host kernel 3.x or above and >> + may not be supported on all host filesystems. >> +endmenu > kconfig entries are horrible ways to manage this. A module parameter > would be better. The latter also gets rid of the various ifdefs. I completely agree with you. Unfortunately, UBD for a number of reasons cannot be a module at present. It contains "host" side and loading something from the "inside" which breaks the hypervisor boundary is a definitive no go. In order to implement the suggestion we need the concept of loadable host side modules and while that is possible the likelihood of someone doing all the work on this one at present is rather low. > >> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c >> index e78f27f5ce88..4fd4cb68f033 100644 >> --- a/arch/um/drivers/ubd_kern.c >> +++ b/arch/um/drivers/ubd_kern.c >> @@ -513,9 +513,13 @@ static void ubd_handler(void) >> for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { >> struct io_thread_req *io_req = (*irq_req_buffer)[count]; >> >> - if (!blk_update_request(io_req->req, io_req->error, io_req->length)) >> - __blk_mq_end_request(io_req->req, io_req->error); >> - >> + if ((io_req->error) || (io_req->buffer == NULL)) { >> + blk_mq_end_request(io_req->req, io_req->error); >> + } else { >> + if (!blk_update_request(io_req->req, io_req->error, io_req->length)) { >> + __blk_mq_end_request(io_req->req, io_req->error); >> + } >> + } >> kfree(io_req); > This looks odd, why is this change necessary? Also a lot of unneeded > parentheses. Loop and NBD do it that way. I can retest with the original code now that the underlying culprit is known and resubmit if that works. > >> } >> } >> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev) >> char **back_ptr; >> int err, create_cow, *create_ptr; >> int fd; >> + int data; >> >> ubd_dev->openflags = ubd_dev->boot_openflags; >> create_cow = 0; >> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev) >> if(err < 0) goto error; >> ubd_dev->cow.fd = err; >> } >> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD >> + if (ubd_dev->cow.file != NULL) >> + fd = ubd_dev->cow.fd; >> + else >> + fd = ubd_dev->fd; >> + err = os_pread_file(fd, &data, sizeof(int), 0); >> + if (err == sizeof(int)) { >> + err = os_falloc_punch(fd, 0, sizeof(int)); >> + if (err == 0) { > It's not clear to me how it's safe to punch a 4-byte hole in the file as > a test? A sizeof(int) sized hole is always implemented by simply writing zeroes. There is no fs manipulation here. The test reads the contents of the file, punches a hoile and writes the content back if the hole was punched successfully so the file is in its orginal state. Life would have been much easier if fallocate did not return -EINVAL on a 0 sized request. Unfortunately it does so any test has no choice but to punch a hole for real and fix the damage after that. The location is completely safe for most UMLs UBDs. The most common method of building images is to format a device directly with ext2/3/4 - there is no partition table. The first sector in ext2/3/4 if memory serves me right is unused. Other fs - less so. Partition table - totally safe as the boot code portion is unused. In any case, the code restores the data to its pre-test state.
On 11/13/18 7:04 AM, Anton Ivanov wrote: > On 13/11/2018 13:40, Jens Axboe wrote: >> On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote: >>> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig >>> index 2b1aaf7755aa..e817fd4a4c30 100644 >>> --- a/arch/um/drivers/Kconfig >>> +++ b/arch/um/drivers/Kconfig >>> @@ -1,4 +1,25 @@ >>> # SPDX-License-Identifier: GPL-2.0 >>> +menu "UML Block Devices" >>> + >>> +config BLK_DEV_UBD >>> + bool "UBD Block Device" >>> + default y >>> + help >>> + User Mode Linux virtual block device driver >>> + >>> +config BLK_DEV_UBD_SYNC >>> + bool "Use Synchronous mode for UBD" >>> + default n >>> + help >>> + Perform all disk operations synchronously (extremely slow). >>> + >>> +config BLK_DEV_UBD_DISCARD >>> + bool "Enable DISCARD/TRIM support in UBD" >>> + default y >>> + help >>> + Enable discard/trim support. Requires host kernel 3.x or above and >>> + may not be supported on all host filesystems. >>> +endmenu >> kconfig entries are horrible ways to manage this. A module parameter >> would be better. The latter also gets rid of the various ifdefs. > > I completely agree with you. Unfortunately, UBD for a number of reasons > cannot be a module at present. It contains "host" side and loading > something from the "inside" which breaks the hypervisor boundary is a > definitive no go. > > In order to implement the suggestion we need the concept of loadable > host side modules and while that is possible the likelihood of someone > doing all the work on this one at present is rather low. Module parameters work for builtin as well, then you just add ubd.discard=true to the kernel command line. >>> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c >>> index e78f27f5ce88..4fd4cb68f033 100644 >>> --- a/arch/um/drivers/ubd_kern.c >>> +++ b/arch/um/drivers/ubd_kern.c >>> @@ -513,9 +513,13 @@ static void ubd_handler(void) >>> for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { >>> struct io_thread_req *io_req = (*irq_req_buffer)[count]; >>> >>> - if (!blk_update_request(io_req->req, io_req->error, io_req->length)) >>> - __blk_mq_end_request(io_req->req, io_req->error); >>> - >>> + if ((io_req->error) || (io_req->buffer == NULL)) { >>> + blk_mq_end_request(io_req->req, io_req->error); >>> + } else { >>> + if (!blk_update_request(io_req->req, io_req->error, io_req->length)) { >>> + __blk_mq_end_request(io_req->req, io_req->error); >>> + } >>> + } >>> kfree(io_req); >> This looks odd, why is this change necessary? Also a lot of unneeded >> parentheses. > > Loop and NBD do it that way. I can retest with the original code now > that the underlying culprit is known and resubmit if that works. I missed the __ part, yes it actually looks fine. blk_mq_end_request() will both end the bio parts and free the request, your latter part does incremental completes. >>> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev) >>> char **back_ptr; >>> int err, create_cow, *create_ptr; >>> int fd; >>> + int data; >>> >>> ubd_dev->openflags = ubd_dev->boot_openflags; >>> create_cow = 0; >>> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev) >>> if(err < 0) goto error; >>> ubd_dev->cow.fd = err; >>> } >>> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD >>> + if (ubd_dev->cow.file != NULL) >>> + fd = ubd_dev->cow.fd; >>> + else >>> + fd = ubd_dev->fd; >>> + err = os_pread_file(fd, &data, sizeof(int), 0); >>> + if (err == sizeof(int)) { >>> + err = os_falloc_punch(fd, 0, sizeof(int)); >>> + if (err == 0) { >> It's not clear to me how it's safe to punch a 4-byte hole in the file as >> a test? > > A sizeof(int) sized hole is always implemented by simply writing zeroes. > There is no fs manipulation here. > > The test reads the contents of the file, punches a hoile and writes the > content back if the hole was punched successfully so the file is in its > orginal state. > > Life would have been much easier if fallocate did not return -EINVAL on > a 0 sized request. Unfortunately it does so any test has no choice but > to punch a hole for real and fix the damage after that. > > The location is completely safe for most UMLs UBDs. The most common > method of building images is to format a device directly with ext2/3/4 - > there is no partition table. The first sector in ext2/3/4 if memory > serves me right is unused. Other fs - less so. Partition table - totally > safe as the boot code portion is unused. > > In any case, the code restores the data to its pre-test state. It's a bit nasty... You'd probably want to do a full sector though, the fs isn't going to be issuing a trim for anything that small. What happens if the system crashes before you successfully write back the data you zeroed? Doesn't seem like a really safe way to do this.
On 11/13/18 3:18 PM, Jens Axboe wrote: > On 11/13/18 7:04 AM, Anton Ivanov wrote: >> On 13/11/2018 13:40, Jens Axboe wrote: >>> On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote: >>>> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig >>>> index 2b1aaf7755aa..e817fd4a4c30 100644 >>>> --- a/arch/um/drivers/Kconfig >>>> +++ b/arch/um/drivers/Kconfig >>>> @@ -1,4 +1,25 @@ >>>> # SPDX-License-Identifier: GPL-2.0 >>>> +menu "UML Block Devices" >>>> + >>>> +config BLK_DEV_UBD >>>> + bool "UBD Block Device" >>>> + default y >>>> + help >>>> + User Mode Linux virtual block device driver >>>> + >>>> +config BLK_DEV_UBD_SYNC >>>> + bool "Use Synchronous mode for UBD" >>>> + default n >>>> + help >>>> + Perform all disk operations synchronously (extremely slow). >>>> + >>>> +config BLK_DEV_UBD_DISCARD >>>> + bool "Enable DISCARD/TRIM support in UBD" >>>> + default y >>>> + help >>>> + Enable discard/trim support. Requires host kernel 3.x or above and >>>> + may not be supported on all host filesystems. >>>> +endmenu >>> kconfig entries are horrible ways to manage this. A module parameter >>> would be better. The latter also gets rid of the various ifdefs. >> I completely agree with you. Unfortunately, UBD for a number of reasons >> cannot be a module at present. It contains "host" side and loading >> something from the "inside" which breaks the hypervisor boundary is a >> definitive no go. >> >> In order to implement the suggestion we need the concept of loadable >> host side modules and while that is possible the likelihood of someone >> doing all the work on this one at present is rather low. > Module parameters work for builtin as well, then you just add > ubd.discard=true to the kernel command line. > >>>> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c >>>> index e78f27f5ce88..4fd4cb68f033 100644 >>>> --- a/arch/um/drivers/ubd_kern.c >>>> +++ b/arch/um/drivers/ubd_kern.c >>>> @@ -513,9 +513,13 @@ static void ubd_handler(void) >>>> for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { >>>> struct io_thread_req *io_req = (*irq_req_buffer)[count]; >>>> >>>> - if (!blk_update_request(io_req->req, io_req->error, io_req->length)) >>>> - __blk_mq_end_request(io_req->req, io_req->error); >>>> - >>>> + if ((io_req->error) || (io_req->buffer == NULL)) { >>>> + blk_mq_end_request(io_req->req, io_req->error); >>>> + } else { >>>> + if (!blk_update_request(io_req->req, io_req->error, io_req->length)) { >>>> + __blk_mq_end_request(io_req->req, io_req->error); >>>> + } >>>> + } >>>> kfree(io_req); >>> This looks odd, why is this change necessary? Also a lot of unneeded >>> parentheses. >> Loop and NBD do it that way. I can retest with the original code now >> that the underlying culprit is known and resubmit if that works. > I missed the __ part, yes it actually looks fine. blk_mq_end_request() will > both end the bio parts and free the request, your latter part does incremental > completes. > >>>> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev) >>>> char **back_ptr; >>>> int err, create_cow, *create_ptr; >>>> int fd; >>>> + int data; >>>> >>>> ubd_dev->openflags = ubd_dev->boot_openflags; >>>> create_cow = 0; >>>> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev) >>>> if(err < 0) goto error; >>>> ubd_dev->cow.fd = err; >>>> } >>>> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD >>>> + if (ubd_dev->cow.file != NULL) >>>> + fd = ubd_dev->cow.fd; >>>> + else >>>> + fd = ubd_dev->fd; >>>> + err = os_pread_file(fd, &data, sizeof(int), 0); >>>> + if (err == sizeof(int)) { >>>> + err = os_falloc_punch(fd, 0, sizeof(int)); >>>> + if (err == 0) { >>> It's not clear to me how it's safe to punch a 4-byte hole in the file as >>> a test? >> A sizeof(int) sized hole is always implemented by simply writing zeroes. >> There is no fs manipulation here. >> >> The test reads the contents of the file, punches a hoile and writes the >> content back if the hole was punched successfully so the file is in its >> orginal state. >> >> Life would have been much easier if fallocate did not return -EINVAL on >> a 0 sized request. Unfortunately it does so any test has no choice but >> to punch a hole for real and fix the damage after that. >> >> The location is completely safe for most UMLs UBDs. The most common >> method of building images is to format a device directly with ext2/3/4 - >> there is no partition table. The first sector in ext2/3/4 if memory >> serves me right is unused. Other fs - less so. Partition table - totally >> safe as the boot code portion is unused. >> >> In any case, the code restores the data to its pre-test state. > It's a bit nasty... You'd probably want to do a full sector though, the > fs isn't going to be issuing a trim for anything that small. > > What happens if the system crashes before you successfully write back > the data you zeroed? Doesn't seem like a really safe way to do this. There is no safe way to do this as far as I can see :) So if any probing is to be done it is inherently unsafe courtesy of fallocate returning EINVAL on zero sized requests. IMHO there is no point to do probing to start off with. Issuing a trim if the underlying fs does not support it does not result in anything visible to the user. I tried to trim ext4 images sitting on a MSDOS filesystem - there is no issues as the error is eaten somewhere inside the kernel. Nothing showing up to the user and no visible damage :) However, Richard asked for a probe. That is how a reliable probe would look like. I am not happy with it either and I think it is too risky for a lot of scenarios. If we are to probe, we will have to keep some piece of similar code in.
On 11/13/18 8:32 AM, Anton Ivanov wrote: > > On 11/13/18 3:18 PM, Jens Axboe wrote: >> On 11/13/18 7:04 AM, Anton Ivanov wrote: >>> On 13/11/2018 13:40, Jens Axboe wrote: >>>> On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote: >>>>> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig >>>>> index 2b1aaf7755aa..e817fd4a4c30 100644 >>>>> --- a/arch/um/drivers/Kconfig >>>>> +++ b/arch/um/drivers/Kconfig >>>>> @@ -1,4 +1,25 @@ >>>>> # SPDX-License-Identifier: GPL-2.0 >>>>> +menu "UML Block Devices" >>>>> + >>>>> +config BLK_DEV_UBD >>>>> + bool "UBD Block Device" >>>>> + default y >>>>> + help >>>>> + User Mode Linux virtual block device driver >>>>> + >>>>> +config BLK_DEV_UBD_SYNC >>>>> + bool "Use Synchronous mode for UBD" >>>>> + default n >>>>> + help >>>>> + Perform all disk operations synchronously (extremely slow). >>>>> + >>>>> +config BLK_DEV_UBD_DISCARD >>>>> + bool "Enable DISCARD/TRIM support in UBD" >>>>> + default y >>>>> + help >>>>> + Enable discard/trim support. Requires host kernel 3.x or above and >>>>> + may not be supported on all host filesystems. >>>>> +endmenu >>>> kconfig entries are horrible ways to manage this. A module parameter >>>> would be better. The latter also gets rid of the various ifdefs. >>> I completely agree with you. Unfortunately, UBD for a number of reasons >>> cannot be a module at present. It contains "host" side and loading >>> something from the "inside" which breaks the hypervisor boundary is a >>> definitive no go. >>> >>> In order to implement the suggestion we need the concept of loadable >>> host side modules and while that is possible the likelihood of someone >>> doing all the work on this one at present is rather low. >> Module parameters work for builtin as well, then you just add >> ubd.discard=true to the kernel command line. >> >>>>> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c >>>>> index e78f27f5ce88..4fd4cb68f033 100644 >>>>> --- a/arch/um/drivers/ubd_kern.c >>>>> +++ b/arch/um/drivers/ubd_kern.c >>>>> @@ -513,9 +513,13 @@ static void ubd_handler(void) >>>>> for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { >>>>> struct io_thread_req *io_req = (*irq_req_buffer)[count]; >>>>> >>>>> - if (!blk_update_request(io_req->req, io_req->error, io_req->length)) >>>>> - __blk_mq_end_request(io_req->req, io_req->error); >>>>> - >>>>> + if ((io_req->error) || (io_req->buffer == NULL)) { >>>>> + blk_mq_end_request(io_req->req, io_req->error); >>>>> + } else { >>>>> + if (!blk_update_request(io_req->req, io_req->error, io_req->length)) { >>>>> + __blk_mq_end_request(io_req->req, io_req->error); >>>>> + } >>>>> + } >>>>> kfree(io_req); >>>> This looks odd, why is this change necessary? Also a lot of unneeded >>>> parentheses. >>> Loop and NBD do it that way. I can retest with the original code now >>> that the underlying culprit is known and resubmit if that works. >> I missed the __ part, yes it actually looks fine. blk_mq_end_request() will >> both end the bio parts and free the request, your latter part does incremental >> completes. >> >>>>> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev) >>>>> char **back_ptr; >>>>> int err, create_cow, *create_ptr; >>>>> int fd; >>>>> + int data; >>>>> >>>>> ubd_dev->openflags = ubd_dev->boot_openflags; >>>>> create_cow = 0; >>>>> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev) >>>>> if(err < 0) goto error; >>>>> ubd_dev->cow.fd = err; >>>>> } >>>>> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD >>>>> + if (ubd_dev->cow.file != NULL) >>>>> + fd = ubd_dev->cow.fd; >>>>> + else >>>>> + fd = ubd_dev->fd; >>>>> + err = os_pread_file(fd, &data, sizeof(int), 0); >>>>> + if (err == sizeof(int)) { >>>>> + err = os_falloc_punch(fd, 0, sizeof(int)); >>>>> + if (err == 0) { >>>> It's not clear to me how it's safe to punch a 4-byte hole in the file as >>>> a test? >>> A sizeof(int) sized hole is always implemented by simply writing zeroes. >>> There is no fs manipulation here. >>> >>> The test reads the contents of the file, punches a hoile and writes the >>> content back if the hole was punched successfully so the file is in its >>> orginal state. >>> >>> Life would have been much easier if fallocate did not return -EINVAL on >>> a 0 sized request. Unfortunately it does so any test has no choice but >>> to punch a hole for real and fix the damage after that. >>> >>> The location is completely safe for most UMLs UBDs. The most common >>> method of building images is to format a device directly with ext2/3/4 - >>> there is no partition table. The first sector in ext2/3/4 if memory >>> serves me right is unused. Other fs - less so. Partition table - totally >>> safe as the boot code portion is unused. >>> >>> In any case, the code restores the data to its pre-test state. >> It's a bit nasty... You'd probably want to do a full sector though, the >> fs isn't going to be issuing a trim for anything that small. >> >> What happens if the system crashes before you successfully write back >> the data you zeroed? Doesn't seem like a really safe way to do this. > > There is no safe way to do this as far as I can see :) > > So if any probing is to be done it is inherently unsafe courtesy of > fallocate returning EINVAL on zero sized requests. > > IMHO there is no point to do probing to start off with. Issuing a trim > if the underlying fs does not support it does not result in anything > visible to the user. I tried to trim ext4 images sitting on a MSDOS > filesystem - there is no issues as the error is eaten somewhere inside > the kernel. Nothing showing up to the user and no visible damage :) > > However, Richard asked for a probe. That is how a reliable probe would > look like. I am not happy with it either and I think it is too risky for > a lot of scenarios. If we are to probe, we will have to keep some piece > of similar code in. So here's what I suggest - default to discard being available, don't offer WRITE_ZEROES support. Don't set the flag that tells the block layer that discards guarantees zeroes. Have an internal flag that is set when discards are enabled. If a discard fails, then clear your internal flag and the queue support. You must still expect discards to arrive, if they do and your internal discard flag is not set, you just end them, not in error. That'll just treat discards as hints, and you don't have to have any broken probe logic.
On 13/11/2018 15:49, Jens Axboe wrote: > On 11/13/18 8:32 AM, Anton Ivanov wrote: >> On 11/13/18 3:18 PM, Jens Axboe wrote: >>> On 11/13/18 7:04 AM, Anton Ivanov wrote: >>>> On 13/11/2018 13:40, Jens Axboe wrote: >>>>> On 11/13/18 4:59 AM, anton.ivanov@cambridgegreys.com wrote: >>>>>> diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig >>>>>> index 2b1aaf7755aa..e817fd4a4c30 100644 >>>>>> --- a/arch/um/drivers/Kconfig >>>>>> +++ b/arch/um/drivers/Kconfig >>>>>> @@ -1,4 +1,25 @@ >>>>>> # SPDX-License-Identifier: GPL-2.0 >>>>>> +menu "UML Block Devices" >>>>>> + >>>>>> +config BLK_DEV_UBD >>>>>> + bool "UBD Block Device" >>>>>> + default y >>>>>> + help >>>>>> + User Mode Linux virtual block device driver >>>>>> + >>>>>> +config BLK_DEV_UBD_SYNC >>>>>> + bool "Use Synchronous mode for UBD" >>>>>> + default n >>>>>> + help >>>>>> + Perform all disk operations synchronously (extremely slow). >>>>>> + >>>>>> +config BLK_DEV_UBD_DISCARD >>>>>> + bool "Enable DISCARD/TRIM support in UBD" >>>>>> + default y >>>>>> + help >>>>>> + Enable discard/trim support. Requires host kernel 3.x or above and >>>>>> + may not be supported on all host filesystems. >>>>>> +endmenu >>>>> kconfig entries are horrible ways to manage this. A module parameter >>>>> would be better. The latter also gets rid of the various ifdefs. >>>> I completely agree with you. Unfortunately, UBD for a number of reasons >>>> cannot be a module at present. It contains "host" side and loading >>>> something from the "inside" which breaks the hypervisor boundary is a >>>> definitive no go. >>>> >>>> In order to implement the suggestion we need the concept of loadable >>>> host side modules and while that is possible the likelihood of someone >>>> doing all the work on this one at present is rather low. >>> Module parameters work for builtin as well, then you just add >>> ubd.discard=true to the kernel command line. >>> >>>>>> diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c >>>>>> index e78f27f5ce88..4fd4cb68f033 100644 >>>>>> --- a/arch/um/drivers/ubd_kern.c >>>>>> +++ b/arch/um/drivers/ubd_kern.c >>>>>> @@ -513,9 +513,13 @@ static void ubd_handler(void) >>>>>> for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { >>>>>> struct io_thread_req *io_req = (*irq_req_buffer)[count]; >>>>>> >>>>>> - if (!blk_update_request(io_req->req, io_req->error, io_req->length)) >>>>>> - __blk_mq_end_request(io_req->req, io_req->error); >>>>>> - >>>>>> + if ((io_req->error) || (io_req->buffer == NULL)) { >>>>>> + blk_mq_end_request(io_req->req, io_req->error); >>>>>> + } else { >>>>>> + if (!blk_update_request(io_req->req, io_req->error, io_req->length)) { >>>>>> + __blk_mq_end_request(io_req->req, io_req->error); >>>>>> + } >>>>>> + } >>>>>> kfree(io_req); >>>>> This looks odd, why is this change necessary? Also a lot of unneeded >>>>> parentheses. >>>> Loop and NBD do it that way. I can retest with the original code now >>>> that the underlying culprit is known and resubmit if that works. >>> I missed the __ part, yes it actually looks fine. blk_mq_end_request() will >>> both end the bio parts and free the request, your latter part does incremental >>> completes. >>> >>>>>> @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev) >>>>>> char **back_ptr; >>>>>> int err, create_cow, *create_ptr; >>>>>> int fd; >>>>>> + int data; >>>>>> >>>>>> ubd_dev->openflags = ubd_dev->boot_openflags; >>>>>> create_cow = 0; >>>>>> @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev) >>>>>> if(err < 0) goto error; >>>>>> ubd_dev->cow.fd = err; >>>>>> } >>>>>> +#ifdef CONFIG_BLK_DEV_UBD_DISCARD >>>>>> + if (ubd_dev->cow.file != NULL) >>>>>> + fd = ubd_dev->cow.fd; >>>>>> + else >>>>>> + fd = ubd_dev->fd; >>>>>> + err = os_pread_file(fd, &data, sizeof(int), 0); >>>>>> + if (err == sizeof(int)) { >>>>>> + err = os_falloc_punch(fd, 0, sizeof(int)); >>>>>> + if (err == 0) { >>>>> It's not clear to me how it's safe to punch a 4-byte hole in the file as >>>>> a test? >>>> A sizeof(int) sized hole is always implemented by simply writing zeroes. >>>> There is no fs manipulation here. >>>> >>>> The test reads the contents of the file, punches a hoile and writes the >>>> content back if the hole was punched successfully so the file is in its >>>> orginal state. >>>> >>>> Life would have been much easier if fallocate did not return -EINVAL on >>>> a 0 sized request. Unfortunately it does so any test has no choice but >>>> to punch a hole for real and fix the damage after that. >>>> >>>> The location is completely safe for most UMLs UBDs. The most common >>>> method of building images is to format a device directly with ext2/3/4 - >>>> there is no partition table. The first sector in ext2/3/4 if memory >>>> serves me right is unused. Other fs - less so. Partition table - totally >>>> safe as the boot code portion is unused. >>>> >>>> In any case, the code restores the data to its pre-test state. >>> It's a bit nasty... You'd probably want to do a full sector though, the >>> fs isn't going to be issuing a trim for anything that small. >>> >>> What happens if the system crashes before you successfully write back >>> the data you zeroed? Doesn't seem like a really safe way to do this. >> There is no safe way to do this as far as I can see :) >> >> So if any probing is to be done it is inherently unsafe courtesy of >> fallocate returning EINVAL on zero sized requests. >> >> IMHO there is no point to do probing to start off with. Issuing a trim >> if the underlying fs does not support it does not result in anything >> visible to the user. I tried to trim ext4 images sitting on a MSDOS >> filesystem - there is no issues as the error is eaten somewhere inside >> the kernel. Nothing showing up to the user and no visible damage :) >> >> However, Richard asked for a probe. That is how a reliable probe would >> look like. I am not happy with it either and I think it is too risky for >> a lot of scenarios. If we are to probe, we will have to keep some piece >> of similar code in. > So here's what I suggest - default to discard being available, don't > offer WRITE_ZEROES support. Don't set the flag that tells the block > layer that discards guarantees zeroes. Have an internal flag that is set > when discards are enabled. If a discard fails, then clear your internal > flag and the queue support. You must still expect discards to arrive, if > they do and your internal discard flag is not set, you just end them, > not in error. > > That'll just treat discards as hints, and you don't have to have any > broken probe logic. > Sounds good. I will implement this round of comments, retest and resubmit the series tomorrow.
Discards are defined to be hints. So if your fileystems return -EOPTNOTSUPP you just stop passing them to the file system and complete them from ->queue_rq. Write Zeroes isn't. So if you really want to support it you should manually write zeroes to the file as a fallback. But do you have a real use case for Write Zeroes to start with?
On 14/11/2018 15:30, Christoph Hellwig wrote: > Discards are defined to be hints. So if your fileystems return > -EOPTNOTSUPP you just stop passing them to the file system and > complete them from ->queue_rq. Ack. > > Write Zeroes isn't. So if you really want to support it you > should manually write zeroes to the file as a fallback. Hmm... Loop treats them as the same thing. I was looking at what they have done. > But do you > have a real use case for Write Zeroes to start with? I am not aware of. There is a real case just for trim for now and I was using loop and nbd as a "guide" on what needs to be done. Loop has both DISCARD and ZERO. Several other ops in the API looked interesting as well, but I have not seen them implemented anywhere so I skipped them for now.
diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index 2b1aaf7755aa..e817fd4a4c30 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -1,4 +1,25 @@ # SPDX-License-Identifier: GPL-2.0 +menu "UML Block Devices" + +config BLK_DEV_UBD + bool "UBD Block Device" + default y + help + User Mode Linux virtual block device driver + +config BLK_DEV_UBD_SYNC + bool "Use Synchronous mode for UBD" + default n + help + Perform all disk operations synchronously (extremely slow). + +config BLK_DEV_UBD_DISCARD + bool "Enable DISCARD/TRIM support in UBD" + default y + help + Enable discard/trim support. Requires host kernel 3.x or above and + may not be supported on all host filesystems. +endmenu menu "UML Character Devices" diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index e78f27f5ce88..4fd4cb68f033 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -513,9 +513,13 @@ static void ubd_handler(void) for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { struct io_thread_req *io_req = (*irq_req_buffer)[count]; - if (!blk_update_request(io_req->req, io_req->error, io_req->length)) - __blk_mq_end_request(io_req->req, io_req->error); - + if ((io_req->error) || (io_req->buffer == NULL)) { + blk_mq_end_request(io_req->req, io_req->error); + } else { + if (!blk_update_request(io_req->req, io_req->error, io_req->length)) { + __blk_mq_end_request(io_req->req, io_req->error); + } + } kfree(io_req); } } @@ -775,6 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev) char **back_ptr; int err, create_cow, *create_ptr; int fd; + int data; ubd_dev->openflags = ubd_dev->boot_openflags; create_cow = 0; @@ -829,6 +834,23 @@ static int ubd_open_dev(struct ubd *ubd_dev) if(err < 0) goto error; ubd_dev->cow.fd = err; } +#ifdef CONFIG_BLK_DEV_UBD_DISCARD + if (ubd_dev->cow.file != NULL) + fd = ubd_dev->cow.fd; + else + fd = ubd_dev->fd; + err = os_pread_file(fd, &data, sizeof(int), 0); + if (err == sizeof(int)) { + err = os_falloc_punch(fd, 0, sizeof(int)); + if (err == 0) { + ubd_dev->queue->limits.discard_granularity = UBD_SECTOR_SIZE; + ubd_dev->queue->limits.discard_alignment = UBD_SECTOR_SIZE; + blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, ubd_dev->queue); + os_pwrite_file(fd, &data, sizeof(int), 0); + } + } +#endif blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue); return 0; error: @@ -1368,6 +1390,12 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, } } break; +#ifdef CONFIG_BLK_DEV_UBD_DISCARD + case REQ_OP_DISCARD: + case REQ_OP_WRITE_ZEROES: + ret = ubd_queue_one_vec(hctx, req, (u64)blk_rq_pos(req) << 9, NULL); + break; +#endif default: WARN_ON_ONCE(1); spin_unlock_irq(&ubd_dev->lock); @@ -1447,7 +1475,7 @@ static int update_bitmap(struct io_thread_req *req) n = os_pwrite_file(req->fds[1], &req->bitmap_words, sizeof(req->bitmap_words), req->cow_offset); - if(n != sizeof(req->bitmap_words)) + if (n != sizeof(req->bitmap_words)) return map_error(-n); return map_error(0); @@ -1455,11 +1483,13 @@ static int update_bitmap(struct io_thread_req *req) static void do_io(struct io_thread_req *req) { - char *buf; + char *buf = NULL; unsigned long len; int n, nsectors, start, end, bit; __u64 off; + /* FLUSH is really a special case, we cannot "case" it with others */ + if (req_op(req->req) == REQ_OP_FLUSH) { /* fds[0] is always either the rw image or our cow file */ req->error = map_error(-os_sync_file(req->fds[0])); @@ -1479,26 +1509,44 @@ static void do_io(struct io_thread_req *req) off = req->offset + req->offsets[bit] + start * req->sectorsize; len = (end - start) * req->sectorsize; - buf = &req->buffer[start * req->sectorsize]; + if (req->buffer != NULL) + buf = &req->buffer[start * req->sectorsize]; - if (req_op(req->req) == REQ_OP_READ) { + switch (req_op(req->req)) { + case REQ_OP_READ: n = 0; do { buf = &buf[n]; len -= n; n = os_pread_file(req->fds[bit], buf, len, off); - if(n < 0){ + if (n < 0) { req->error = map_error(-n); return; } } while((n < len) && (n != 0)); if (n < len) memset(&buf[n], 0, len - n); - } else { + break; + case REQ_OP_WRITE: n = os_pwrite_file(req->fds[bit], buf, len, off); if(n != len){ req->error = map_error(-n); return; } + break; +#ifdef CONFIG_BLK_DEV_UBD_DISCARD + case REQ_OP_DISCARD: + case REQ_OP_WRITE_ZEROES: + n = os_falloc_punch(req->fds[bit], off, len); + if (n) { + req->error = map_error(-n); + return; + } + break; +#endif + default: + WARN_ON_ONCE(1); + req->error = BLK_STS_NOTSUPP; + return; } start = end; diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 048ae37eb5aa..ebf23012a59b 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -175,6 +175,7 @@ extern int os_fchange_dir(int fd); extern unsigned os_major(unsigned long long dev); extern unsigned os_minor(unsigned long long dev); extern unsigned long long os_makedev(unsigned major, unsigned minor); +extern int os_falloc_punch(int fd, unsigned long long offset, int count); /* start_up.c */ extern void os_early_checks(void); diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index c0197097c86e..f25b110d4e70 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -610,3 +610,13 @@ unsigned long long os_makedev(unsigned major, unsigned minor) { return makedev(major, minor); } + +int os_falloc_punch(int fd, unsigned long long offset, int len) +{ + int n = fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, offset, len); + + if (n < 0) + return -errno; + return n; +} +