
[RFC] ubd: remove use of blk_rq_map_sg

Message ID 20181015065637.1860-1-hch@lst.de
State Superseded
Series [RFC] ubd: remove use of blk_rq_map_sg

Commit Message

Christoph Hellwig Oct. 15, 2018, 6:56 a.m. UTC
There is no good reason to create a scatterlist in the ubd driver,
it can just iterate the request directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---

Now that we have the blk-mq conversion something like the patch below
should help to simplify the driver even further.

 arch/um/drivers/ubd_kern.c | 154 ++++++++++++-------------------------
 1 file changed, 50 insertions(+), 104 deletions(-)

Comments

Richard Weinberger Oct. 15, 2018, 8:40 a.m. UTC | #1
Christoph,

[Also CC'ing Anton; he improved the old driver a lot.]

Anton, this patch is based on:
https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git/?h=mq-conversions

On Monday, 15 October 2018 at 08:56:37 CEST, Christoph Hellwig wrote:
> There is no good reason to create a scatterlist in the ubd driver,
> it can just iterate the request directly.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> 
> Now that we have the blk-mq conversion something like the patch below
> should help to simplify the driver even further.

hm, this breaks UML.
Every filesystem fails to mount.

I did some very rough tests; it seems that the driver fails to read
data correctly as soon as the upper layer tries to get more than 4096 bytes
at once out of the block device.

IOW:
dd if=/dev/ubda bs=4096 count=1 skip=0 2>/dev/null| md5sum -
is good.
As soon as I set bs to something greater, it returns garbage.

Later today I might have some cycles left to debug further.

Thanks,
//richard
Christoph Hellwig Oct. 15, 2018, 8:45 a.m. UTC | #2
On Mon, Oct 15, 2018 at 10:40:06AM +0200, Richard Weinberger wrote:
> hm, this breaks UML.
> Every filesystem fails to mount.
> 
> I did some very rough tests, it seems that the driver fails to read
> data correctly as soon the upper layer tries to get more than 4096 bytes
> at once out of the block device.
> 
> IOW:
> dd if=/dev/ubda bs=4096 count=1 skip=0 2>/dev/null| md5sum -
> is good.
> As soon I set bs to something greater it returns garbage.
> 
> Later this day I might have some cycles left to debug further.

It probably needs this on top: the loop advances the sector-based pos by
bvec.bv_len, which is a byte count, so once ubd_queue_one_vec() shifts pos
by 9 every segment after the first ends up at the wrong offset; passing a
byte offset avoids that:

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index aafe9c467fdc..5adce80a7ee3 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1290,7 +1290,7 @@ static void cowify_req(struct io_thread_req *req, unsigned long *bitmap,
 }
 
 static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
-		sector_t pos, struct bio_vec *bvec)
+		u64 off, struct bio_vec *bvec)
 {
 	struct ubd *dev = hctx->queue->queuedata;
 	struct io_thread_req *io_req;
@@ -1311,7 +1311,7 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
 	} else {
 		io_req->fds[1] = dev->fd;
 		io_req->cow_offset = -1;
-		io_req->offset = (unsigned long long)pos << 9;
+		io_req->offset = off;
 		io_req->length = bvec->bv_len;
 		io_req->error = 0;
 		io_req->sector_mask = 0;
@@ -1346,17 +1346,17 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct bio *bio = req->bio;
 	struct bvec_iter iter;
 	struct bio_vec bvec;
-	sector_t pos = blk_rq_pos(req);
+	u64 off = (u64)blk_rq_pos(req) << 9;
 
 	blk_mq_start_request(req);
 
 	for_each_bio(bio) {
 		bio_for_each_segment(bvec, bio, iter) {
-			if (ubd_queue_one_vec(hctx, req, pos, &bvec) < 0) {
+			if (ubd_queue_one_vec(hctx, req, off, &bvec) < 0) {
 				blk_mq_requeue_request(req, true);
 				return BLK_STS_OK;
 			}
-			pos += bvec.bv_len;
+			off += bvec.bv_len;
 		}
 	}
Anton Ivanov Oct. 15, 2018, 8:46 a.m. UTC | #3
On 10/15/18 9:40 AM, Richard Weinberger wrote:
> Christoph,
>
> [CC'ing also Anton, he improved the old driver a lot.]

I was just about to start testing it :)

>
> Anton, this patch is based on:
> https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git/?h=mq-conversions
Thanks. I remember an RFC patch going across the mailing list during the 
summer, but did not know where this work had gone.
>
> On Monday, 15 October 2018 at 08:56:37 CEST, Christoph Hellwig wrote:
>> There is no good reason to create a scatterlist in the ubd driver,
>> it can just iterate the request directly.
>>
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>> ---
>>
>> Now that we have the blk-mq conversion something like the patch below
>> should help to simplify the driver even further.
> hm, this breaks UML.
> Every filesystem fails to mount.
>
> I did some very rough tests, it seems that the driver fails to read
> data correctly as soon the upper layer tries to get more than 4096 bytes
> at once out of the block device.
>
> IOW:
> dd if=/dev/ubda bs=4096 count=1 skip=0 2>/dev/null| md5sum -
> is good.
> As soon I set bs to something greater it returns garbage.
>
> Later this day I might have some cycles left to debug further.
I will have some time to look into this in the afternoon, but I have to 
have a look at the mq-conversions first.
> Thanks,
> //richard
>
>
>
Richard Weinberger Oct. 15, 2018, 7:17 p.m. UTC | #4
On Monday, 15 October 2018 at 10:45:41 CEST, Christoph Hellwig wrote:
> On Mon, Oct 15, 2018 at 10:40:06AM +0200, Richard Weinberger wrote:
> > hm, this breaks UML.
> > Every filesystem fails to mount.
> > 
> > I did some very rough tests, it seems that the driver fails to read
> > data correctly as soon the upper layer tries to get more than 4096 bytes
> > at once out of the block device.
> > 
> > IOW:
> > dd if=/dev/ubda bs=4096 count=1 skip=0 2>/dev/null| md5sum -
> > is good.
> > As soon I set bs to something greater it returns garbage.
> > 
> > Later this day I might have some cycles left to debug further.
> 
> It probably needs this on top:

Sadly not. I'm checking now what exactly is broken.
BTW: Why are you using for_each_bio() with bio_for_each_segment()?
I thought we have rq_for_each_segment() for that.
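
(For reference, a minimal sketch of the rq_for_each_segment() form, reusing
the ubd_queue_one_vec() helper and the byte offset from the fixup above; an
illustration only, with flush handling omitted, not the actual patch:)

static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	struct request *req = bd->rq;
	struct req_iterator iter;		/* walks every bio segment of the request */
	struct bio_vec bvec;
	u64 off = (u64)blk_rq_pos(req) << 9;	/* request start as a byte offset */

	blk_mq_start_request(req);

	/* one iterator instead of nested for_each_bio()/bio_for_each_segment() */
	rq_for_each_segment(bvec, req, iter) {
		if (ubd_queue_one_vec(hctx, req, off, &bvec) < 0) {
			blk_mq_requeue_request(req, true);
			return BLK_STS_OK;
		}
		off += bvec.bv_len;
	}
	return BLK_STS_OK;
}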
 
Thanks,
//richard
Richard Weinberger Oct. 15, 2018, 8:42 p.m. UTC | #5
On Monday, 15 October 2018 at 21:17:46 CEST, Richard Weinberger wrote:
> On Monday, 15 October 2018 at 10:45:41 CEST, Christoph Hellwig wrote:
> > On Mon, Oct 15, 2018 at 10:40:06AM +0200, Richard Weinberger wrote:
> > > hm, this breaks UML.
> > > Every filesystem fails to mount.
> > > 
> > > I did some very rough tests, it seems that the driver fails to read
> > > data correctly as soon the upper layer tries to get more than 4096 bytes
> > > at once out of the block device.
> > > 
> > > IOW:
> > > dd if=/dev/ubda bs=4096 count=1 skip=0 2>/dev/null| md5sum -
> > > is good.
> > > As soon I set bs to something greater it returns garbage.
> > > 
> > > Later this day I might have some cycles left to debug further.
> > 
> > It probably needs this on top:
> 
> Sadly not. I'm checking now what exactly is broken.

I take this back. Christoph's fixup makes reading work.
The previous version corrupted my test block device in interesting ways
and confused all tests.
But the removal of blk_rq_map_sg() still has issues.
Now the device blocks endlessly upon flush.

:/ # cat /proc/251/stack
[<0>] __switch_to+0x56/0x85
[<0>] __schedule+0x427/0x472
[<0>] schedule+0x7c/0x95
[<0>] schedule_timeout+0x2b/0x1d2
[<0>] io_schedule_timeout+0x2b/0x48
[<0>] wait_for_common_io.constprop.2+0xdd/0x154
[<0>] wait_for_completion_io+0x1a/0x1c
[<0>] submit_bio_wait+0x5b/0x74
[<0>] blkdev_issue_flush+0x95/0xc3
[<0>] jbd2_journal_recover+0xe4/0xf6
[<0>] jbd2_journal_load+0x183/0x3b2
[<0>] ext4_fill_super+0x23e0/0x3602
[<0>] mount_bdev+0x18f/0x1f8
[<0>] ext4_mount+0x1a/0x1c
[<0>] mount_fs+0x13/0xf6
[<0>] vfs_kern_mount+0x78/0x139
[<0>] do_mount+0x874/0xb48
[<0>] ksys_mount+0x99/0xc0
[<0>] sys_mount+0x10/0x14
[<0>] handle_syscall+0x79/0xa7
[<0>] userspace+0x487/0x514
[<0>] fork_handler+0x94/0x96
[<0>] 0xffffffffffffffff

I just checked: the number of calls to blk_mq_start_request()
matches the number of calls to __blk_mq_end_request().
So I don't really get what is being waited on.

Thanks,
//richard
Christoph Hellwig Oct. 15, 2018, 8:55 p.m. UTC | #6
On Mon, Oct 15, 2018 at 10:42:47PM +0200, Richard Weinberger wrote:
> > Sadly not. I'm checking now what exactly is broken.
> 
> I take this back. Christoph's fixup makes reading work.
> The previous version corrupted my test block device in interesting ways
> and confused all tests.
> But the removal of blk_rq_map_sg() still has issues.
> Now the device blocks endless upon flush.

I suspect we still need to special case flush.  Updated patch below
including your other suggestion:

--
From ddde6697dca1034f8c6e9b6a96305ba417123362 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 15 Oct 2018 08:52:46 +0200
Subject: ubd: remove use of blk_rq_map_sg

There is no good reason to create a scatterlist in the ubd driver,
it can just iterate the request directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/um/drivers/ubd_kern.c | 158 +++++++++++++------------------------
 1 file changed, 54 insertions(+), 104 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 9cb0cabb4e02..38492a61a21f 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -160,12 +160,6 @@ struct ubd {
 	spinlock_t lock;
 };
 
-struct ubd_pdu {
-	struct scatterlist sg[MAX_SG];
-	int start_sg, end_sg;
-	sector_t rq_pos;
-};
-
 #define DEFAULT_COW { \
 	.file =			NULL, \
 	.fd =			-1,	\
@@ -197,9 +191,6 @@ static struct proc_dir_entry *proc_ide = NULL;
 
 static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
 				 const struct blk_mq_queue_data *bd);
-static int ubd_init_request(struct blk_mq_tag_set *set,
-			    struct request *req, unsigned int hctx_idx,
-			    unsigned int numa_node);
 
 static void make_proc_ide(void)
 {
@@ -895,7 +886,6 @@ static int ubd_disk_register(int major, u64 size, int unit,
 
 static const struct blk_mq_ops ubd_mq_ops = {
 	.queue_rq = ubd_queue_rq,
-	.init_request = ubd_init_request,
 };
 
 static int ubd_add(int n, char **error_out)
@@ -918,7 +908,6 @@ static int ubd_add(int n, char **error_out)
 	ubd_dev->tag_set.queue_depth = 64;
 	ubd_dev->tag_set.numa_node = NUMA_NO_NODE;
 	ubd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
-	ubd_dev->tag_set.cmd_size = sizeof(struct ubd_pdu);
 	ubd_dev->tag_set.driver_data = ubd_dev;
 	ubd_dev->tag_set.nr_hw_queues = 1;
 
@@ -1300,123 +1289,84 @@ static void cowify_req(struct io_thread_req *req, unsigned long *bitmap,
 			   req->bitmap_words, bitmap_len);
 }
 
-/* Called with dev->lock held */
-static void prepare_request(struct request *req, struct io_thread_req *io_req,
-			    unsigned long long offset, int page_offset,
-			    int len, struct page *page)
+static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
+		u64 off, struct bio_vec *bvec)
 {
-	struct gendisk *disk = req->rq_disk;
-	struct ubd *ubd_dev = disk->private_data;
-
-	io_req->req = req;
-	io_req->fds[0] = (ubd_dev->cow.file != NULL) ? ubd_dev->cow.fd :
-		ubd_dev->fd;
-	io_req->fds[1] = ubd_dev->fd;
-	io_req->cow_offset = -1;
-	io_req->offset = offset;
-	io_req->length = len;
-	io_req->error = 0;
-	io_req->sector_mask = 0;
-
-	io_req->op = (rq_data_dir(req) == READ) ? UBD_READ : UBD_WRITE;
-	io_req->offsets[0] = 0;
-	io_req->offsets[1] = ubd_dev->cow.data_offset;
-	io_req->buffer = page_address(page) + page_offset;
-	io_req->sectorsize = 1 << 9;
-
-	if(ubd_dev->cow.file != NULL)
-		cowify_req(io_req, ubd_dev->cow.bitmap,
-			   ubd_dev->cow.bitmap_offset, ubd_dev->cow.bitmap_len);
-
-}
+	struct ubd *dev = hctx->queue->queuedata;
+	struct io_thread_req *io_req;
+	int written;
 
-/* Called with dev->lock held */
-static void prepare_flush_request(struct request *req,
-				  struct io_thread_req *io_req)
-{
-	struct gendisk *disk = req->rq_disk;
-	struct ubd *ubd_dev = disk->private_data;
+	io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC);
+	if (!io_req)
+		return -ENOMEM;
 
 	io_req->req = req;
-	io_req->fds[0] = (ubd_dev->cow.file != NULL) ? ubd_dev->cow.fd :
-		ubd_dev->fd;
-	io_req->op = UBD_FLUSH;
-}
-
-static void submit_request(struct io_thread_req *io_req, struct ubd *dev)
-{
-	int n = os_write_file(thread_fd, &io_req,
-			     sizeof(io_req));
+	if (dev->cow.file)
+		io_req->fds[0] = dev->cow.fd;
+	else
+		io_req->fds[0] = dev->fd;
 
-	if (n != sizeof(io_req)) {
-		if (n != -EAGAIN)
-			pr_err("write to io thread failed: %d\n", -n);
+	if (req_op(req) == REQ_OP_FLUSH) {
+		io_req->op = UBD_FLUSH;
+	} else {
+		io_req->fds[1] = dev->fd;
+		io_req->cow_offset = -1;
+		io_req->offset = off;
+		io_req->length = bvec->bv_len;
+		io_req->error = 0;
+		io_req->sector_mask = 0;
+
+		io_req->op = rq_data_dir(req) == READ ? UBD_READ : UBD_WRITE;
+		io_req->offsets[0] = 0;
+		io_req->offsets[1] = dev->cow.data_offset;
+		io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset;
+		io_req->sectorsize = 1 << 9;
+
+		if (dev->cow.file) {
+			cowify_req(io_req, dev->cow.bitmap,
+				   dev->cow.bitmap_offset, dev->cow.bitmap_len);
+		}
+	}
 
+	written = os_write_file(thread_fd, &io_req, sizeof(io_req));
+	if (written != sizeof(io_req)) {
+		if (written != -EAGAIN)
+			pr_err("write to io thread failed: %d\n", -written);
 		blk_mq_requeue_request(io_req->req, true);
 		kfree(io_req);
 	}
+
+	return 0;
 }
 
 static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
 				 const struct blk_mq_queue_data *bd)
 {
 	struct request *req = bd->rq;
-	struct ubd *dev = hctx->queue->queuedata;
-	struct ubd_pdu *pdu = blk_mq_rq_to_pdu(req);
-	struct io_thread_req *io_req;
+	int ret = 0;
 
 	blk_mq_start_request(req);
 
-	pdu->rq_pos = blk_rq_pos(req);
-	pdu->start_sg = 0;
-	pdu->end_sg = blk_rq_map_sg(req->q, req, pdu->sg);
-
 	if (req_op(req) == REQ_OP_FLUSH) {
-		io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC);
-		if (io_req == NULL) {
-			blk_mq_requeue_request(req, true);
-			goto done;
-		}
-		prepare_flush_request(req, io_req);
-		submit_request(io_req, dev);
-
-		goto done;
-	}
-
-	while (pdu->start_sg < pdu->end_sg) {
-		struct scatterlist *sg = &pdu->sg[pdu->start_sg];
-
-		io_req = kmalloc(sizeof(struct io_thread_req),
-				 GFP_ATOMIC);
-		if (io_req == NULL) {
-			blk_mq_requeue_request(req, true);
-			goto done;
+		ret = ubd_queue_one_vec(hctx, req, 0, NULL);
+	} else {
+		struct req_iterator iter;
+		struct bio_vec bvec;
+		u64 off = (u64)blk_rq_pos(req) << 9;
+
+		rq_for_each_segment(bvec, req, iter) {
+			ret = ubd_queue_one_vec(hctx, req, off, &bvec);
+			if (ret < 0)
+				goto out;
+			off += bvec.bv_len;
 		}
-		prepare_request(req, io_req,
-				(unsigned long long)pdu->rq_pos << 9,
-				sg->offset, sg->length, sg_page(sg));
-
-		submit_request(io_req, dev);
-
-		pdu->rq_pos += sg->length >> 9;
-		pdu->start_sg++;
 	}
-
-done:
+out:
+	if (ret < 0)
+		blk_mq_requeue_request(req, true);
 	return BLK_STS_OK;
 }
 
-static int ubd_init_request(struct blk_mq_tag_set *set,
-			    struct request *req, unsigned int hctx_idx,
-			    unsigned int numa_node)
-{
-	struct ubd_pdu *pdu = blk_mq_rq_to_pdu(req);
-
-	sg_init_table(pdu->sg, MAX_SG);
-
-	return 0;
-}
-
 static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct ubd *ubd_dev = bdev->bd_disk->private_data;
Richard Weinberger Oct. 15, 2018, 9:46 p.m. UTC | #7
On Monday, 15 October 2018 at 22:55:29 CEST, Christoph Hellwig wrote:
> On Mon, Oct 15, 2018 at 10:42:47PM +0200, Richard Weinberger wrote:
> > > Sadly not. I'm checking now what exactly is broken.
> > 
> > I take this back. Christoph's fixup makes reading work.
> > The previous version corrupted my test block device in interesting ways
> > and confused all tests.
> > But the removal of blk_rq_map_sg() still has issues.
> > Now the device blocks endless upon flush.
> 
> I suspect we still need to special case flush.  Updated patch below
> including your other suggestion:

While playing further with the patch I managed to hit
BUG_ON(blk_queued_rq(rq)) in blk_mq_requeue_request().

UML requeues the request in ubd_queue_one_vec() if it was not able
to submit the request to the host io-thread.
The fd can return -EAGAIN, in which case UML has to try again later.

Isn't this allowed in that context?

drivers/block/xen-blkfront.c seems to have faced the same problem, since
it does a list_del_init(&req->queuelist) right before calling
blk_mq_requeue_request() o_O.
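
(Roughly this pattern, sketched from what the xen-blkfront recovery path
appears to do; req is the pending request being replayed:)

	/* empty rq->queuelist first, so BUG_ON(blk_queued_rq(rq)) cannot fire */
	list_del_init(&req->queuelist);
	blk_mq_requeue_request(req, true);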

Thanks,
//richard
Jens Axboe Oct. 15, 2018, 10:04 p.m. UTC | #8
On 10/15/18 3:46 PM, Richard Weinberger wrote:
> On Monday, 15 October 2018 at 22:55:29 CEST, Christoph Hellwig wrote:
>> On Mon, Oct 15, 2018 at 10:42:47PM +0200, Richard Weinberger wrote:
>>>> Sadly not. I'm checking now what exactly is broken.
>>>
>>> I take this back. Christoph's fixup makes reading work.
>>> The previous version corrupted my test block device in interesting ways
>>> and confused all tests.
>>> But the removal of blk_rq_map_sg() still has issues.
>>> Now the device blocks endless upon flush.
>>
>> I suspect we still need to special case flush.  Updated patch below
>> including your other suggestion:
> 
> While playing further with the patch I managed to hit
> BUG_ON(blk_queued_rq(rq)) in blk_mq_requeue_request().
> 
> UML requeues the request in ubd_queue_one_vec() if it was not able
> to submit the request to the host io-thread.
> The fd can return -EAGAIN, then UML has to try later.
> 
> Isn't this allowed in that context?

It is; the problem is that ubd_queue_one_vec() doesn't always return an
error. The caller is doing a loop per bio, so we can encounter an
error, requeue, and then the caller will call us again. We're in
an illegal state at that point, and the next requeue will make that
obvious since the request is already pending. Actually, both the caller
and ubd_queue_one_vec() requeue. So it's a bit of a mess; the patch below
might help.


--- arch/um//drivers/ubd_kern.c~	2018-10-15 16:01:06.813391968 -0600
+++ arch/um/drivers/ubd_kern.c	2018-10-15 16:03:51.870657352 -0600
@@ -1294,7 +1294,7 @@
 {
 	struct ubd *dev = hctx->queue->queuedata;
 	struct io_thread_req *io_req;
-	int written;
+	int ret;
 
 	io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC);
 	if (!io_req)
@@ -1328,15 +1328,14 @@
 		}
 	}
 
-	written = os_write_file(thread_fd, &io_req, sizeof(io_req));
-	if (written != sizeof(io_req)) {
+	ret = os_write_file(thread_fd, &io_req, sizeof(io_req));
+	if (ret != sizeof(io_req)) {
-		if (written != -EAGAIN)
+		if (ret != -EAGAIN)
-			pr_err("write to io thread failed: %d\n", -written);
-		blk_mq_requeue_request(io_req->req, true);
+			pr_err("write to io thread failed: %d\n", -ret);
 		kfree(io_req);
 	}
 
-	return 0;
+	return ret;
 }
 
 static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
Richard Weinberger Oct. 15, 2018, 10:44 p.m. UTC | #9
On Tuesday, 16 October 2018 at 00:04:20 CEST, Jens Axboe wrote:
> On 10/15/18 3:46 PM, Richard Weinberger wrote:
> > On Monday, 15 October 2018 at 22:55:29 CEST, Christoph Hellwig wrote:
> >> On Mon, Oct 15, 2018 at 10:42:47PM +0200, Richard Weinberger wrote:
> >>>> Sadly not. I'm checking now what exactly is broken.
> >>>
> >>> I take this back. Christoph's fixup makes reading work.
> >>> The previous version corrupted my test block device in interesting ways
> >>> and confused all tests.
> >>> But the removal of blk_rq_map_sg() still has issues.
> >>> Now the device blocks endless upon flush.
> >>
> >> I suspect we still need to special case flush.  Updated patch below
> >> including your other suggestion:
> > 
> > While playing further with the patch I managed to hit
> > BUG_ON(blk_queued_rq(rq)) in blk_mq_requeue_request().
> > 
> > UML requeues the request in ubd_queue_one_vec() if it was not able
> > to submit the request to the host io-thread.
> > The fd can return -EAGAIN, then UML has to try later.
> > 
> > Isn't this allowed in that context?
> 
> It is, the problem is that queue_one_vec() doesn't always return an
> error. The caller is doing a loop per bio, so we can encounter an
> error, requeue, and then the caller will call us again. We're in
> an illegal state at that point, and the next requeue will make that
> obvious since it's already pending. Actually, both the caller and
> ubd_queue_one_vec() also requeue. So it's a bit of a mess, the below
> might help.

I agree, the driver *is* a mess.
Unless someone else volunteers to clean it up, I'll push that task onto
my never-ending TODO list.

Thanks for your hint about the illegal state.
Now, with correct requeuing, the driver seems to work fine!
Write/flush support also suffered from that but didn't trigger the BUG_ON()...

Thanks,
//richard
Jens Axboe Oct. 16, 2018, 2:19 a.m. UTC | #10
On 10/15/18 4:44 PM, Richard Weinberger wrote:
> On Tuesday, 16 October 2018 at 00:04:20 CEST, Jens Axboe wrote:
>> On 10/15/18 3:46 PM, Richard Weinberger wrote:
> >> On Monday, 15 October 2018 at 22:55:29 CEST, Christoph Hellwig wrote:
>>>> On Mon, Oct 15, 2018 at 10:42:47PM +0200, Richard Weinberger wrote:
>>>>>> Sadly not. I'm checking now what exactly is broken.
>>>>>
>>>>> I take this back. Christoph's fixup makes reading work.
>>>>> The previous version corrupted my test block device in interesting ways
>>>>> and confused all tests.
>>>>> But the removal of blk_rq_map_sg() still has issues.
>>>>> Now the device blocks endless upon flush.
>>>>
>>>> I suspect we still need to special case flush.  Updated patch below
>>>> including your other suggestion:
>>>
>>> While playing further with the patch I managed to hit
>>> BUG_ON(blk_queued_rq(rq)) in blk_mq_requeue_request().
>>>
>>> UML requeues the request in ubd_queue_one_vec() if it was not able
>>> to submit the request to the host io-thread.
>>> The fd can return -EAGAIN, then UML has to try later.
>>>
>>> Isn't this allowed in that context?
>>
>> It is, the problem is that queue_one_vec() doesn't always return an
>> error. The caller is doing a loop per bio, so we can encounter an
>> error, requeue, and then the caller will call us again. We're in
>> an illegal state at that point, and the next requeue will make that
>> obvious since it's already pending. Actually, both the caller and
>> ubd_queue_one_vec() also requeue. So it's a bit of a mess, the below
>> might help.
> 
> I agree, the driver *is* a mess.
> Unless someone else volunteers to clean it up, I'll push that task on
> my never ending TODO list.

I doubt you'll have to fight anyone for that task ;-)

> Thanks for your hint with the illegal state.
> Now with correct requeuing the driver seems to work fine!
> Write/Flush support also suffered from that but didn't trigger the BUG_ON()...

OK good, at least we're making progress!
Richard Weinberger Oct. 16, 2018, 8:38 a.m. UTC | #11
On Tuesday, 16 October 2018 at 04:19:51 CEST, Jens Axboe wrote:
> On 10/15/18 4:44 PM, Richard Weinberger wrote:
> > On Tuesday, 16 October 2018 at 00:04:20 CEST, Jens Axboe wrote:
> >> On 10/15/18 3:46 PM, Richard Weinberger wrote:
> >>> On Monday, 15 October 2018 at 22:55:29 CEST, Christoph Hellwig wrote:
> >>>> On Mon, Oct 15, 2018 at 10:42:47PM +0200, Richard Weinberger wrote:
> >>>>>> Sadly not. I'm checking now what exactly is broken.
> >>>>>
> >>>>> I take this back. Christoph's fixup makes reading work.
> >>>>> The previous version corrupted my test block device in interesting ways
> >>>>> and confused all tests.
> >>>>> But the removal of blk_rq_map_sg() still has issues.
> >>>>> Now the device blocks endless upon flush.
> >>>>
> >>>> I suspect we still need to special case flush.  Updated patch below
> >>>> including your other suggestion:
> >>>
> >>> While playing further with the patch I managed to hit
> >>> BUG_ON(blk_queued_rq(rq)) in blk_mq_requeue_request().
> >>>
> >>> UML requeues the request in ubd_queue_one_vec() if it was not able
> >>> to submit the request to the host io-thread.
> >>> The fd can return -EAGAIN, then UML has to try later.
> >>>
> >>> Isn't this allowed in that context?
> >>
> >> It is, the problem is that queue_one_vec() doesn't always return an
> >> error. The caller is doing a loop per bio, so we can encounter an
> >> error, requeue, and then the caller will call us again. We're in
> >> an illegal state at that point, and the next requeue will make that
> >> obvious since it's already pending. Actually, both the caller and
> >> ubd_queue_one_vec() also requeue. So it's a bit of a mess, the below
> >> might help.
> > 
> > I agree, the driver *is* a mess.
> > Unless someone else volunteers to clean it up, I'll push that task on
> > my never ending TODO list.
> 
> I doubt you'll have to fight anyone for that task ;-)
> 
> > Thanks for your hint with the illegal state.
> > Now with correct requeuing the driver seems to work fine!
> > Write/Flush support also suffered from that but didn't trigger the BUG_ON()...
> 
> OK good, at least we're making progress!

Yes. Shall I send a patch with your suggestion or will you?

I have one more question: in your first conversion you set queue_depth to 2.
How does one choose this value?
My conversion uses 64, which is more or less an educated guess... ;)

Thanks,
//richard
Jens Axboe Oct. 16, 2018, 2:26 p.m. UTC | #12
On 10/16/18 2:38 AM, Richard Weinberger wrote:
> On Tuesday, 16 October 2018 at 04:19:51 CEST, Jens Axboe wrote:
>> On 10/15/18 4:44 PM, Richard Weinberger wrote:
> >>> On Tuesday, 16 October 2018 at 00:04:20 CEST, Jens Axboe wrote:
>>>> On 10/15/18 3:46 PM, Richard Weinberger wrote:
> >>>>> On Monday, 15 October 2018 at 22:55:29 CEST, Christoph Hellwig wrote:
>>>>>> On Mon, Oct 15, 2018 at 10:42:47PM +0200, Richard Weinberger wrote:
>>>>>>>> Sadly not. I'm checking now what exactly is broken.
>>>>>>>
>>>>>>> I take this back. Christoph's fixup makes reading work.
>>>>>>> The previous version corrupted my test block device in interesting ways
>>>>>>> and confused all tests.
>>>>>>> But the removal of blk_rq_map_sg() still has issues.
>>>>>>> Now the device blocks endless upon flush.
>>>>>>
>>>>>> I suspect we still need to special case flush.  Updated patch below
>>>>>> including your other suggestion:
>>>>>
>>>>> While playing further with the patch I managed to hit
>>>>> BUG_ON(blk_queued_rq(rq)) in blk_mq_requeue_request().
>>>>>
>>>>> UML requeues the request in ubd_queue_one_vec() if it was not able
>>>>> to submit the request to the host io-thread.
>>>>> The fd can return -EAGAIN, then UML has to try later.
>>>>>
>>>>> Isn't this allowed in that context?
>>>>
>>>> It is, the problem is that queue_one_vec() doesn't always return an
>>>> error. The caller is doing a loop per bio, so we can encounter an
>>>> error, requeue, and then the caller will call us again. We're in
>>>> an illegal state at that point, and the next requeue will make that
>>>> obvious since it's already pending. Actually, both the caller and
>>>> ubd_queue_one_vec() also requeue. So it's a bit of a mess, the below
>>>> might help.
>>>
>>> I agree, the driver *is* a mess.
>>> Unless someone else volunteers to clean it up, I'll push that task on
>>> my never ending TODO list.
>>
>> I doubt you'll have to fight anyone for that task ;-)
>>
>>> Thanks for your hint with the illegal state.
>>> Now with correct requeuing the driver seems to work fine!
>>> Write/Flush support also suffered from that but didn't trigger the BUG_ON()...
>>
>> OK good, at least we're making progress!
> 
> Yes. Shall I send a patch with your suggestion or will you?

Christoph should just fold it in; the bug only exists after his
change to it.

> I have one more question, in your first conversion you set queue_depth to 2.
> How does one know this value?
> My conversion has 64, which is more or less an educated guess... ;)

64 is most likely just fine. Some drivers rely on having only 1 request in
flight, and it's easier to just let blk-mq take care of that. Outside of that,
there aren't any magic values. We should probably just use BLKDEV_MAX_RQ
for drivers that don't have a specific hw limit or need.
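
(For illustration, in ubd_add() that suggestion would amount to roughly the
following; a sketch, assuming ubd has no hardware-specific limit:)

	ubd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;	/* block layer default instead of a magic 64 */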
Richard Weinberger Oct. 16, 2018, 10:43 p.m. UTC | #13
On Mon, Oct 15, 2018 at 8:56 AM Christoph Hellwig <hch@lst.de> wrote:
>
> There is no good reason to create a scatterlist in the ubd driver,
> it can just iterate the request directly.

BTW: Does it make sense to drop blk_rq_map_sg() from
drivers/mtd/ubi/block.c too?
If so, we have to allocate a temporary structure for the worker thread
for each segment, just like UBD already does.
I'm not sure if that is cheaper than blk_rq_map_sg().
Christoph Hellwig Oct. 17, 2018, 6:14 a.m. UTC | #14
On Wed, Oct 17, 2018 at 12:43:14AM +0200, Richard Weinberger wrote:
> On Mon, Oct 15, 2018 at 8:56 AM Christoph Hellwig <hch@lst.de> wrote:
> >
> > There is no good reason to create a scatterlist in the ubd driver,
> > it can just iterate the request directly.
> 
> BTW: Does it make sense to drop blk_rq_map_sg() from
> drivers/mtd/ubi/block.c too?
> If so, we have to allocate a temporary structure for the worker thread
> for each segment, just like UBD already does.
> I'm not sure if that is cheaper than blk_rq_map_sg().

UBI should not need a new per-thread structure, mostly because there
are no threads involved.  The scatterlist support in UBI only seems
to exist for ubiblock, but it goes down a few layers.  In the end
all that could switch to an iov_iter-like setup and clean things up
a lot, but it would be a fair amount of work.
Christoph Hellwig Oct. 17, 2018, 6:21 a.m. UTC | #15
On Tue, Oct 16, 2018 at 08:26:31AM -0600, Jens Axboe wrote:
> > Yes. Shall I send a patch with your suggestion or will you?
> 
> Christoph should just fold it in, the bug only exists after his
> change to it.

Sorry, I missed what suggestion we had.  Is that the patch from Jens
earlier?

> 
> > I have one more question, in your first conversion you set queue_depth to 2.
> > How does one know this value?
> > My conversion has 64, which is more or less an educated guess... ;)
> 
> 64 is most likely just fine. Some drivers rely on having 1 in flight and
> it's easier to manage to just let blk-mq take care of that. Outside of that,
> there aren't any magic values. We should probably just use BLKDEV_MAX_RQ
> for ones that don't have a specific hw limit or need.

Is a queue depth > 1 actually safe for ubd?  There are some odd global
variables tracking struct io_thread_req.
Richard Weinberger Oct. 18, 2018, 9:04 p.m. UTC | #16
On Wednesday, 17 October 2018 at 08:21:51 CEST, Christoph Hellwig wrote:
> On Tue, Oct 16, 2018 at 08:26:31AM -0600, Jens Axboe wrote:
> > > Yes. Shall I send a patch with your suggestion or will you?
> > 
> > Christoph should just fold it in, the bug only exists after his
> > change to it.
> 
> Sorry, I missed what suggestion we had.  Is that the patch form Jens
> earlier?

Just sent a v2 for you with all suggestions folded in. :)

> > 
> > > I have one more question, in your first conversion you set queue_depth to 2.
> > > How does one know this value?
> > > My conversion has 64, which is more or less an educated guess... ;)
> > 
> > 64 is most likely just fine. Some drivers rely on having 1 in flight and
> > it's easier to manage to just let blk-mq take care of that. Outside of that,
> > there aren't any magic values. We should probably just use BLKDEV_MAX_RQ
> > for ones that don't have a specific hw limit or need.
> 
> Is a queue depth > 1 actually safe for ubd?  There are some odd global
> variables tracking struct io_thread_req.

This means we can have up to tag_set.queue_depth concurrent ubd_queue_rq() calls?
The global variables should be safe since we use them only from our IO thread
and UML is UP anyway.

But we should double-check; thanks for the hint.
Anton, can you please check?

Thanks,
//richard

Patch

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 9cb0cabb4e02..aafe9c467fdc 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -160,12 +160,6 @@  struct ubd {
 	spinlock_t lock;
 };
 
-struct ubd_pdu {
-	struct scatterlist sg[MAX_SG];
-	int start_sg, end_sg;
-	sector_t rq_pos;
-};
-
 #define DEFAULT_COW { \
 	.file =			NULL, \
 	.fd =			-1,	\
@@ -197,9 +191,6 @@  static struct proc_dir_entry *proc_ide = NULL;
 
 static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
 				 const struct blk_mq_queue_data *bd);
-static int ubd_init_request(struct blk_mq_tag_set *set,
-			    struct request *req, unsigned int hctx_idx,
-			    unsigned int numa_node);
 
 static void make_proc_ide(void)
 {
@@ -895,7 +886,6 @@  static int ubd_disk_register(int major, u64 size, int unit,
 
 static const struct blk_mq_ops ubd_mq_ops = {
 	.queue_rq = ubd_queue_rq,
-	.init_request = ubd_init_request,
 };
 
 static int ubd_add(int n, char **error_out)
@@ -918,7 +908,6 @@  static int ubd_add(int n, char **error_out)
 	ubd_dev->tag_set.queue_depth = 64;
 	ubd_dev->tag_set.numa_node = NUMA_NO_NODE;
 	ubd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
-	ubd_dev->tag_set.cmd_size = sizeof(struct ubd_pdu);
 	ubd_dev->tag_set.driver_data = ubd_dev;
 	ubd_dev->tag_set.nr_hw_queues = 1;
 
@@ -1300,123 +1289,80 @@  static void cowify_req(struct io_thread_req *req, unsigned long *bitmap,
 			   req->bitmap_words, bitmap_len);
 }
 
-/* Called with dev->lock held */
-static void prepare_request(struct request *req, struct io_thread_req *io_req,
-			    unsigned long long offset, int page_offset,
-			    int len, struct page *page)
+static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
+		sector_t pos, struct bio_vec *bvec)
 {
-	struct gendisk *disk = req->rq_disk;
-	struct ubd *ubd_dev = disk->private_data;
-
-	io_req->req = req;
-	io_req->fds[0] = (ubd_dev->cow.file != NULL) ? ubd_dev->cow.fd :
-		ubd_dev->fd;
-	io_req->fds[1] = ubd_dev->fd;
-	io_req->cow_offset = -1;
-	io_req->offset = offset;
-	io_req->length = len;
-	io_req->error = 0;
-	io_req->sector_mask = 0;
-
-	io_req->op = (rq_data_dir(req) == READ) ? UBD_READ : UBD_WRITE;
-	io_req->offsets[0] = 0;
-	io_req->offsets[1] = ubd_dev->cow.data_offset;
-	io_req->buffer = page_address(page) + page_offset;
-	io_req->sectorsize = 1 << 9;
-
-	if(ubd_dev->cow.file != NULL)
-		cowify_req(io_req, ubd_dev->cow.bitmap,
-			   ubd_dev->cow.bitmap_offset, ubd_dev->cow.bitmap_len);
-
-}
+	struct ubd *dev = hctx->queue->queuedata;
+	struct io_thread_req *io_req;
+	int written;
 
-/* Called with dev->lock held */
-static void prepare_flush_request(struct request *req,
-				  struct io_thread_req *io_req)
-{
-	struct gendisk *disk = req->rq_disk;
-	struct ubd *ubd_dev = disk->private_data;
+	io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC);
+	if (!io_req)
+		return -ENOMEM;
 
 	io_req->req = req;
-	io_req->fds[0] = (ubd_dev->cow.file != NULL) ? ubd_dev->cow.fd :
-		ubd_dev->fd;
-	io_req->op = UBD_FLUSH;
-}
-
-static void submit_request(struct io_thread_req *io_req, struct ubd *dev)
-{
-	int n = os_write_file(thread_fd, &io_req,
-			     sizeof(io_req));
+	if (dev->cow.file)
+		io_req->fds[0] = dev->cow.fd;
+	else
+		io_req->fds[0] = dev->fd;
 
-	if (n != sizeof(io_req)) {
-		if (n != -EAGAIN)
-			pr_err("write to io thread failed: %d\n", -n);
+	if (req_op(req) == REQ_OP_FLUSH) {
+		io_req->op = UBD_FLUSH;
+	} else {
+		io_req->fds[1] = dev->fd;
+		io_req->cow_offset = -1;
+		io_req->offset = (unsigned long long)pos << 9;
+		io_req->length = bvec->bv_len;
+		io_req->error = 0;
+		io_req->sector_mask = 0;
+
+		io_req->op = rq_data_dir(req) == READ ? UBD_READ : UBD_WRITE;
+		io_req->offsets[0] = 0;
+		io_req->offsets[1] = dev->cow.data_offset;
+		io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset;
+		io_req->sectorsize = 1 << 9;
+
+		if (dev->cow.file) {
+			cowify_req(io_req, dev->cow.bitmap,
+				   dev->cow.bitmap_offset, dev->cow.bitmap_len);
+		}
+	}
 
+	written = os_write_file(thread_fd, &io_req, sizeof(io_req));
+	if (written != sizeof(io_req)) {
+		if (written != -EAGAIN)
+			pr_err("write to io thread failed: %d\n", -written);
 		blk_mq_requeue_request(io_req->req, true);
 		kfree(io_req);
 	}
+
+	return 0;
 }
 
 static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
 				 const struct blk_mq_queue_data *bd)
 {
 	struct request *req = bd->rq;
-	struct ubd *dev = hctx->queue->queuedata;
-	struct ubd_pdu *pdu = blk_mq_rq_to_pdu(req);
-	struct io_thread_req *io_req;
+	struct bio *bio = req->bio;
+	struct bvec_iter iter;
+	struct bio_vec bvec;
+	sector_t pos = blk_rq_pos(req);
 
 	blk_mq_start_request(req);
 
-	pdu->rq_pos = blk_rq_pos(req);
-	pdu->start_sg = 0;
-	pdu->end_sg = blk_rq_map_sg(req->q, req, pdu->sg);
-
-	if (req_op(req) == REQ_OP_FLUSH) {
-		io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC);
-		if (io_req == NULL) {
-			blk_mq_requeue_request(req, true);
-			goto done;
-		}
-		prepare_flush_request(req, io_req);
-		submit_request(io_req, dev);
-
-		goto done;
-	}
-
-	while (pdu->start_sg < pdu->end_sg) {
-		struct scatterlist *sg = &pdu->sg[pdu->start_sg];
-
-		io_req = kmalloc(sizeof(struct io_thread_req),
-				 GFP_ATOMIC);
-		if (io_req == NULL) {
-			blk_mq_requeue_request(req, true);
-			goto done;
+	for_each_bio(bio) {
+		bio_for_each_segment(bvec, bio, iter) {
+			if (ubd_queue_one_vec(hctx, req, pos, &bvec) < 0) {
+				blk_mq_requeue_request(req, true);
+				return BLK_STS_OK;
+			}
+			pos += bvec.bv_len;
 		}
-		prepare_request(req, io_req,
-				(unsigned long long)pdu->rq_pos << 9,
-				sg->offset, sg->length, sg_page(sg));
-
-		submit_request(io_req, dev);
-
-		pdu->rq_pos += sg->length >> 9;
-		pdu->start_sg++;
 	}
 
-done:
 	return BLK_STS_OK;
 }
 
-static int ubd_init_request(struct blk_mq_tag_set *set,
-			    struct request *req, unsigned int hctx_idx,
-			    unsigned int numa_node)
-{
-	struct ubd_pdu *pdu = blk_mq_rq_to_pdu(req);
-
-	sg_init_table(pdu->sg, MAX_SG);
-
-	return 0;
-}
-
 static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct ubd *ubd_dev = bdev->bd_disk->private_data;