Patch Detail
get:
Show a patch.
patch:
Update a patch.
put:
Update a patch.
GET /api/1.1/patches/2229524/?format=api
{ "id": 2229524, "url": "http://patchwork.ozlabs.org/api/1.1/patches/2229524/?format=api", "web_url": "http://patchwork.ozlabs.org/project/linux-ext4/patch/20260428114730.14384-1-changfengnan@bytedance.com/", "project": { "id": 8, "url": "http://patchwork.ozlabs.org/api/1.1/projects/8/?format=api", "name": "Linux ext4 filesystem development", "link_name": "linux-ext4", "list_id": "linux-ext4.vger.kernel.org", "list_email": "linux-ext4@vger.kernel.org", "web_url": null, "scm_url": null, "webscm_url": null }, "msgid": "<20260428114730.14384-1-changfengnan@bytedance.com>", "date": "2026-04-28T11:47:30", "name": "[v2] iomap: add simple read path for small direct I/O", "commit_ref": null, "pull_url": null, "state": "new", "archived": false, "hash": "58e0a5012614c0afb7eacc31720b9e403547875f", "submitter": { "id": 85004, "url": "http://patchwork.ozlabs.org/api/1.1/people/85004/?format=api", "name": "Fengnan Chang", "email": "changfengnan@bytedance.com" }, "delegate": null, "mbox": "http://patchwork.ozlabs.org/project/linux-ext4/patch/20260428114730.14384-1-changfengnan@bytedance.com/mbox/", "series": [ { "id": 501835, "url": "http://patchwork.ozlabs.org/api/1.1/series/501835/?format=api", "web_url": "http://patchwork.ozlabs.org/project/linux-ext4/list/?series=501835", "date": "2026-04-28T11:47:30", "name": "[v2] iomap: add simple read path for small direct I/O", "version": 2, "mbox": "http://patchwork.ozlabs.org/series/501835/mbox/" } ], "comments": "http://patchwork.ozlabs.org/api/patches/2229524/comments/", "check": "pending", "checks": "http://patchwork.ozlabs.org/api/patches/2229524/checks/", "tags": {}, "headers": { "Return-Path": "\n <SRS0=h9dM=C3=vger.kernel.org=linux-ext4+bounces-16171-patchwork-incoming=ozlabs.org@ozlabs.org>", "X-Original-To": [ "incoming@patchwork.ozlabs.org", "linux-ext4@vger.kernel.org" ], "Delivered-To": [ "patchwork-incoming@legolas.ozlabs.org", "patchwork-incoming@ozlabs.org" ], "Authentication-Results": [ "legolas.ozlabs.org;\n\tdkim=pass (2048-bit key;\n unprotected) header.d=bytedance.com header.i=@bytedance.com\n header.a=rsa-sha256 header.s=2212171451 header.b=UCKZORJz;\n\tdkim-atps=neutral", "legolas.ozlabs.org;\n spf=pass (sender SPF authorized) smtp.mailfrom=ozlabs.org\n (client-ip=2404:9400:2221:ea00::3; helo=mail.ozlabs.org;\n envelope-from=srs0=h9dm=c3=vger.kernel.org=linux-ext4+bounces-16171-patchwork-incoming=ozlabs.org@ozlabs.org;\n receiver=patchwork.ozlabs.org)", "gandalf.ozlabs.org;\n arc=pass smtp.remote-ip=\"2600:3c09:e001:a7::12fc:5321\"\n arc.chain=subspace.kernel.org", "gandalf.ozlabs.org;\n dmarc=pass (p=quarantine dis=none) header.from=bytedance.com", "gandalf.ozlabs.org;\n\tdkim=pass (2048-bit key;\n unprotected) header.d=bytedance.com header.i=@bytedance.com\n header.a=rsa-sha256 header.s=2212171451 header.b=UCKZORJz;\n\tdkim-atps=neutral", "gandalf.ozlabs.org;\n spf=pass (sender SPF authorized) smtp.mailfrom=vger.kernel.org\n (client-ip=2600:3c09:e001:a7::12fc:5321; helo=sto.lore.kernel.org;\n envelope-from=linux-ext4+bounces-16171-patchwork-incoming=ozlabs.org@vger.kernel.org;\n receiver=ozlabs.org)", "smtp.subspace.kernel.org;\n\tdkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com\n header.b=\"UCKZORJz\"", "smtp.subspace.kernel.org;\n arc=none smtp.client-ip=101.45.255.114", "smtp.subspace.kernel.org;\n dmarc=pass (p=quarantine dis=none) header.from=bytedance.com", "smtp.subspace.kernel.org;\n spf=pass smtp.mailfrom=bytedance.com" ], "Received": [ "from mail.ozlabs.org (mail.ozlabs.org [IPv6:2404:9400:2221:ea00::3])\n\t(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)\n\t key-exchange x25519)\n\t(No client certificate requested)\n\tby legolas.ozlabs.org (Postfix) with ESMTPS id 4g4fDc5ZFJz1yHv\n\tfor <incoming@patchwork.ozlabs.org>; Tue, 28 Apr 2026 22:01:52 +1000 (AEST)", "from mail.ozlabs.org (mail.ozlabs.org [IPv6:2404:9400:2221:ea00::3])\n\tby gandalf.ozlabs.org (Postfix) with ESMTP id 4g4fDc55Mbz4wBD\n\tfor <incoming@patchwork.ozlabs.org>; Tue, 28 Apr 2026 22:01:52 +1000 (AEST)", "by gandalf.ozlabs.org (Postfix)\n\tid 4g4fDc4xZtz4wCm; Tue, 28 Apr 2026 22:01:52 +1000 (AEST)", "from sto.lore.kernel.org (sto.lore.kernel.org\n [IPv6:2600:3c09:e001:a7::12fc:5321])\n\t(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)\n\t key-exchange x25519)\n\t(No client certificate requested)\n\tby gandalf.ozlabs.org (Postfix) with ESMTPS id 4g4fDW6CqSz4wBD\n\tfor <patchwork-incoming@ozlabs.org>; Tue, 28 Apr 2026 22:01:47 +1000 (AEST)", "from smtp.subspace.kernel.org (conduit.subspace.kernel.org\n [100.90.174.1])\n\tby sto.lore.kernel.org (Postfix) with ESMTP id C0DE8300334B\n\tfor <patchwork-incoming@ozlabs.org>; Tue, 28 Apr 2026 12:01:10 +0000 (UTC)", "from localhost.localdomain (localhost.localdomain [127.0.0.1])\n\tby smtp.subspace.kernel.org (Postfix) with ESMTP id 14FB73F23B5;\n\tTue, 28 Apr 2026 12:01:08 +0000 (UTC)", "from sg-3-114.ptr.tlmpb.com (sg-3-114.ptr.tlmpb.com\n [101.45.255.114])\n\t(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))\n\t(No client certificate requested)\n\tby smtp.subspace.kernel.org (Postfix) with ESMTPS id 69E833F23DA\n\tfor <linux-ext4@vger.kernel.org>; Tue, 28 Apr 2026 12:00:59 +0000 (UTC)" ], "ARC-Seal": [ "i=2; a=rsa-sha256; d=ozlabs.org; s=201707; t=1777377712; cv=pass;\n\tb=ZsoMMCqDdTENPLlSwDkddlgUutP4t/qiJK/rR7qXWQxUxVI/7qTi+9PFIrazGb7XX2wov46HZqwXS72WMZmTaEwsGqGFRGhOeHgJr6xdP+KvbxUwigfxtoP/8JiMxDjAEhOK7uc4GlxUEB01ODCn5FHFkk30WWvgnwtA3tAIq0CMaPvqeWXWTSFvfKkI7PRMRzrKKBwVJO0E0SEH0zLTufINchpHjwPIk2BImxoBEnuUbgCrQmmqnukFIeb0/Nh8pl7c7qdyfWfaMzLcCTiao4bCB9TIUgBDd9kX1QrESeLWqdo+ulTisqivPoPviGPTHFXb+Ayjg6XBwfOitkhxrQ==", "i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;\n\tt=1777377666; cv=none;\n b=mVJGoMbyBtioRvCERn4fdrAEMe7KnJnKgW35uzKwpmKZ7/VrHpahUB9Dcc7BFOHMoKvLB+62KsJ/l4VnW14i5lTsdtakjHmvGc8ilskvNZBVzhnqL7mgZBqJ91zbhx8/yTF+zvxriWt9P2+4OztHrjli2V+9M1GAW8J8cFH0dEA=" ], "ARC-Message-Signature": [ "i=2; a=rsa-sha256; d=ozlabs.org; s=201707;\n\tt=1777377712; c=relaxed/relaxed;\n\tbh=tuMlAllb2gii9Wov5/+jxIYfj8iGsOrvw3kr3rxAS8Y=;\n\th=From:To:Subject:Date:Mime-Version:Content-Type:Cc:Message-Id;\n b=VzZRlVFzCSui99/jk2jZXOdz+Qe7PLQTbsTUDfeEmR4nWQC4s0aDb2n1TpQlxZVCglbsXKFl0JOsci5DHIwrv1C/p0fmaGg5GyVRrxnvOovjX00tfH7ieCx2bFQyaF9VOEBF+I+MDp+9gQpiNF6oVltW4VR4rnxhmm2Ug9jnjug5aTPAt9msgnIXak/Cy05hLG5y+tF00YIhe4LP+l9zJuZSN1P2lQvXaw9+mWINVKtBfuJlvpJlIaQb5pTBo6xINKD7mts05Qp20zwxMM4+bWXhVp+fDuTNVzylyLFyn0TtK8Et5qoHffrqab+D7JrI2W59Ly4FgmC+U0uFCydYRg==", "i=1; a=rsa-sha256; d=subspace.kernel.org;\n\ts=arc-20240116; t=1777377666; c=relaxed/simple;\n\tbh=MherJ4arDpIzWHxnhFrg1Pwje4BFbZI3nQCsDzilLP4=;\n\th=From:To:Subject:Date:Mime-Version:Content-Type:Cc:Message-Id;\n b=MW4v24Rh1SG5EkOMLs70jdlkiOWELwxgaq0Kxx9kGZlpwNmjDzkTbhqhZEn2qp1ntfTo42Whu02bkL4YpV0mskeS6zh6Ly/zxTIvBCut1bNVDlVR81zg0y4Z0QOy+smihFfLud68D7PPYT4nW+3I1G1bhLU2neetSthmTdRdQR0=" ], "ARC-Authentication-Results": [ "i=2; gandalf.ozlabs.org;\n dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;\n dkim=pass (2048-bit key;\n unprotected) header.d=bytedance.com header.i=@bytedance.com\n header.a=rsa-sha256 header.s=2212171451 header.b=UCKZORJz; dkim-atps=neutral;\n spf=pass (client-ip=2600:3c09:e001:a7::12fc:5321; helo=sto.lore.kernel.org;\n envelope-from=linux-ext4+bounces-16171-patchwork-incoming=ozlabs.org@vger.kernel.org;\n receiver=ozlabs.org) smtp.mailfrom=vger.kernel.org", "i=1; smtp.subspace.kernel.org;\n dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;\n spf=pass smtp.mailfrom=bytedance.com;\n dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com\n header.b=UCKZORJz; arc=none smtp.client-ip=101.45.255.114" ], "DKIM-Signature": "v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;\n s=2212171451; d=bytedance.com; t=1777376933; h=from:subject:\n mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:\n mime-version:in-reply-to:message-id;\n bh=tuMlAllb2gii9Wov5/+jxIYfj8iGsOrvw3kr3rxAS8Y=;\n b=UCKZORJzEDaLuorc5PF0nimNJqxFnd9rL75qP1ogJpIPyLd9/emQ1c8+8soSy5n/Jngaiq\n LJsJ5Yf9Z2IvS6aVALpo4HNO9/S0LttlCSoXrBsaixh5yf3meHZLAeM1iKeJo5Z/e+E/Lh\n pwV0vkcm0g+NBNvtHtBW7s9iop6htkq92EYmP/u1GH66vdKH/IBmUjbY9N1zhVBFMXUQEw\n 1doEwl2bPvYWlm8I85ZWELsyaLmCNxTgtzJDvcZz0GS9KXfGHDsvtYHHvhMS7BnHd4gH1C\n t1ZqRD9185fFU9FvYKUxtdcugnVpz+uwQ7vAvmqjE24pc5QcNm4bglybK5tw9A==", "From": "\"Fengnan Chang\" <changfengnan@bytedance.com>", "X-Mailer": "git-send-email 2.39.5 (Apple Git-154)", "Content-Transfer-Encoding": "7bit", "To": "<brauner@kernel.org>, <djwong@kernel.org>, <hch@infradead.org>,\n\t<ojaswin@linux.ibm.com>, <dgc@kernel.org>, <linux-xfs@vger.kernel.org>,\n\t<linux-fsdevel@vger.kernel.org>, <linux-ext4@vger.kernel.org>,\n\t<linux-kernel@vger.kernel.org>, <lidiangang@bytedance.com>", "X-Original-From": "Fengnan Chang <changfengnan@bytedance.com>", "X-Lms-Return-Path": "\n <lba+269f09ea3+16e0c1+vger.kernel.org+changfengnan@bytedance.com>", "Subject": "[PATCH v2] iomap: add simple read path for small direct I/O", "Date": "Tue, 28 Apr 2026 19:47:30 +0800", "Precedence": "bulk", "X-Mailing-List": "linux-ext4@vger.kernel.org", "List-Id": "<linux-ext4.vger.kernel.org>", "List-Subscribe": "<mailto:linux-ext4+subscribe@vger.kernel.org>", "List-Unsubscribe": "<mailto:linux-ext4+unsubscribe@vger.kernel.org>", "Mime-Version": "1.0", "Content-Type": "text/plain; charset=UTF-8", "Cc": "\"Fengnan Chang\" <changfengnan@bytedance.com>", "Message-Id": "<20260428114730.14384-1-changfengnan@bytedance.com>", "X-Spam-Status": "No, score=-1.2 required=5.0 tests=ARC_SIGNED,ARC_VALID,\n\tDKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DMARC_PASS,\n\tHEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,SPF_HELO_NONE,\n\tSPF_PASS autolearn=disabled version=4.0.1", "X-Spam-Checker-Version": "SpamAssassin 4.0.1 (2024-03-25) on gandalf.ozlabs.org" }, "content": "When running 4K random read workloads on high-performance Gen5 NVMe\nSSDs, the software overhead in the iomap direct I/O path\n(__iomap_dio_rw) becomes a significant bottleneck.\n\nUsing io_uring with poll mode for a 4K randread test on a raw block\ndevice:\ntaskset -c 30 ./t/io_uring -p1 -d512 -b4096 -s32 -c32 -F1 -B1 -R1 -X1\n-n1 -P1 /dev/nvme10n1\nResult: ~3.2M IOPS\n\nRunning the exact same workload on ext4 and XFS:\ntaskset -c 30 ./t/io_uring -p1 -d512 -b4096 -s32 -c32 -F1 -B1 -R1 -X1\n-n1 -P1 /mnt/testfile\nResult: ~1.84M IOPS\n\nProfiling the ext4 workload reveals that a significant portion of CPU\ntime is spent on memory allocation and the iomap state machine\niteration:\n 5.33% [kernel] [k] __iomap_dio_rw\n 3.26% [kernel] [k] iomap_iter\n 2.37% [kernel] [k] iomap_dio_bio_iter\n 2.35% [kernel] [k] kfree\n 1.33% [kernel] [k] iomap_dio_complete\n\nIntroduce simple reads to reduce the overhead of iomap, simple read path\nis triggered when the request satisfies:\n- I/O size is <= inode blocksize (fits in a single block, no splits).\n- No custom `iomap_dio_ops` (dops) registered by the filesystem.\n\nAfter this optimization, the heavy generic functions disappear from the\nprofile, replaced by a single streamlined execution path:\n 4.83% [kernel] [k] iomap_dio_simple_read\n\nWith this patch, 4K random read IOPS on ext4 increases from 1.84M to\n2.19M in the original single-core io_uring poll-mode workload.\n\nBelow are the test results using fio:\n\n fs workload qd simple=0 simple=1 gain\n ext4 libaio 1 18,738 18,761 +0.12%\n ext4 libaio 128 455,383 471,473 +3.53%\n ext4 libaio 256 453,273 468,555 +3.37%\n ext4 libaio 512 447,320 469,036 +4.85%\n ext4 io_uring 1 18,798 18,824 +0.14%\n ext4 io_uring 128 503,834 528,353 +4.87%\n ext4 io_uring 256 503,635 527,617 +4.76%\n ext4 io_uring 512 501,802 527,882 +5.20%\n ext4 io_uring_poll 1 19,246 19,270 +0.12%\n ext4 io_uring_poll 128 1,463,343 1,565,019 +6.95%\n ext4 io_uring_poll 256 1,651,112 1,888,182 +14.36%\n ext4 io_uring_poll 512 1,632,641 1,893,259 +15.96%\n xfs libaio 1 18,715 18,734 +0.10%\n xfs libaio 128 452,974 473,459 +4.52%\n xfs libaio 256 454,435 470,855 +3.61%\n xfs libaio 512 456,796 473,047 +3.56%\n xfs io_uring 1 18,755 18,795 +0.21%\n xfs io_uring 128 509,459 534,819 +4.98%\n xfs io_uring 256 509,853 536,051 +5.14%\n xfs io_uring 512 507,926 533,558 +5.05%\n xfs io_uring_poll 1 19,230 19,269 +0.20%\n xfs io_uring_poll 128 1,467,398 1,567,840 +6.84%\n xfs io_uring_poll 256 1,636,852 1,878,917 +14.79%\n xfs io_uring_poll 512 1,639,495 1,874,813 +14.35%\n\nAssisted-by: Gemini:gemini-3.1-pro-preview\nAssisted-by: Codex:gpt-5-5\nSigned-off-by: Fengnan Chang <changfengnan@bytedance.com>\n---\n fs/iomap/direct-io.c | 382 +++++++++++++++++++++++++++++++++++++++++--\n 1 file changed, 371 insertions(+), 11 deletions(-)", "diff": "diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c\nindex e911daedff65a..807d8c628a464 100644\n--- a/fs/iomap/direct-io.c\n+++ b/fs/iomap/direct-io.c\n@@ -9,6 +9,9 @@\n #include <linux/iomap.h>\n #include <linux/task_io_accounting_ops.h>\n #include <linux/fserror.h>\n+#include <linux/kobject.h>\n+#include <linux/sysfs.h>\n+#include <linux/init.h>\n #include \"internal.h\"\n #include \"trace.h\"\n \n@@ -236,20 +239,26 @@ static void iomap_dio_done(struct iomap_dio *dio)\n \tiomap_dio_complete_work(&dio->aio.work);\n }\n \n-static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)\n+static inline void iomap_dio_bio_release_pages(struct bio *bio,\n+\t\tunsigned int dio_flags, bool error)\n {\n-\tstruct iomap_dio *dio = bio->bi_private;\n-\n-\tif (dio->flags & IOMAP_DIO_BOUNCE) {\n-\t\tbio_iov_iter_unbounce(bio, !!dio->error,\n-\t\t\t\tdio->flags & IOMAP_DIO_USER_BACKED);\n+\tif (dio_flags & IOMAP_DIO_BOUNCE) {\n+\t\tbio_iov_iter_unbounce(bio, error,\n+\t\t\t\tdio_flags & IOMAP_DIO_USER_BACKED);\n \t\tbio_put(bio);\n-\t} else if (dio->flags & IOMAP_DIO_USER_BACKED) {\n+\t} else if (dio_flags & IOMAP_DIO_USER_BACKED) {\n \t\tbio_check_pages_dirty(bio);\n \t} else {\n \t\tbio_release_pages(bio, false);\n \t\tbio_put(bio);\n \t}\n+}\n+\n+static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)\n+{\n+\tstruct iomap_dio *dio = bio->bi_private;\n+\n+\tiomap_dio_bio_release_pages(bio, dio->flags, !!dio->error);\n \n \t/* Do not touch bio below, we just gave up our reference. */\n \n@@ -387,6 +396,14 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,\n \treturn ret;\n }\n \n+static inline unsigned int iomap_dio_alignment(struct inode *inode,\n+\t\tstruct block_device *bdev, unsigned int dio_flags)\n+{\n+\tif (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)\n+\t\treturn i_blocksize(inode);\n+\treturn bdev_logical_block_size(bdev);\n+}\n+\n static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)\n {\n \tconst struct iomap *iomap = &iter->iomap;\n@@ -405,10 +422,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)\n \t * File systems that write out of place and always allocate new blocks\n \t * need each bio to be block aligned as that's the unit of allocation.\n \t */\n-\tif (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED)\n-\t\talignment = fs_block_size;\n-\telse\n-\t\talignment = bdev_logical_block_size(iomap->bdev);\n+\talignment = iomap_dio_alignment(inode, iomap->bdev, dio->flags);\n \n \tif ((pos | length) & (alignment - 1))\n \t\treturn -EINVAL;\n@@ -880,12 +894,350 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,\n }\n EXPORT_SYMBOL_GPL(__iomap_dio_rw);\n \n+struct iomap_dio_simple_read {\n+\tstruct kiocb\t\t*iocb;\n+\tsize_t\t\t\tsize;\n+\tunsigned int\t\tdio_flags;\n+\tatomic_t\t\tstate;\n+\tunion {\n+\t\tstruct task_struct\t*waiter;\n+\t\tstruct work_struct\twork;\n+\t};\n+\t/*\n+\t * Align @bio to a cacheline boundary so that, combined with the\n+\t * front_pad passed to bioset_init(), the bio sits at the start of\n+\t * a cacheline in memory returned by the (HWCACHE-aligned) bio\n+\t * slab. This keeps the hot fields block layer touches on submit\n+\t * and completion (bi_iter, bi_status, ...) within a single line.\n+\t */\n+\tstruct bio\tbio ____cacheline_aligned_in_smp;\n+};\n+\n+static struct bio_set iomap_dio_simple_read_pool;\n+\n+/*\n+ * In the async simple read path, we need to prevent bio_endio() from\n+ * triggering iocb->ki_complete() before the submitter has returned\n+ * -EIOCBQUEUED. Otherwise, the caller might free the iocb concurrently.\n+ *\n+ * We use a three-state rendezvous to synchronize the submitter and end_io:\n+ *\n+ * IOMAP_DIO_SIMPLE_SUBMITTING: Initial state set before submitting the bio.\n+ *\n+ * IOMAP_DIO_SIMPLE_QUEUED: The submitter has safely queued the IO and will\n+ * return -EIOCBQUEUED. If end_io sees this state, it takes over and calls\n+ * ki_complete().\n+ *\n+ * IOMAP_DIO_SIMPLE_DONE: end_io fired before the submitter finished the\n+ * submit path. end_io sets this state and does nothing else. The submitter\n+ * will see this state and handle the completion synchronously (bypassing\n+ * ki_complete() and returning the actual result).\n+ */\n+enum {\n+\tIOMAP_DIO_SIMPLE_SUBMITTING = 0,\n+\tIOMAP_DIO_SIMPLE_QUEUED,\n+\tIOMAP_DIO_SIMPLE_DONE,\n+};\n+\n+static ssize_t iomap_dio_simple_read_finish(struct kiocb *iocb,\n+\t\tstruct bio *bio, ssize_t ret)\n+{\n+\tstruct inode *inode = file_inode(iocb->ki_filp);\n+\tstruct iomap_dio_simple_read *sr = bio->bi_private;\n+\n+\tif (likely(!ret)) {\n+\t\tret = sr->size;\n+\t\tiocb->ki_pos += ret;\n+\t} else {\n+\t\tfserror_report_io(inode, FSERR_DIRECTIO_READ, iocb->ki_pos,\n+\t\t\t\t sr->size, ret, GFP_NOFS);\n+\t}\n+\n+\tiomap_dio_bio_release_pages(bio, sr->dio_flags, ret < 0);\n+\n+\treturn ret;\n+}\n+\n+static ssize_t iomap_dio_simple_read_complete(struct kiocb *iocb,\n+\t\tstruct bio *bio)\n+{\n+\tstruct inode *inode = file_inode(iocb->ki_filp);\n+\tssize_t ret;\n+\n+\tWRITE_ONCE(iocb->private, NULL);\n+\n+\tret = iomap_dio_simple_read_finish(iocb, bio,\n+\t\t\tblk_status_to_errno(bio->bi_status));\n+\n+\tinode_dio_end(inode);\n+\ttrace_iomap_dio_complete(iocb, ret < 0 ? ret : 0, ret > 0 ? ret : 0);\n+\treturn ret;\n+}\n+\n+static void iomap_dio_simple_read_complete_work(struct work_struct *work)\n+{\n+\tstruct iomap_dio_simple_read *sr =\n+\t\tcontainer_of(work, struct iomap_dio_simple_read, work);\n+\tstruct kiocb *iocb = sr->iocb;\n+\tssize_t ret;\n+\n+\tret = iomap_dio_simple_read_complete(iocb, &sr->bio);\n+\tiocb->ki_complete(iocb, ret);\n+}\n+\n+static void iomap_dio_simple_read_async_done(struct iomap_dio_simple_read *sr)\n+{\n+\tstruct kiocb *iocb = sr->iocb;\n+\n+\tif (unlikely(sr->bio.bi_status)) {\n+\t\tstruct inode *inode = file_inode(iocb->ki_filp);\n+\n+\t\tINIT_WORK(&sr->work, iomap_dio_simple_read_complete_work);\n+\t\tqueue_work(inode->i_sb->s_dio_done_wq, &sr->work);\n+\t\treturn;\n+\t}\n+\n+\tiomap_dio_simple_read_complete_work(&sr->work);\n+}\n+\n+static void iomap_dio_simple_read_end_io(struct bio *bio)\n+{\n+\tstruct iomap_dio_simple_read *sr = bio->bi_private;\n+\n+\tif (sr->waiter) {\n+\t\tstruct task_struct *waiter = sr->waiter;\n+\n+\t\tWRITE_ONCE(sr->waiter, NULL);\n+\t\tblk_wake_io_task(waiter);\n+\t\treturn;\n+\t}\n+\n+\tif (likely(atomic_read(&sr->state) == IOMAP_DIO_SIMPLE_QUEUED) ||\n+\t atomic_cmpxchg(&sr->state, IOMAP_DIO_SIMPLE_SUBMITTING,\n+\t\t\t IOMAP_DIO_SIMPLE_DONE) == IOMAP_DIO_SIMPLE_QUEUED)\n+\t\tiomap_dio_simple_read_async_done(sr);\n+}\n+\n+static inline bool iomap_dio_simple_read_supported(struct kiocb *iocb,\n+\t\tstruct iov_iter *iter, unsigned int dio_flags)\n+{\n+\tstruct inode *inode = file_inode(iocb->ki_filp);\n+\tsize_t count = iov_iter_count(iter);\n+\n+\tif (iov_iter_rw(iter) != READ)\n+\t\treturn false;\n+\t/*\n+\t * Simple read is an optimization for small IO. Filter out large IO\n+\t * early as it's the most common case to fail for typical direct IO\n+\t * workloads.\n+\t */\n+\tif (count > inode->i_sb->s_blocksize)\n+\t\treturn false;\n+\tif (dio_flags & (IOMAP_DIO_FORCE_WAIT | IOMAP_DIO_PARTIAL))\n+\t\treturn false;\n+\tif (iocb->ki_pos + count > i_size_read(inode))\n+\t\treturn false;\n+\n+\treturn true;\n+}\n+\n+static ssize_t iomap_dio_simple_read(struct kiocb *iocb,\n+\t\tstruct iov_iter *iter, const struct iomap_ops *ops,\n+\t\tvoid *private, unsigned int dio_flags)\n+{\n+\tstruct inode *inode = file_inode(iocb->ki_filp);\n+\tsize_t count = iov_iter_count(iter);\n+\tint nr_pages;\n+\tstruct iomap_dio_simple_read *sr;\n+\tunsigned int alignment;\n+\tstruct iomap_iter iomi = {\n+\t\t.inode\t\t= inode,\n+\t\t.pos\t\t= iocb->ki_pos,\n+\t\t.len\t\t= count,\n+\t\t.flags\t\t= IOMAP_DIRECT,\n+\t\t.private\t= private,\n+\t};\n+\tstruct bio *bio;\n+\tbool wait_for_completion = is_sync_kiocb(iocb);\n+\tssize_t ret;\n+\n+\tif (dio_flags & IOMAP_DIO_BOUNCE)\n+\t\tnr_pages = bio_iov_bounce_nr_vecs(iter, REQ_OP_READ);\n+\telse\n+\t\tnr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);\n+\n+\tif (iocb->ki_flags & IOCB_NOWAIT)\n+\t\tiomi.flags |= IOMAP_NOWAIT;\n+\n+\tret = kiocb_write_and_wait(iocb, count);\n+\tif (ret)\n+\t\treturn ret;\n+\n+\tinode_dio_begin(inode);\n+\n+\tret = ops->iomap_begin(inode, iomi.pos, count, iomi.flags,\n+\t\t\t &iomi.iomap, &iomi.srcmap);\n+\tif (ret) {\n+\t\tinode_dio_end(inode);\n+\t\treturn ret;\n+\t}\n+\n+\tif (iomi.iomap.type != IOMAP_MAPPED ||\n+\t iomi.iomap.offset > iomi.pos ||\n+\t iomi.iomap.offset + iomi.iomap.length < iomi.pos + count) {\n+\t\tret = -ENOTBLK;\n+\t\tgoto out_iomap_end;\n+\t}\n+\n+\talignment = iomap_dio_alignment(inode, iomi.iomap.bdev, dio_flags);\n+\tif ((iomi.pos | count) & (alignment - 1)) {\n+\t\tret = -EINVAL;\n+\t\tgoto out_iomap_end;\n+\t}\n+\n+\tif (unlikely(!inode->i_sb->s_dio_done_wq)) {\n+\t\tret = sb_init_dio_done_wq(inode->i_sb);\n+\t\tif (ret < 0)\n+\t\t\tgoto out_iomap_end;\n+\t}\n+\n+\ttrace_iomap_dio_rw_begin(iocb, iter, dio_flags, 0);\n+\n+\tif (user_backed_iter(iter))\n+\t\tdio_flags |= IOMAP_DIO_USER_BACKED;\n+\n+\tbio = bio_alloc_bioset(iomi.iomap.bdev, nr_pages,\n+\t\t\t REQ_OP_READ | REQ_SYNC | REQ_IDLE,\n+\t\t\t GFP_KERNEL, &iomap_dio_simple_read_pool);\n+\tsr = container_of(bio, struct iomap_dio_simple_read, bio);\n+\n+\tfscrypt_set_bio_crypt_ctx(bio, inode, iomi.pos >> inode->i_blkbits,\n+\t\t\t\t GFP_KERNEL);\n+\tsr->iocb = iocb;\n+\tsr->dio_flags = dio_flags;\n+\n+\tbio->bi_iter.bi_sector = iomap_sector(&iomi.iomap, iomi.pos);\n+\tbio->bi_ioprio = iocb->ki_ioprio;\n+\tbio->bi_private = sr;\n+\tbio->bi_end_io = iomap_dio_simple_read_end_io;\n+\n+\tif (dio_flags & IOMAP_DIO_BOUNCE)\n+\t\tret = bio_iov_iter_bounce(bio, iter);\n+\telse\n+\t\tret = bio_iov_iter_get_pages(bio, iter, alignment - 1);\n+\tif (unlikely(ret))\n+\t\tgoto out_bio_put;\n+\n+\tif (bio->bi_iter.bi_size != count) {\n+\t\tiov_iter_revert(iter, bio->bi_iter.bi_size);\n+\t\tret = -ENOTBLK;\n+\t\tgoto out_bio_release_pages;\n+\t}\n+\n+\tsr->size = bio->bi_iter.bi_size;\n+\n+\tif ((dio_flags & IOMAP_DIO_USER_BACKED) &&\n+\t !(dio_flags & IOMAP_DIO_BOUNCE))\n+\t\tbio_set_pages_dirty(bio);\n+\n+\tif (iocb->ki_flags & IOCB_NOWAIT)\n+\t\tbio->bi_opf |= REQ_NOWAIT;\n+\tif ((iocb->ki_flags & IOCB_HIPRI) && !wait_for_completion) {\n+\t\tbio->bi_opf |= REQ_POLLED;\n+\t\tbio_set_polled(bio, iocb);\n+\t\tWRITE_ONCE(iocb->private, bio);\n+\t}\n+\n+\tif (wait_for_completion) {\n+\t\tsr->waiter = current;\n+\t\tblk_crypto_submit_bio(bio);\n+\t} else {\n+\t\tatomic_set(&sr->state, IOMAP_DIO_SIMPLE_SUBMITTING);\n+\t\tsr->waiter = NULL;\n+\t\tblk_crypto_submit_bio(bio);\n+\t\tret = -EIOCBQUEUED;\n+\t}\n+\n+\tif (ops->iomap_end)\n+\t\tops->iomap_end(inode, iomi.pos, count, count, iomi.flags,\n+\t\t\t &iomi.iomap);\n+\n+\tif (wait_for_completion) {\n+\t\tfor (;;) {\n+\t\t\tset_current_state(TASK_UNINTERRUPTIBLE);\n+\t\t\tif (!READ_ONCE(sr->waiter))\n+\t\t\t\tbreak;\n+\t\t\tblk_io_schedule();\n+\t\t}\n+\t\t__set_current_state(TASK_RUNNING);\n+\n+\t\tret = iomap_dio_simple_read_finish(iocb, bio,\n+\t\t\t\tblk_status_to_errno(bio->bi_status));\n+\t\tinode_dio_end(inode);\n+\t\ttrace_iomap_dio_complete(iocb, ret < 0 ? ret : 0,\n+\t\t\t\t\t ret > 0 ? ret : 0);\n+\t} else if (atomic_cmpxchg(&sr->state, IOMAP_DIO_SIMPLE_SUBMITTING,\n+\t\t\t\t IOMAP_DIO_SIMPLE_QUEUED) ==\n+\t\t IOMAP_DIO_SIMPLE_DONE) {\n+\t\tret = iomap_dio_simple_read_complete(iocb, bio);\n+\t} else {\n+\t\ttrace_iomap_dio_rw_queued(inode, iomi.pos, count);\n+\t}\n+\n+\treturn ret;\n+\n+out_bio_release_pages:\n+\tif (dio_flags & IOMAP_DIO_BOUNCE)\n+\t\tbio_iov_iter_unbounce(bio, true, false);\n+\telse\n+\t\tbio_release_pages(bio, false);\n+out_bio_put:\n+\tbio_put(bio);\n+out_iomap_end:\n+\tif (ops->iomap_end)\n+\t\tops->iomap_end(inode, iomi.pos, count, 0, iomi.flags,\n+\t\t\t &iomi.iomap);\n+\tinode_dio_end(inode);\n+\treturn ret;\n+}\n+\n ssize_t\n iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,\n \t\tconst struct iomap_ops *ops, const struct iomap_dio_ops *dops,\n \t\tunsigned int dio_flags, void *private, size_t done_before)\n {\n \tstruct iomap_dio *dio;\n+\tssize_t ret;\n+\n+\t/*\n+\t * Fast path for small, block-aligned reads that map to a single\n+\t * contiguous on-disk extent.\n+\t *\n+\t * @dops must be NULL: a non-NULL @dops means the caller wants its\n+\t * ->end_io / ->submit_io hooks invoked, and in particular wants its\n+\t * bios to be allocated from the filesystem-private @dops->bio_set\n+\t * (whose front_pad sizes a filesystem-private wrapper around the\n+\t * bio). The fast path instead allocates from the shared\n+\t * iomap_dio_simple_read_pool, whose front_pad matches\n+\t * struct iomap_dio_simple_read; the two wrappers are not\n+\t * interchangeable, so we must fall back to __iomap_dio_rw() in\n+\t * that case.\n+\t *\n+\t * @done_before must be zero: a non-zero caller-accumulated residual\n+\t * cannot be carried through a single-bio inline completion.\n+\t *\n+\t * -ENOTBLK is the private sentinel returned by iomap_dio_simple_read()\n+\t * when it decides the request does not fit the fast path.\n+\t * In that case we proceed to the generic __iomap_dio_rw() slow\n+\t * path. Any other errno is a real result and is propagated as-is,\n+\t * in particular -EAGAIN for IOCB_NOWAIT must reach the caller.\n+\t */\n+\tif (!dops && !done_before &&\n+\t iomap_dio_simple_read_supported(iocb, iter, dio_flags)) {\n+\t\tret = iomap_dio_simple_read(iocb, iter, ops, private, dio_flags);\n+\t\tif (ret != -ENOTBLK)\n+\t\t\treturn ret;\n+\t}\n \n \tdio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,\n \t\t\t done_before);\n@@ -894,3 +1246,11 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,\n \treturn iomap_dio_complete(dio);\n }\n EXPORT_SYMBOL_GPL(iomap_dio_rw);\n+\n+static int __init iomap_dio_init(void)\n+{\n+\treturn bioset_init(&iomap_dio_simple_read_pool, 4,\n+\t\t\t offsetof(struct iomap_dio_simple_read, bio),\n+\t\t\t BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE);\n+}\n+fs_initcall(iomap_dio_init);\n", "prefixes": [ "v2" ] }