get:
Show a patch.

patch:
Partially update a patch (only the fields supplied in the request are changed).

put:
Fully update a patch (all writable fields are replaced).

GET /api/patches/2216373/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 2216373,
    "url": "http://patchwork.ozlabs.org/api/patches/2216373/?format=api",
    "web_url": "http://patchwork.ozlabs.org/project/linux-cifs-client/patch/20260326104544.509518-27-dhowells@redhat.com/",
    "project": {
        "id": 12,
        "url": "http://patchwork.ozlabs.org/api/projects/12/?format=api",
        "name": "Linux CIFS Client",
        "link_name": "linux-cifs-client",
        "list_id": "linux-cifs.vger.kernel.org",
        "list_email": "linux-cifs@vger.kernel.org",
        "web_url": "",
        "scm_url": "",
        "webscm_url": "",
        "list_archive_url": "",
        "list_archive_url_format": "",
        "commit_url_format": ""
    },
    "msgid": "<20260326104544.509518-27-dhowells@redhat.com>",
    "list_archive_url": null,
    "date": "2026-03-26T10:45:41",
    "name": "[26/26] netfs: Combine prepare and issue ops and grab the buffers on request",
    "commit_ref": null,
    "pull_url": null,
    "state": "new",
    "archived": false,
    "hash": "fcf2ba57466c3ba7dd40d9a8efc46ed3f5dfea0a",
    "submitter": {
        "id": 59,
        "url": "http://patchwork.ozlabs.org/api/people/59/?format=api",
        "name": "David Howells",
        "email": "dhowells@redhat.com"
    },
    "delegate": null,
    "mbox": "http://patchwork.ozlabs.org/project/linux-cifs-client/patch/20260326104544.509518-27-dhowells@redhat.com/mbox/",
    "series": [
        {
            "id": 497565,
            "url": "http://patchwork.ozlabs.org/api/series/497565/?format=api",
            "web_url": "http://patchwork.ozlabs.org/project/linux-cifs-client/list/?series=497565",
            "date": "2026-03-26T10:45:15",
            "name": "netfs: Keep track of folios in a segmented bio_vec[] chain",
            "version": 1,
            "mbox": "http://patchwork.ozlabs.org/series/497565/mbox/"
        }
    ],
    "comments": "http://patchwork.ozlabs.org/api/patches/2216373/comments/",
    "check": "pending",
    "checks": "http://patchwork.ozlabs.org/api/patches/2216373/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "\n <linux-cifs+bounces-10549-incoming=patchwork.ozlabs.org@vger.kernel.org>",
        "X-Original-To": [
            "incoming@patchwork.ozlabs.org",
            "linux-cifs@vger.kernel.org"
        ],
        "Delivered-To": "patchwork-incoming@legolas.ozlabs.org",
        "Authentication-Results": [
            "legolas.ozlabs.org;\n\tdkim=pass (1024-bit key;\n unprotected) header.d=redhat.com header.i=@redhat.com header.a=rsa-sha256\n header.s=mimecast20190719 header.b=L52BRxyi;\n\tdkim-atps=neutral",
            "legolas.ozlabs.org;\n spf=pass (sender SPF authorized) smtp.mailfrom=vger.kernel.org\n (client-ip=2600:3c15:e001:75::12fc:5321; helo=sin.lore.kernel.org;\n envelope-from=linux-cifs+bounces-10549-incoming=patchwork.ozlabs.org@vger.kernel.org;\n receiver=patchwork.ozlabs.org)",
            "smtp.subspace.kernel.org;\n\tdkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com\n header.b=\"L52BRxyi\"",
            "smtp.subspace.kernel.org;\n arc=none smtp.client-ip=170.10.133.124",
            "smtp.subspace.kernel.org;\n dmarc=pass (p=quarantine dis=none) header.from=redhat.com",
            "smtp.subspace.kernel.org;\n spf=pass smtp.mailfrom=redhat.com"
        ],
        "Received": [
            "from sin.lore.kernel.org (sin.lore.kernel.org\n [IPv6:2600:3c15:e001:75::12fc:5321])\n\t(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)\n\t key-exchange x25519)\n\t(No client certificate requested)\n\tby legolas.ozlabs.org (Postfix) with ESMTPS id 4fhLhl3qjVz1y1G\n\tfor <incoming@patchwork.ozlabs.org>; Thu, 26 Mar 2026 22:12:23 +1100 (AEDT)",
            "from smtp.subspace.kernel.org (conduit.subspace.kernel.org\n [100.90.174.1])\n\tby sin.lore.kernel.org (Postfix) with ESMTP id 4019E310AE30\n\tfor <incoming@patchwork.ozlabs.org>; Thu, 26 Mar 2026 10:55:55 +0000 (UTC)",
            "from localhost.localdomain (localhost.localdomain [127.0.0.1])\n\tby smtp.subspace.kernel.org (Postfix) with ESMTP id 91D1533F8C1;\n\tThu, 26 Mar 2026 10:50:10 +0000 (UTC)",
            "from us-smtp-delivery-124.mimecast.com\n (us-smtp-delivery-124.mimecast.com [170.10.133.124])\n\t(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))\n\t(No client certificate requested)\n\tby smtp.subspace.kernel.org (Postfix) with ESMTPS id 65FD53CEB95\n\tfor <linux-cifs@vger.kernel.org>; Thu, 26 Mar 2026 10:49:58 +0000 (UTC)",
            "from mx-prod-mc-01.mail-002.prod.us-west-2.aws.redhat.com\n (ec2-54-186-198-63.us-west-2.compute.amazonaws.com [54.186.198.63]) by\n relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3,\n cipher=TLS_AES_256_GCM_SHA384) id us-mta-596-89SjK45AMUe870Heb8Itmw-1; Thu,\n 26 Mar 2026 06:49:53 -0400",
            "from mx-prod-int-03.mail-002.prod.us-west-2.aws.redhat.com\n (mx-prod-int-03.mail-002.prod.us-west-2.aws.redhat.com [10.30.177.12])\n\t(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)\n\t key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest\n SHA256)\n\t(No client certificate requested)\n\tby mx-prod-mc-01.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTPS\n id 1C15E195608B;\n\tThu, 26 Mar 2026 10:49:51 +0000 (UTC)",
            "from warthog.procyon.org.com (unknown [10.44.33.121])\n\tby mx-prod-int-03.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTP\n id 0BBBE19560B1;\n\tThu, 26 Mar 2026 10:49:43 +0000 (UTC)"
        ],
        "ARC-Seal": "i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;\n\tt=1774522210; cv=none;\n b=jIIeQ3YLdkFMb0d82oq/EVzUChz9oJQ44jFjxJgf2GpVuatr9Iny6qUDj2ujeSuigXMuaJVA1MGygEakc0d7fZPs/YW/c3eivZUthLzkPvvWeFRcpxoH+7D5c91Vu/5c0KwKjhh1dJ8dnqGXzl2TEMaY49Kvs9jVcKhVx5p4mq8=",
        "ARC-Message-Signature": "i=1; a=rsa-sha256; d=subspace.kernel.org;\n\ts=arc-20240116; t=1774522210; c=relaxed/simple;\n\tbh=fTHmqyJdcoTyTInHKFwKiOwCExhN1eOyZ8RhtFwOg04=;\n\th=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:\n\t MIME-Version;\n b=H6EeBfIgpApaz4isBLjX5n1l9QYaIaZZ6h+o99rAVGw3D01oSQNGS7t1+/B/Q3pxOW26ijRpJrz0eVBdNLoIIenb2e2sxTi6c74DK0RBvWr62fHkx0i6aVyY0HOJTGdiszxIkjuAWfsfM++8z154HBggqaH+BOspUzLreefjF7E=",
        "ARC-Authentication-Results": "i=1; smtp.subspace.kernel.org;\n dmarc=pass (p=quarantine dis=none) header.from=redhat.com;\n spf=pass smtp.mailfrom=redhat.com;\n dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com\n header.b=L52BRxyi; arc=none smtp.client-ip=170.10.133.124",
        "DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com;\n\ts=mimecast20190719; t=1774522197;\n\th=from:from:reply-to:subject:subject:date:date:message-id:message-id:\n\t to:to:cc:cc:mime-version:mime-version:\n\t content-transfer-encoding:content-transfer-encoding:\n\t in-reply-to:in-reply-to:references:references;\n\tbh=c+VF65Wqti8gLaqX/quu1u0c3WXc6fWf3Wezs9ZTxDc=;\n\tb=L52BRxyidaIC2sLqFrtfify4oLXX7HeTeo28HdetpP4rZvz1m3iIRcwW85FYPWoF5BaCbW\n\tFs3TkAp4rjygG7/o9U2FZiPo9WyLBTgSBxnJpqzfiz6wu3BT906DsKn1IGnJO3E0GbnLEY\n\tsvuA3adaF0G/74djWh+ngR5x//TzXt0=",
        "X-MC-Unique": "89SjK45AMUe870Heb8Itmw-1",
        "X-Mimecast-MFC-AGG-ID": "89SjK45AMUe870Heb8Itmw_1774522191",
        "From": "David Howells <dhowells@redhat.com>",
        "To": "Christian Brauner <christian@brauner.io>,\n\tMatthew Wilcox <willy@infradead.org>,\n\tChristoph Hellwig <hch@infradead.org>",
        "Cc": "David Howells <dhowells@redhat.com>,\n\tPaulo Alcantara <pc@manguebit.com>,\n\tJens Axboe <axboe@kernel.dk>,\n\tLeon Romanovsky <leon@kernel.org>,\n\tSteve French <sfrench@samba.org>,\n\tChenXiaoSong <chenxiaosong@chenxiaosong.com>,\n\tMarc Dionne <marc.dionne@auristor.com>,\n\tEric Van Hensbergen <ericvh@kernel.org>,\n\tDominique Martinet <asmadeus@codewreck.org>,\n\tIlya Dryomov <idryomov@gmail.com>,\n\tTrond Myklebust <trondmy@kernel.org>,\n\tnetfs@lists.linux.dev,\n\tlinux-afs@lists.infradead.org,\n\tlinux-cifs@vger.kernel.org,\n\tlinux-nfs@vger.kernel.org,\n\tceph-devel@vger.kernel.org,\n\tv9fs@lists.linux.dev,\n\tlinux-erofs@lists.ozlabs.org,\n\tlinux-fsdevel@vger.kernel.org,\n\tlinux-kernel@vger.kernel.org,\n\tPaulo Alcantara <pc@manguebit.org>",
        "Subject": "[PATCH 26/26] netfs: Combine prepare and issue ops and grab the\n buffers on request",
        "Date": "Thu, 26 Mar 2026 10:45:41 +0000",
        "Message-ID": "<20260326104544.509518-27-dhowells@redhat.com>",
        "In-Reply-To": "<20260326104544.509518-1-dhowells@redhat.com>",
        "References": "<20260326104544.509518-1-dhowells@redhat.com>",
        "Precedence": "bulk",
        "X-Mailing-List": "linux-cifs@vger.kernel.org",
        "List-Id": "<linux-cifs.vger.kernel.org>",
        "List-Subscribe": "<mailto:linux-cifs+subscribe@vger.kernel.org>",
        "List-Unsubscribe": "<mailto:linux-cifs+unsubscribe@vger.kernel.org>",
        "MIME-Version": "1.0",
        "Content-Transfer-Encoding": "8bit",
        "X-Scanned-By": "MIMEDefang 3.0 on 10.30.177.12"
    },
    "content": "Modify the way subrequests are generated in netfslib to try and simplify\nthe code.  The issue, primarily, is in writeback: the code has to create\nmultiple streams of write requests to disparate targets with different\nproperties (e.g. server and fscache), where not every folio needs to go to\nevery target (e.g. data just read from the server may only need writing to\nthe cache).\n\nThe current model in writeback, at least, is to go carefully through every\nfolio, preparing a subrequest for each stream when it was detected that\npart of the current folio needed to go to that stream, and repeating this\nwithin and across contiguous folios; then to issue subrequests as they\nbecome full or hit boundaries after first setting up the buffer.  However,\nthis is quite difficult to follow - and makes it tricky to handle\ndiscontiguous folios in a request.\n\nThis is changed such that netfs now accumulates buffers and attaches them\nto each stream when they become valid for that stream, then flushes the\nstream when a limit or a boundary is hit.  The issuing code in netfs then\nloops around creating and issuing subrequests without calling a separate\nprepare stage (though a function is provided to get an estimate of when\nflushing should occur).  The filesystem (or cache) then gets to take a\nslice of the master bvec chain as its I/O buffer for each subrequest,\nincluding discontiguities if it can support a sparse/vectored RPC (as Ceph\ncan).\n\nSimilar-ish changes also apply to buffered read and unbuffered read and\nwrite, though in each of those cases there is only a single contiguous\nstream.  
Though for buffered read this consists of interwoven requests from\nmultiple sources (server or cache).\n\nTo this end, netfslib is changed in the following ways:\n\n (1) ->prepare_xxx(), buffer selection and ->issue_xxx() are now collapsed\n     together such that one ->issue_xxx() call is made with the subrequest\n     defined to the maximum extent; the filesystem/cache then reduces the\n     length of the subrequest and calls back to netfslib to grab a slice of\n     the buffer, which may reduce the subrequest further if a maximum\n     segment limit is set.  The filesystem/cache then dispatches the\n     operation.\n\n (2) Retry buffer tracking is added to the netfs_io_request struct.  This\n     is then selected by the subrequest retry counter being non-zero.\n\n (3) The use of iov_iter is pushed down to the filesystem.  Netfslib now\n     provides the filesystem with a bvecq holding the buffer rather than an\n     iov_iter.  The bvecq can be duplicated and headers/trailers attached\n     to hold protocol and several bvecqs can be linked together to create a\n     compound operation.\n\n (4) The ->issue_xxx() functions now return an error code that allows them\n     to return an error without having to terminate the subrequest.\n     Netfslib will handle the error immediately if it can but may request\n     termination and punt responsibility to the result collector.\n\n     ->issue_xxx() can return 0 if synchronously complete and -EIOCBQUEUED\n     if the operation will complete (or already has completed)\n     asynchronously.\n\n (5) During writeback, netfslib now builds up an accumulation of buffered\n     data before issuing writes on each stream (one server, one cache).  It\n     asks each stream for an estimate of how much data to accumulate before\n     it next generates subrequests on the stream.  
The filesystem or cache\n     is not required to use up all the data accumulated on a stream at that\n     time unless the end of the pagecache is hit.\n\n (6) During read-gaps, in which there are two gaps on either end of a dirty\n     streaming write page that need to be filled, a buffer is constructed\n     consisting of the two ends plus a sink page repeated to cover the\n     middle portion.  This is passed to the server as a single write.  For\n     something like Ceph, this should probably be done either as a\n     vectored/sparse read or as two separate reads (if different Ceph\n     objects are involved).\n\n (7) During unbuffered/DIO read/write, there is a single contiguous file\n     region to be read or written as a single stream.  The dispatching\n     function just creates subrequests and calls ->issue_xxx() repeatedly\n     to eat through the bufferage.\n\n (8) At the start of buffered read, the entire set of folios allocated by\n     VM readahead is loaded into a bvecq chain, rather than trying to do it\n     piecemeal as-needed.  As the pages were already added and locked by\n     the VM, this is slightly more efficient than loading piecemeal as only\n     a single iteration of the xarray is required.\n\n (9) During buffered read, there is a single contiguous file region, to\n     read as a single stream - however, this stream may be stitched\n     together from subrequests to multiple sources.  Which sources are used\n     where is now determined by querying the cache to find the next couple\n     of extents in which it has data; netfslib uses this to direct the\n     subrequests towards the appropriate sources.\n\n     Each subrequest is given the maximum length in the current extent and\n     then ->issue_read() is called.  
The filesystem then limits the size\n     and slices off a piece of the buffer for that extent.\n\n(10) Cachefiles now provides an estimation function that indicates the\n     standard maxima for doing DIO (MAX_RW_COUNT and BIO_MAX_VECS).\n\nNote that sparse cachefiles still rely on the backing filesystem for\ncontent mapping.  That will need to be addressed in a future patch and is\nnot trivial to fix.\n\nSigned-off-by: David Howells <dhowells@redhat.com>\ncc: Paulo Alcantara <pc@manguebit.org>\ncc: Matthew Wilcox <willy@infradead.org>\ncc: Christoph Hellwig <hch@infradead.org>\ncc: netfs@lists.linux.dev\ncc: linux-fsdevel@vger.kernel.org\n---\n fs/9p/vfs_addr.c                  |  49 +-\n fs/afs/dir.c                      |   8 +-\n fs/afs/file.c                     |  26 +-\n fs/afs/fsclient.c                 |   8 +-\n fs/afs/internal.h                 |   8 +-\n fs/afs/write.c                    |  35 +-\n fs/afs/yfsclient.c                |   6 +-\n fs/cachefiles/io.c                | 237 ++++++---\n fs/ceph/Kconfig                   |   1 +\n fs/ceph/addr.c                    | 127 ++---\n fs/netfs/Kconfig                  |   3 +\n fs/netfs/Makefile                 |   2 +-\n fs/netfs/buffered_read.c          | 236 +++++----\n fs/netfs/buffered_write.c         |  27 +-\n fs/netfs/direct_read.c            |  91 ++--\n fs/netfs/direct_write.c           | 145 +++---\n fs/netfs/fscache_io.c             |   6 -\n fs/netfs/internal.h               |  78 ++-\n fs/netfs/iterator.c               |   6 +-\n fs/netfs/misc.c                   |  33 +-\n fs/netfs/objects.c                |   7 +-\n fs/netfs/read_collect.c           |  33 +-\n fs/netfs/read_pgpriv2.c           | 116 +++--\n fs/netfs/read_retry.c             | 207 ++++----\n fs/netfs/read_single.c            | 150 +++---\n fs/netfs/write_collect.c          |  41 +-\n fs/netfs/write_issue.c            | 805 ++++++++++++++++++------------\n fs/netfs/write_retry.c            | 136 +++--\n fs/nfs/Kconfig      
              |   1 +\n fs/nfs/fscache.c                  |  24 +-\n fs/smb/client/cifssmb.c           |  13 +-\n fs/smb/client/file.c              | 146 +++---\n fs/smb/client/smb2ops.c           |   9 +-\n fs/smb/client/smb2pdu.c           |  28 +-\n fs/smb/client/transport.c         |  15 +-\n include/linux/netfs.h             |  96 ++--\n include/trace/events/cachefiles.h |   2 +\n include/trace/events/netfs.h      |  51 +-\n net/9p/client.c                   |   8 +-\n 39 files changed, 1790 insertions(+), 1230 deletions(-)",
    "diff": "diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c\nindex 862164181bac..0db56cc00467 100644\n--- a/fs/9p/vfs_addr.c\n+++ b/fs/9p/vfs_addr.c\n@@ -48,32 +48,71 @@ static void v9fs_begin_writeback(struct netfs_io_request *wreq)\n \twreq->io_streams[0].avail = true;\n }\n \n+/*\n+ * Estimate how much data should be accumulated before we start issuing\n+ * write subrequests.\n+ */\n+static int v9fs_estimate_write(struct netfs_io_request *wreq,\n+\t\t\t       struct netfs_io_stream *stream,\n+\t\t\t       struct netfs_write_estimate *estimate)\n+{\n+\tstruct p9_fid *fid = wreq->netfs_priv;\n+\tunsigned long long limit = ULLONG_MAX - stream->issue_from;\n+\tunsigned long long max_len = fid->clnt->msize - P9_IOHDRSZ;\n+\n+\testimate->issue_at = stream->issue_from + umin(max_len, limit);\n+\treturn 0;\n+}\n+\n /*\n  * Issue a subrequest to write to the server.\n  */\n-static void v9fs_issue_write(struct netfs_io_subrequest *subreq)\n+static int v9fs_issue_write(struct netfs_io_subrequest *subreq)\n {\n+\tstruct iov_iter iter;\n \tstruct p9_fid *fid = subreq->rreq->netfs_priv;\n \tint err, len;\n \n-\tlen = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);\n+\tsubreq->len = umin(subreq->len, fid->clnt->msize - P9_IOHDRSZ);\n+\n+\terr = netfs_prepare_write_buffer(subreq, INT_MAX);\n+\tif (err < 0)\n+\t\treturn err;\n+\n+\tiov_iter_bvec_queue(&iter, ITER_SOURCE, subreq->content.bvecq,\n+\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n+\n+\tlen = p9_client_write(fid, subreq->start, &iter, &err);\n \tif (len > 0)\n \t\t__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);\n \tnetfs_write_subrequest_terminated(subreq, len ?: err);\n+\treturn err;\n }\n \n /**\n  * v9fs_issue_read - Issue a read from 9P\n  * @subreq: The read to make\n+ * @rctx: Read generation context\n  */\n-static void v9fs_issue_read(struct netfs_io_subrequest *subreq)\n+static int v9fs_issue_read(struct netfs_io_subrequest *subreq)\n {\n \tstruct netfs_io_request 
*rreq = subreq->rreq;\n+\tstruct iov_iter iter;\n \tstruct p9_fid *fid = rreq->netfs_priv;\n \tunsigned long long pos = subreq->start + subreq->transferred;\n \tint total, err;\n \n-\ttotal = p9_client_read(fid, pos, &subreq->io_iter, &err);\n+\terr = netfs_prepare_read_buffer(subreq, INT_MAX);\n+\tif (err < 0)\n+\t\treturn err;\n+\n+\tiov_iter_bvec_queue(&iter, ITER_DEST, subreq->content.bvecq,\n+\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n+\n+\t/* After this point, we're not allowed to return an error. */\n+\tnetfs_mark_read_submission(subreq);\n+\n+\ttotal = p9_client_read(fid, pos, &iter, &err);\n \n \t/* if we just extended the file size, any portion not in\n \t * cache won't be on server and is zeroes */\n@@ -89,6 +128,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)\n \n \tsubreq->error = err;\n \tnetfs_read_subreq_terminated(subreq);\n+\treturn -EIOCBQUEUED;\n }\n \n /**\n@@ -154,6 +194,7 @@ const struct netfs_request_ops v9fs_req_ops = {\n \t.free_request\t\t= v9fs_free_request,\n \t.issue_read\t\t= v9fs_issue_read,\n \t.begin_writeback\t= v9fs_begin_writeback,\n+\t.estimate_write\t\t= v9fs_estimate_write,\n \t.issue_write\t\t= v9fs_issue_write,\n };\n \ndiff --git a/fs/afs/dir.c b/fs/afs/dir.c\nindex 6627a0d38e73..52ab84ab8c1f 100644\n--- a/fs/afs/dir.c\n+++ b/fs/afs/dir.c\n@@ -255,7 +255,8 @@ static ssize_t afs_do_read_single(struct afs_vnode *dvnode, struct file *file)\n \tif (dvnode->directory_size < i_size) {\n \t\tsize_t cur_size = dvnode->directory_size;\n \n-\t\tret = bvecq_expand_buffer(&dvnode->directory, &cur_size, i_size,\n+\t\tret = bvecq_expand_buffer(&dvnode->directory, &cur_size,\n+\t\t\t\t\t  round_up(i_size, PAGE_SIZE),\n \t\t\t\t\t  mapping_gfp_mask(dvnode->netfs.inode.i_mapping));\n \t\tdvnode->directory_size = cur_size;\n \t\tif (ret < 0)\n@@ -2210,9 +2211,10 @@ int afs_single_writepages(struct address_space *mapping,\n \tif (is_dir ?\n \t    test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) 
:\n \t    atomic64_read(&dvnode->cb_expires_at) != AFS_NO_CB_PROMISE) {\n+\t\tsize_t len = i_size_read(&dvnode->netfs.inode);\n \t\tiov_iter_bvec_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0,\n-\t\t\t\t    i_size_read(&dvnode->netfs.inode));\n-\t\tret = netfs_writeback_single(mapping, wbc, &iter);\n+\t\t\t\t    round_up(len, PAGE_SIZE));\n+\t\tret = netfs_writeback_single(mapping, wbc, &iter, len);\n \t}\n \n \tup_read(&dvnode->validate_lock);\ndiff --git a/fs/afs/file.c b/fs/afs/file.c\nindex 424e0c98d67f..42131fe450af 100644\n--- a/fs/afs/file.c\n+++ b/fs/afs/file.c\n@@ -329,11 +329,12 @@ void afs_fetch_data_immediate_cancel(struct afs_call *call)\n /*\n  * Fetch file data from the volume.\n  */\n-static void afs_issue_read(struct netfs_io_subrequest *subreq)\n+static int afs_issue_read(struct netfs_io_subrequest *subreq)\n {\n \tstruct afs_operation *op;\n \tstruct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);\n \tstruct key *key = subreq->rreq->netfs_priv;\n+\tint ret;\n \n \t_enter(\"%s{%llx:%llu.%u},%x,,,\",\n \t       vnode->volume->name,\n@@ -342,19 +343,21 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)\n \t       vnode->fid.unique,\n \t       key_serial(key));\n \n+\tret = netfs_prepare_read_buffer(subreq, INT_MAX);\n+\tif (ret < 0)\n+\t\treturn ret;\n+\n \top = afs_alloc_operation(key, vnode->volume);\n-\tif (IS_ERR(op)) {\n-\t\tsubreq->error = PTR_ERR(op);\n-\t\tnetfs_read_subreq_terminated(subreq);\n-\t\treturn;\n-\t}\n+\tif (IS_ERR(op))\n+\t\treturn PTR_ERR(op);\n \n \tafs_op_set_vnode(op, 0, vnode);\n \n \top->fetch.subreq = subreq;\n \top->ops\t\t= &afs_fetch_data_operation;\n \n-\ttrace_netfs_sreq(subreq, netfs_sreq_trace_submit);\n+\t/* After this point, we're not allowed to return an error. 
*/\n+\tnetfs_mark_read_submission(subreq);\n \n \tif (subreq->rreq->origin == NETFS_READAHEAD ||\n \t    subreq->rreq->iocb) {\n@@ -363,18 +366,19 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)\n \t\tif (!afs_begin_vnode_operation(op)) {\n \t\t\tsubreq->error = afs_put_operation(op);\n \t\t\tnetfs_read_subreq_terminated(subreq);\n-\t\t\treturn;\n+\t\t\treturn -EIOCBQUEUED;\n \t\t}\n \n \t\tif (!afs_select_fileserver(op)) {\n-\t\t\tafs_end_read(op);\n-\t\t\treturn;\n+\t\t\tafs_end_read(op); /* Error recorded here. */\n+\t\t\treturn -EIOCBQUEUED;\n \t\t}\n \n \t\tafs_issue_read_call(op);\n \t} else {\n \t\tafs_do_sync_operation(op);\n \t}\n+\treturn -EIOCBQUEUED;\n }\n \n static int afs_init_request(struct netfs_io_request *rreq, struct file *file)\n@@ -453,7 +457,7 @@ const struct netfs_request_ops afs_req_ops = {\n \t.update_i_size\t\t= afs_update_i_size,\n \t.invalidate_cache\t= afs_netfs_invalidate_cache,\n \t.begin_writeback\t= afs_begin_writeback,\n-\t.prepare_write\t\t= afs_prepare_write,\n+\t.estimate_write\t\t= afs_estimate_write,\n \t.issue_write\t\t= afs_issue_write,\n \t.retry_request\t\t= afs_retry_request,\n };\ndiff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c\nindex 95494d5f2b8a..f59a9db4bb0e 100644\n--- a/fs/afs/fsclient.c\n+++ b/fs/afs/fsclient.c\n@@ -339,7 +339,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)\n \t\tif (call->remaining == 0)\n \t\t\tgoto no_more_data;\n \n-\t\tcall->iter = &subreq->io_iter;\n+\t\tiov_iter_bvec_queue(&call->def_iter, ITER_DEST, subreq->content.bvecq,\n+\t\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n+\n \t\tcall->iov_len = umin(call->remaining, subreq->len - subreq->transferred);\n \t\tcall->unmarshall++;\n \t\tfallthrough;\n@@ -1085,7 +1087,7 @@ static void afs_fs_store_data64(struct afs_operation *op)\n \tif (!call)\n \t\treturn afs_op_nomem(op);\n \n-\tcall->write_iter = op->store.write_iter;\n+\tcall->write_iter = &op->store.write_iter;\n \n \t/* 
marshall the parameters */\n \tbp = call->request;\n@@ -1139,7 +1141,7 @@ void afs_fs_store_data(struct afs_operation *op)\n \tif (!call)\n \t\treturn afs_op_nomem(op);\n \n-\tcall->write_iter = op->store.write_iter;\n+\tcall->write_iter = &op->store.write_iter;\n \n \t/* marshall the parameters */\n \tbp = call->request;\ndiff --git a/fs/afs/internal.h b/fs/afs/internal.h\nindex 9bf5d2f1dbc4..a60df9357a4f 100644\n--- a/fs/afs/internal.h\n+++ b/fs/afs/internal.h\n@@ -906,7 +906,7 @@ struct afs_operation {\n \t\t\tafs_lock_type_t type;\n \t\t} lock;\n \t\tstruct {\n-\t\t\tstruct iov_iter\t*write_iter;\n+\t\t\tstruct iov_iter\twrite_iter;\n \t\t\tloff_t\tpos;\n \t\t\tloff_t\tsize;\n \t\t\tloff_t\ti_size;\n@@ -1680,8 +1680,10 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);\n /*\n  * write.c\n  */\n-void afs_prepare_write(struct netfs_io_subrequest *subreq);\n-void afs_issue_write(struct netfs_io_subrequest *subreq);\n+int afs_estimate_write(struct netfs_io_request *wreq,\n+\t\t       struct netfs_io_stream *stream,\n+\t\t       struct netfs_write_estimate *estimate);\n+int afs_issue_write(struct netfs_io_subrequest *subreq);\n void afs_begin_writeback(struct netfs_io_request *wreq);\n void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream);\n extern int afs_writepages(struct address_space *, struct writeback_control *);\ndiff --git a/fs/afs/write.c b/fs/afs/write.c\nindex 93ad86ff3345..1f6045bfeecc 100644\n--- a/fs/afs/write.c\n+++ b/fs/afs/write.c\n@@ -84,17 +84,20 @@ static const struct afs_operation_ops afs_store_data_operation = {\n };\n \n /*\n- * Prepare a subrequest to write to the server.  
This sets the max_len\n- * parameter.\n+ * Estimate the maximum size of a write we can send to the server.\n  */\n-void afs_prepare_write(struct netfs_io_subrequest *subreq)\n+int afs_estimate_write(struct netfs_io_request *wreq,\n+\t\t       struct netfs_io_stream *stream,\n+\t\t       struct netfs_write_estimate *estimate)\n {\n-\tstruct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr];\n+\tunsigned long long limit = ULLONG_MAX - stream->issue_from;\n+\tunsigned long long max_len = 256 * 1024 * 1024;\n \n \t//if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags))\n-\t//\tsubreq->max_len = 512 * 1024;\n-\t//else\n-\tstream->sreq_max_len = 256 * 1024 * 1024;\n+\t//\tmax_len = 512 * 1024;\n+\n+\testimate->issue_at = stream->issue_from + umin(max_len, limit);\n+\treturn 0;\n }\n \n /*\n@@ -140,12 +143,15 @@ static void afs_issue_write_worker(struct work_struct *work)\n \top->flags\t\t|= AFS_OPERATION_UNINTR;\n \top->ops\t\t\t= &afs_store_data_operation;\n \n+\ttrace_netfs_sreq(subreq, netfs_sreq_trace_submit);\n \tafs_begin_vnode_operation(op);\n \n-\top->store.write_iter\t= &subreq->io_iter;\n \top->store.i_size\t= umax(pos + len, vnode->netfs.remote_i_size);\n \top->mtime\t\t= inode_get_mtime(&vnode->netfs.inode);\n \n+\tiov_iter_bvec_queue(&op->store.write_iter, ITER_SOURCE, subreq->content.bvecq,\n+\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n+\n \tafs_wait_for_operation(op);\n \tret = afs_put_operation(op);\n \tswitch (ret) {\n@@ -169,11 +175,20 @@ static void afs_issue_write_worker(struct work_struct *work)\n \tnetfs_write_subrequest_terminated(subreq, ret < 0 ? 
ret : subreq->len);\n }\n \n-void afs_issue_write(struct netfs_io_subrequest *subreq)\n+int afs_issue_write(struct netfs_io_subrequest *subreq)\n {\n+\tint ret;\n+\n+\tif (subreq->len > 256 * 1024 * 1024)\n+\t\tsubreq->len = 256 * 1024 * 1024;\n+\tret = netfs_prepare_write_buffer(subreq, INT_MAX);\n+\tif (ret < 0)\n+\t\treturn ret;\n+\n \tsubreq->work.func = afs_issue_write_worker;\n \tif (!queue_work(system_dfl_wq, &subreq->work))\n \t\tWARN_ON_ONCE(1);\n+\treturn -EIOCBQUEUED;\n }\n \n /*\n@@ -184,6 +199,8 @@ void afs_begin_writeback(struct netfs_io_request *wreq)\n {\n \tif (S_ISREG(wreq->inode->i_mode))\n \t\tafs_get_writeback_key(wreq);\n+\n+\twreq->io_streams[0].avail = true;\n }\n \n /*\ndiff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c\nindex 24fb562ebd33..ffd1d4c87290 100644\n--- a/fs/afs/yfsclient.c\n+++ b/fs/afs/yfsclient.c\n@@ -385,7 +385,9 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)\n \t\tif (call->remaining == 0)\n \t\t\tgoto no_more_data;\n \n-\t\tcall->iter = &subreq->io_iter;\n+\t\tiov_iter_bvec_queue(&call->def_iter, ITER_DEST, subreq->content.bvecq,\n+\t\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n+\n \t\tcall->iov_len = min(call->remaining, subreq->len - subreq->transferred);\n \t\tcall->unmarshall++;\n \t\tfallthrough;\n@@ -1357,7 +1359,7 @@ void yfs_fs_store_data(struct afs_operation *op)\n \tif (!call)\n \t\treturn afs_op_nomem(op);\n \n-\tcall->write_iter = op->store.write_iter;\n+\tcall->write_iter = &op->store.write_iter;\n \n \t/* marshall the parameters */\n \tbp = call->request;\ndiff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c\nindex 2af55a75b511..05a37b4bdf10 100644\n--- a/fs/cachefiles/io.c\n+++ b/fs/cachefiles/io.c\n@@ -26,7 +26,10 @@ struct cachefiles_kiocb {\n \t};\n \tstruct cachefiles_object *object;\n \tnetfs_io_terminated_t\tterm_func;\n-\tvoid\t\t\t*term_func_priv;\n+\tunion {\n+\t\tstruct netfs_io_subrequest *subreq;\n+\t\tvoid\t\t\t*term_func_priv;\n+\t};\n 
\tbool\t\t\twas_async;\n \tunsigned int\t\tinval_counter;\t/* Copy of cookie->inval_counter */\n \tu64\t\t\tb_writing;\n@@ -194,6 +197,125 @@ static int cachefiles_read(struct netfs_cache_resources *cres,\n \treturn ret;\n }\n \n+/*\n+ * Handle completion of a read from the cache issued by netfslib.\n+ */\n+static void cachefiles_issue_read_complete(struct kiocb *iocb, long ret)\n+{\n+\tstruct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);\n+\tstruct netfs_io_subrequest *subreq = ki->subreq;\n+\tstruct inode *inode = file_inode(ki->iocb.ki_filp);\n+\n+\t_enter(\"%ld\", ret);\n+\n+\tif (ret < 0) {\n+\t\tsubreq->error = -ESTALE;\n+\t\ttrace_cachefiles_io_error(ki->object, inode, ret,\n+\t\t\t\t\t  cachefiles_trace_read_error);\n+\t}\n+\n+\tif (ret >= 0) {\n+\t\tif (ki->object->cookie->inval_counter == ki->inval_counter) {\n+\t\t\tsubreq->error = 0;\n+\t\t\tif (ret > 0) {\n+\t\t\t\tsubreq->transferred += ret;\n+\t\t\t\t__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);\n+\t\t\t}\n+\t\t} else {\n+\t\t\tsubreq->error = -ESTALE;\n+\t\t}\n+\t}\n+\n+\tnetfs_read_subreq_terminated(subreq);\n+\tcachefiles_put_kiocb(ki);\n+}\n+\n+/*\n+ * Issue a read operation to the cache.\n+ */\n+static int cachefiles_issue_read(struct netfs_io_subrequest *subreq)\n+{\n+\tstruct netfs_cache_resources *cres = &subreq->rreq->cache_resources;\n+\tstruct cachefiles_object *object;\n+\tstruct cachefiles_kiocb *ki;\n+\tstruct iov_iter iter;\n+\tstruct file *file;\n+\tunsigned int old_nofs;\n+\tssize_t ret = -ENOBUFS;\n+\n+\tif (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))\n+\t\treturn -ENOBUFS;\n+\n+\tfscache_count_read();\n+\tobject = cachefiles_cres_object(cres);\n+\tfile = cachefiles_cres_file(cres);\n+\n+\t_enter(\"%pD,%li,%llx,%zx/%llx\",\n+\t       file, file_inode(file)->i_ino, subreq->start, subreq->len,\n+\t       i_size_read(file_inode(file)));\n+\n+\tif (subreq->len > MAX_RW_COUNT)\n+\t\tsubreq->len = MAX_RW_COUNT;\n+\n+\tret = 
netfs_prepare_read_buffer(subreq, BIO_MAX_VECS);\n+\tif (ret < 0)\n+\t\treturn ret;\n+\n+\tiov_iter_bvec_queue(&iter, ITER_DEST, subreq->content.bvecq,\n+\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n+\n+\tki = kzalloc_obj(struct cachefiles_kiocb);\n+\tif (!ki)\n+\t\treturn -ENOMEM;\n+\n+\trefcount_set(&ki->ki_refcnt, 2);\n+\tki->iocb.ki_filp\t= file;\n+\tki->iocb.ki_pos\t\t= subreq->start;\n+\tki->iocb.ki_flags\t= IOCB_DIRECT;\n+\tki->iocb.ki_ioprio\t= get_current_ioprio();\n+\tki->iocb.ki_complete\t= cachefiles_issue_read_complete;\n+\tki->object\t\t= object;\n+\tki->inval_counter\t= cres->inval_counter;\n+\tki->subreq\t\t= subreq;\n+\tki->was_async\t\t= true;\n+\n+\t/* After this point, we're not allowed to return an error. */\n+\tnetfs_mark_read_submission(subreq);\n+\n+\tget_file(ki->iocb.ki_filp);\n+\tcachefiles_grab_object(object, cachefiles_obj_get_ioreq);\n+\n+\ttrace_cachefiles_read(object, file_inode(file), ki->iocb.ki_pos, subreq->len);\n+\told_nofs = memalloc_nofs_save();\n+\tret = cachefiles_inject_read_error();\n+\tif (ret == 0)\n+\t\tret = vfs_iocb_iter_read(file, &ki->iocb, &iter);\n+\tmemalloc_nofs_restore(old_nofs);\n+\n+\tswitch (ret) {\n+\tcase -EIOCBQUEUED:\n+\t\tbreak;\n+\n+\tcase -ERESTARTSYS:\n+\tcase -ERESTARTNOINTR:\n+\tcase -ERESTARTNOHAND:\n+\tcase -ERESTART_RESTARTBLOCK:\n+\t\t/* There's no easy way to restart the syscall since other AIO's\n+\t\t * may be already running. 
Just fail this IO with EINTR.\n+\t\t */\n+\t\tret = -EINTR;\n+\t\tfallthrough;\n+\tdefault:\n+\t\tki->was_async = false;\n+\t\tcachefiles_issue_read_complete(&ki->iocb, ret);\n+\t\tbreak;\n+\t}\n+\n+\tcachefiles_put_kiocb(ki);\n+\t_leave(\" = %zd\", ret);\n+\treturn -EIOCBQUEUED;\n+}\n+\n /*\n  * Query the occupancy of the cache in a region, returning the extent of the\n  * next two chunks of cached data and the next hole.\n@@ -610,104 +732,67 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,\n \t\t\t\t    cachefiles_has_space_for_write);\n }\n \n-static int cachefiles_prepare_write(struct netfs_cache_resources *cres,\n-\t\t\t\t    loff_t *_start, size_t *_len, size_t upper_len,\n-\t\t\t\t    loff_t i_size, bool no_space_allocated_yet)\n-{\n-\tstruct cachefiles_object *object = cachefiles_cres_object(cres);\n-\tstruct cachefiles_cache *cache = object->volume->cache;\n-\tconst struct cred *saved_cred;\n-\tint ret;\n-\n-\tif (!cachefiles_cres_file(cres)) {\n-\t\tif (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))\n-\t\t\treturn -ENOBUFS;\n-\t\tif (!cachefiles_cres_file(cres))\n-\t\t\treturn -ENOBUFS;\n-\t}\n-\n-\tcachefiles_begin_secure(cache, &saved_cred);\n-\tret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),\n-\t\t\t\t\t _start, _len, upper_len,\n-\t\t\t\t\t no_space_allocated_yet);\n-\tcachefiles_end_secure(cache, saved_cred);\n-\treturn ret;\n-}\n-\n-static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq)\n+static int cachefiles_estimate_write(struct netfs_io_request *wreq,\n+\t\t\t\t     struct netfs_io_stream *stream,\n+\t\t\t\t     struct netfs_write_estimate *estimate)\n {\n-\tstruct netfs_io_request *wreq = subreq->rreq;\n-\tstruct netfs_cache_resources *cres = &wreq->cache_resources;\n-\tstruct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];\n-\n-\t_enter(\"W=%x[%x] %llx\", wreq->debug_id, subreq->debug_index, subreq->start);\n-\n-\tstream->sreq_max_len = 
MAX_RW_COUNT;\n-\tstream->sreq_max_segs = BIO_MAX_VECS;\n-\n-\tif (!cachefiles_cres_file(cres)) {\n-\t\tif (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))\n-\t\t\treturn netfs_prepare_write_failed(subreq);\n-\t\tif (!cachefiles_cres_file(cres))\n-\t\t\treturn netfs_prepare_write_failed(subreq);\n-\t}\n+\testimate->issue_at = stream->issue_from + MAX_RW_COUNT;\n+\testimate->max_segs = BIO_MAX_VECS;\n+\treturn 0;\n }\n \n-static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)\n+static int cachefiles_issue_write(struct netfs_io_subrequest *subreq)\n {\n \tstruct netfs_io_request *wreq = subreq->rreq;\n \tstruct netfs_cache_resources *cres = &wreq->cache_resources;\n \tstruct cachefiles_object *object = cachefiles_cres_object(cres);\n \tstruct cachefiles_cache *cache = object->volume->cache;\n+\tstruct iov_iter iter;\n \tconst struct cred *saved_cred;\n-\tsize_t off, pre, post, len = subreq->len;\n \tloff_t start = subreq->start;\n+\tsize_t len = subreq->len;\n \tint ret;\n \n \t_enter(\"W=%x[%x] %llx-%llx\",\n \t       wreq->debug_id, subreq->debug_index, start, start + len - 1);\n \n-\t/* We need to start on the cache granularity boundary */\n-\toff = start & (cache->bsize - 1);\n-\tif (off) {\n-\t\tpre = cache->bsize - off;\n-\t\tif (pre >= len) {\n-\t\t\tfscache_count_dio_misfit();\n-\t\t\tnetfs_write_subrequest_terminated(subreq, len);\n-\t\t\treturn;\n-\t\t}\n-\t\tsubreq->transferred += pre;\n-\t\tstart += pre;\n-\t\tlen -= pre;\n-\t\tiov_iter_advance(&subreq->io_iter, pre);\n-\t}\n-\n-\t/* We also need to end on the cache granularity boundary */\n-\tpost = len & (cache->bsize - 1);\n-\tif (post) {\n-\t\tlen -= post;\n-\t\tif (len == 0) {\n-\t\t\tfscache_count_dio_misfit();\n-\t\t\tnetfs_write_subrequest_terminated(subreq, post);\n-\t\t\treturn;\n-\t\t}\n-\t\tiov_iter_truncate(&subreq->io_iter, len);\n+\tif (!cachefiles_cres_file(cres)) {\n+\t\tif (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))\n+\t\t\treturn -EINVAL;\n+\t\tif 
(!cachefiles_cres_file(cres))\n+\t\t\treturn -EINVAL;\n+\t}\n+\n+\tret = netfs_prepare_write_buffer(subreq, BIO_MAX_VECS);\n+\tif (ret < 0)\n+\t\treturn ret;\n+\n+\t/* The buffer extraction func may round out start and end. */\n+\tstart = subreq->start;\n+\tlen = subreq->len;\n+\n+\t/* We need to start and end on cache granularity boundaries. */\n+\tif (WARN_ON_ONCE(start & (cache->bsize - 1)) ||\n+\t    WARN_ON_ONCE(len   & (cache->bsize - 1))) {\n+\t\tfscache_count_dio_misfit();\n+\t\treturn -EIO;\n \t}\n \n+\tiov_iter_bvec_queue(&iter, ITER_SOURCE, subreq->content.bvecq,\n+\t\t\t    subreq->content.slot, subreq->content.offset, len);\n+\n \ttrace_netfs_sreq(subreq, netfs_sreq_trace_cache_prepare);\n \tcachefiles_begin_secure(cache, &saved_cred);\n \tret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),\n \t\t\t\t\t &start, &len, len, true);\n \tcachefiles_end_secure(cache, saved_cred);\n-\tif (ret < 0) {\n-\t\tnetfs_write_subrequest_terminated(subreq, ret);\n-\t\treturn;\n-\t}\n+\tif (ret < 0)\n+\t\treturn ret;\n \n \ttrace_netfs_sreq(subreq, netfs_sreq_trace_cache_write);\n-\tcachefiles_write(&subreq->rreq->cache_resources,\n-\t\t\t subreq->start, &subreq->io_iter,\n+\tcachefiles_write(&subreq->rreq->cache_resources, subreq->start, &iter,\n \t\t\t netfs_write_subrequest_terminated, subreq);\n+\treturn -EIOCBQUEUED;\n }\n \n /*\n@@ -854,9 +939,9 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {\n \t.end_operation\t\t= cachefiles_end_operation,\n \t.read\t\t\t= cachefiles_read,\n \t.write\t\t\t= cachefiles_write,\n+\t.issue_read\t\t= cachefiles_issue_read,\n \t.issue_write\t\t= cachefiles_issue_write,\n-\t.prepare_write\t\t= cachefiles_prepare_write,\n-\t.prepare_write_subreq\t= cachefiles_prepare_write_subreq,\n+\t.estimate_write\t\t= cachefiles_estimate_write,\n \t.prepare_ondemand_read\t= cachefiles_prepare_ondemand_read,\n \t.query_occupancy\t= cachefiles_query_occupancy,\n \t.collect_write\t\t= 
cachefiles_collect_write,\ndiff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig\nindex 3d64a316ca31..aa6ccd7794d2 100644\n--- a/fs/ceph/Kconfig\n+++ b/fs/ceph/Kconfig\n@@ -4,6 +4,7 @@ config CEPH_FS\n \tdepends on INET\n \tselect CEPH_LIB\n \tselect NETFS_SUPPORT\n+\tselect NETFS_PGPRIV2\n \tselect FS_ENCRYPTION_ALGS if FS_ENCRYPTION\n \tdefault n\n \thelp\ndiff --git a/fs/ceph/addr.c b/fs/ceph/addr.c\nindex e87b3bb94ee8..8aab4f7c516f 100644\n--- a/fs/ceph/addr.c\n+++ b/fs/ceph/addr.c\n@@ -269,7 +269,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)\n \tceph_dec_osd_stopping_blocker(fsc->mdsc);\n }\n \n-static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)\n+static int ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)\n {\n \tstruct netfs_io_request *rreq = subreq->rreq;\n \tstruct inode *inode = rreq->inode;\n@@ -278,7 +278,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)\n \tstruct ceph_mds_request *req;\n \tstruct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);\n \tstruct ceph_inode_info *ci = ceph_inode(inode);\n-\tssize_t err = 0;\n+\tstruct iov_iter iter;\n+\tssize_t err;\n \tsize_t len;\n \tint mode;\n \n@@ -287,21 +288,33 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)\n \t\t__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);\n \t__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);\n \n-\tif (subreq->start >= inode->i_size)\n+\tif (subreq->start >= inode->i_size) {\n+\t\t__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);\n+\t\terr = 0;\n \t\tgoto out;\n+\t}\n+\n+\terr = netfs_prepare_read_buffer(subreq, INT_MAX);\n+\tif (err < 0)\n+\t\treturn err;\n+\n+\tiov_iter_bvec_queue(&iter, ITER_DEST, subreq->content.bvecq,\n+\t\t\t    subreq->content.slot, subreq->content.offset,\n+\t\t\t    subreq->len);\n \n \t/* We need to fetch the inline data. 
*/\n \tmode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);\n \treq = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);\n-\tif (IS_ERR(req)) {\n-\t\terr = PTR_ERR(req);\n-\t\tgoto out;\n-\t}\n+\tif (IS_ERR(req))\n+\t\treturn PTR_ERR(req);\n+\n \treq->r_ino1 = ci->i_vino;\n \treq->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);\n \treq->r_num_caps = 2;\n \n-\ttrace_netfs_sreq(subreq, netfs_sreq_trace_submit);\n+\t/* After this point, we're not allowed to return an error. */\n+\tnetfs_mark_read_submission(subreq);\n+\n \terr = ceph_mdsc_do_request(mdsc, NULL, req);\n \tif (err < 0)\n \t\tgoto out;\n@@ -311,11 +324,11 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)\n \tif (iinfo->inline_version == CEPH_INLINE_NONE) {\n \t\t/* The data got uninlined */\n \t\tceph_mdsc_put_request(req);\n-\t\treturn false;\n+\t\treturn 1;\n \t}\n \n \tlen = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);\n-\terr = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter);\n+\terr = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);\n \tif (err == 0) {\n \t\terr = -EFAULT;\n \t} else {\n@@ -328,26 +341,10 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)\n \tsubreq->error = err;\n \ttrace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);\n \tnetfs_read_subreq_terminated(subreq);\n-\treturn true;\n+\treturn -EIOCBQUEUED;\n }\n \n-static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq)\n-{\n-\tstruct netfs_io_request *rreq = subreq->rreq;\n-\tstruct inode *inode = rreq->inode;\n-\tstruct ceph_inode_info *ci = ceph_inode(inode);\n-\tstruct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);\n-\tu64 objno, objoff;\n-\tu32 xlen;\n-\n-\t/* Truncate the extent at the end of the current block */\n-\tceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,\n-\t\t\t\t      &objno, &objoff, 
&xlen);\n-\trreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize);\n-\treturn 0;\n-}\n-\n-static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)\n+static int ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)\n {\n \tstruct netfs_io_request *rreq = subreq->rreq;\n \tstruct inode *inode = rreq->inode;\n@@ -356,48 +353,65 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)\n \tstruct ceph_client *cl = fsc->client;\n \tstruct ceph_osd_request *req = NULL;\n \tstruct ceph_vino vino = ceph_vino(inode);\n+\tstruct iov_iter iter;\n+\tu64 objno, objoff, len, off = subreq->start;\n+\tu32 maxlen;\n \tint err;\n-\tu64 len;\n \tbool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);\n-\tu64 off = subreq->start;\n \tint extent_cnt;\n \n-\tif (ceph_inode_is_shutdown(inode)) {\n-\t\terr = -EIO;\n-\t\tgoto out;\n+\tif (ceph_inode_is_shutdown(inode))\n+\t\treturn -EIO;\n+\n+\tif (ceph_has_inline_data(ci)) {\n+\t\terr = ceph_netfs_issue_op_inline(subreq);\n+\t\tif (err != 1)\n+\t\t\treturn err;\n \t}\n \n-\tif (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))\n-\t\treturn;\n+\t/* Truncate the extent at the end of the current block */\n+\tceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,\n+\t\t\t\t      &objno, &objoff, &maxlen);\n+\tmaxlen = umin(maxlen, fsc->mount_options->rsize);\n+\tlen = umin(subreq->len, maxlen);\n+\tsubreq->len = len;\n \n \t// TODO: This rounding here is slightly dodgy.  It *should* work, for\n \t// now, as the cache only deals in blocks that are a multiple of\n \t// PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE.  
What needs to\n \t// happen is for the fscrypt driving to be moved into netfslib and the\n \t// data in the cache also to be stored encrypted.\n-\tlen = subreq->len;\n \tceph_fscrypt_adjust_off_and_len(inode, &off, &len);\n \n \treq = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,\n \t\t\toff, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,\n \t\t\tCEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq,\n \t\t\tci->i_truncate_size, false);\n-\tif (IS_ERR(req)) {\n-\t\terr = PTR_ERR(req);\n-\t\treq = NULL;\n-\t\tgoto out;\n-\t}\n+\tif (IS_ERR(req))\n+\t\treturn PTR_ERR(req);\n \n \tif (sparse) {\n \t\textent_cnt = __ceph_sparse_read_ext_count(inode, len);\n \t\terr = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt);\n-\t\tif (err)\n-\t\t\tgoto out;\n+\t\tif (err) {\n+\t\t\tceph_osdc_put_request(req);\n+\t\t\treturn err;\n+\t\t}\n \t}\n \n \tdoutc(cl, \"%llx.%llx pos=%llu orig_len=%zu len=%llu\\n\",\n \t      ceph_vinop(inode), subreq->start, subreq->len, len);\n \n+\terr = netfs_prepare_read_buffer(subreq, INT_MAX);\n+\tif (err < 0) {\n+\t\tceph_osdc_put_request(req);\n+\t\treturn err;\n+\t}\n+\n+\tiov_iter_bvec_queue(&iter, ITER_DEST, subreq->content.bvecq,\n+\t\t\t    subreq->content.slot, subreq->content.offset,\n+\t\t\t    subreq->len);\n+\n \t/*\n \t * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for\n \t * encrypted inodes. 
We'd need infrastructure that handles an iov_iter\n@@ -416,13 +430,12 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)\n \t\t * ceph_msg_data_cursor_init() triggers BUG_ON() in the case\n \t\t * if msg->sparse_read_total > msg->data_length.\n \t\t */\n-\t\tsubreq->io_iter.count = len;\n-\n-\t\terr = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);\n+\t\terr = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);\n \t\tif (err < 0) {\n \t\t\tdoutc(cl, \"%llx.%llx failed to allocate pages, %d\\n\",\n \t\t\t      ceph_vinop(inode), err);\n-\t\t\tgoto out;\n+\t\t\tceph_osdc_put_request(req);\n+\t\t\treturn -EIO;\n \t\t}\n \n \t\t/* should always give us a page-aligned read */\n@@ -433,32 +446,28 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)\n \t\tosd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,\n \t\t\t\t\t\t false);\n \t} else {\n-\t\tosd_req_op_extent_osd_iter(req, 0, &subreq->io_iter);\n+\t\tosd_req_op_extent_osd_iter(req, 0, &iter);\n \t}\n \tif (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {\n-\t\terr = -EIO;\n-\t\tgoto out;\n+\t\tceph_osdc_put_request(req);\n+\t\treturn -EIO;\n \t}\n \treq->r_callback = finish_netfs_read;\n \treq->r_priv = subreq;\n \treq->r_inode = inode;\n \tihold(inode);\n \n-\ttrace_netfs_sreq(subreq, netfs_sreq_trace_submit);\n+\t/* After this point, we're not allowed to return an error. 
*/\n+\tnetfs_mark_read_submission(subreq);\n \tceph_osdc_start_request(req->r_osdc, req);\n-out:\n \tceph_osdc_put_request(req);\n-\tif (err) {\n-\t\tsubreq->error = err;\n-\t\tnetfs_read_subreq_terminated(subreq);\n-\t}\n-\tdoutc(cl, \"%llx.%llx result %d\\n\", ceph_vinop(inode), err);\n+\tdoutc(cl, \"%llx.%llx result -EIOCBQUEUED\\n\", ceph_vinop(inode));\n+\treturn -EIOCBQUEUED;\n }\n \n static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)\n {\n \tstruct inode *inode = rreq->inode;\n-\tstruct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);\n \tstruct ceph_client *cl = ceph_inode_to_client(inode);\n \tint got = 0, want = CEPH_CAP_FILE_CACHE;\n \tstruct ceph_netfs_request_data *priv;\n@@ -510,7 +519,6 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)\n \n \tpriv->caps = got;\n \trreq->netfs_priv = priv;\n-\trreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize;\n \n out:\n \tif (ret < 0) {\n@@ -538,7 +546,6 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq)\n const struct netfs_request_ops ceph_netfs_ops = {\n \t.init_request\t\t= ceph_init_request,\n \t.free_request\t\t= ceph_netfs_free_request,\n-\t.prepare_read\t\t= ceph_netfs_prepare_read,\n \t.issue_read\t\t= ceph_netfs_issue_read,\n \t.expand_readahead\t= ceph_netfs_expand_readahead,\n \t.check_write_begin\t= ceph_netfs_check_write_begin,\ndiff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig\nindex 7701c037c328..d0e7b0971fa3 100644\n--- a/fs/netfs/Kconfig\n+++ b/fs/netfs/Kconfig\n@@ -22,6 +22,9 @@ config NETFS_STATS\n \t  between CPUs.  On the other hand, the stats are very useful for\n \t  debugging purposes.  
Saying 'Y' here is recommended.\n \n+config NETFS_PGPRIV2\n+\tbool\n+\n config NETFS_DEBUG\n \tbool \"Enable dynamic debugging netfslib and FS-Cache\"\n \tdepends on NETFS_SUPPORT\ndiff --git a/fs/netfs/Makefile b/fs/netfs/Makefile\nindex 0621e6870cbd..421dd0be413b 100644\n--- a/fs/netfs/Makefile\n+++ b/fs/netfs/Makefile\n@@ -12,13 +12,13 @@ netfs-y := \\\n \tmisc.o \\\n \tobjects.o \\\n \tread_collect.o \\\n-\tread_pgpriv2.o \\\n \tread_retry.o \\\n \tread_single.o \\\n \twrite_collect.o \\\n \twrite_issue.o \\\n \twrite_retry.o\n \n+netfs-$(CONFIG_NETFS_PGPRIV2) += read_pgpriv2.o\n netfs-$(CONFIG_NETFS_STATS) += stats.o\n \n netfs-$(CONFIG_FSCACHE) += \\\ndiff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c\nindex 2cfd33abfb80..81aa99910e5d 100644\n--- a/fs/netfs/buffered_read.c\n+++ b/fs/netfs/buffered_read.c\n@@ -98,51 +98,68 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in\n }\n \n /*\n- * netfs_prepare_read_iterator - Prepare the subreq iterator for I/O\n- * @subreq: The subrequest to be set up\n- *\n- * Prepare the I/O iterator representing the read buffer on a subrequest for\n- * the filesystem to use for I/O (it can be passed directly to a socket).  
This\n- * is intended to be called from the ->issue_read() method once the filesystem\n- * has trimmed the request to the size it wants.\n- *\n- * Returns the limited size if successful and -ENOMEM if insufficient memory\n- * available.\n+ * Prepare the I/O buffer on a buffered read subrequest for the filesystem to\n+ * use as a bvec queue.\n  */\n-static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)\n+static int netfs_prepare_buffered_read_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t      unsigned int max_segs)\n {\n \tstruct netfs_io_request *rreq = subreq->rreq;\n \tstruct netfs_io_stream *stream = &rreq->io_streams[0];\n \tssize_t extracted;\n-\tsize_t rsize = subreq->len;\n \n-\tif (subreq->source == NETFS_DOWNLOAD_FROM_SERVER)\n-\t\trsize = umin(rsize, stream->sreq_max_len);\n+\t_enter(\"R=%08x[%x] l=%zx s=%u\",\n+\t       rreq->debug_id, subreq->debug_index, subreq->len, max_segs);\n \n-\tbvecq_pos_set(&subreq->dispatch_pos, &rreq->dispatch_cursor);\n-\textracted = bvecq_slice(&rreq->dispatch_cursor, subreq->len,\n-\t\t\t\tstream->sreq_max_segs, &subreq->nr_segs);\n-\tif (extracted < rsize) {\n+\tbvecq_pos_set(&subreq->dispatch_pos, &stream->dispatch_cursor);\n+\tbvecq_pos_set(&subreq->content, &subreq->dispatch_pos);\n+\textracted = bvecq_slice(&stream->dispatch_cursor, subreq->len,\n+\t\t\t\tmax_segs, &subreq->nr_segs);\n+\n+\tstream->buffered -= extracted;\n+\tif (extracted < subreq->len) {\n \t\tsubreq->len = extracted;\n \t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_limited);\n \t}\n \n-\treturn subreq->len;\n+\treturn 0;\n }\n \n-/*\n- * Issue a read against the cache.\n- * - Eats the caller's ref on subreq.\n+/**\n+ * netfs_prepare_read_buffer - Get the buffer for a subrequest\n+ * @subreq: The subrequest to get the buffer for\n+ * @max_segs: Maximum number of segments in buffer (or INT_MAX)\n+ *\n+ * Extract a slice of buffer from the stream and attach it to the subrequest as\n+ * a bio_vec queue.  
The maximum amount of data attached is set by\n+ * @subreq->len, but this may be shortened if @max_segs would be exceeded.\n+ *\n+ * [!] NOTE: This must be run in the same thread as ->issue_read() was called\n+ * in as we access the readahead_control struct if there is one.\n  */\n-static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,\n-\t\t\t\t\t  struct netfs_io_subrequest *subreq)\n+int netfs_prepare_read_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t      unsigned int max_segs)\n {\n-\tstruct netfs_cache_resources *cres = &rreq->cache_resources;\n-\n-\tnetfs_stat(&netfs_n_rh_read);\n-\tcres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_IGNORE,\n-\t\t\tnetfs_cache_read_terminated, subreq);\n+\tswitch (subreq->rreq->origin) {\n+\tcase NETFS_READAHEAD:\n+\tcase NETFS_READPAGE:\n+\tcase NETFS_READ_FOR_WRITE:\n+\t\tif (subreq->retry_count)\n+\t\t\treturn netfs_prepare_buffered_read_retry_buffer(subreq, max_segs);\n+\t\treturn netfs_prepare_buffered_read_buffer(subreq, max_segs);\n+\n+\tcase NETFS_UNBUFFERED_READ:\n+\tcase NETFS_DIO_READ:\n+\tcase NETFS_READ_GAPS:\n+\t\treturn netfs_prepare_unbuffered_read_buffer(subreq, max_segs);\n+\tcase NETFS_READ_SINGLE:\n+\t\treturn netfs_prepare_read_single_buffer(subreq, max_segs);\n+\tdefault:\n+\t\tWARN_ON_ONCE(1);\n+\t\treturn -EIO;\n+\t}\n }\n+EXPORT_SYMBOL(netfs_prepare_read_buffer);\n \n int netfs_read_query_cache(struct netfs_io_request *rreq, struct fscache_occupancy *occ)\n {\n@@ -157,12 +174,22 @@ int netfs_read_query_cache(struct netfs_io_request *rreq, struct fscache_occupan\n \treturn cres->ops->query_occupancy(cres, occ);\n }\n \n-static void netfs_queue_read(struct netfs_io_request *rreq,\n-\t\t\t     struct netfs_io_subrequest *subreq,\n-\t\t\t     bool last_subreq)\n+/**\n+ * netfs_mark_read_submission - Mark a read subrequest as being ready for submission\n+ * @subreq: The subrequest to be marked\n+ *\n+ * Calling this marks a read subrequest as being ready for 
submission and makes\n+ * it available to the collection thread.  After calling this, the filesystem's\n+ * ->issue_read() method must invoke netfs_read_subreq_terminated() to end the\n+ * subrequest and must return -EIOCBQUEUED.\n+ */\n+void netfs_mark_read_submission(struct netfs_io_subrequest *subreq)\n {\n+\tstruct netfs_io_request *rreq = subreq->rreq;\n \tstruct netfs_io_stream *stream = &rreq->io_streams[0];\n \n+\t_enter(\"R=%08x[%x]\", rreq->debug_id, subreq->debug_index);\n+\n \t__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);\n \n \t/* We add to the end of the list whilst the collector may be walking\n@@ -170,45 +197,57 @@ static void netfs_queue_read(struct netfs_io_request *rreq,\n \t * remove entries off of the front.\n \t */\n \tspin_lock(&rreq->lock);\n-\tlist_add_tail(&subreq->rreq_link, &stream->subrequests);\n-\tif (list_is_first(&subreq->rreq_link, &stream->subrequests)) {\n-\t\tif (!stream->active) {\n-\t\t\tstream->collected_to = subreq->start;\n-\t\t\t/* Store list pointers before active flag */\n-\t\t\tsmp_store_release(&stream->active, true);\n+\tif (list_empty(&subreq->rreq_link)) {\n+\t\tlist_add_tail(&subreq->rreq_link, &stream->subrequests);\n+\t\tif (list_is_first(&subreq->rreq_link, &stream->subrequests)) {\n+\t\t\tif (!stream->active) {\n+\t\t\t\tstream->collected_to = subreq->start;\n+\t\t\t\t/* Store list pointers before active flag */\n+\t\t\t\tsmp_store_release(&stream->active, true);\n+\t\t\t}\n \t\t}\n \t}\n \n-\tif (last_subreq) {\n+\trreq->submitted += subreq->len;\n+\tstream->issue_from = subreq->start + subreq->len;\n+\tif (!stream->buffered) {\n \t\tsmp_wmb(); /* Write lists before ALL_QUEUED. 
*/\n \t\tset_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);\n+\t\ttrace_netfs_rreq(rreq, netfs_rreq_trace_all_queued);\n \t}\n \n \tspin_unlock(&rreq->lock);\n+\n+\ttrace_netfs_sreq(subreq, netfs_sreq_trace_submit);\n }\n+EXPORT_SYMBOL(netfs_mark_read_submission);\n \n-static void netfs_issue_read(struct netfs_io_request *rreq,\n-\t\t\t     struct netfs_io_subrequest *subreq)\n+static int netfs_issue_read(struct netfs_io_request *rreq,\n+\t\t\t    struct netfs_io_subrequest *subreq)\n {\n-\tbvecq_pos_set(&subreq->content, &subreq->dispatch_pos);\n-\tiov_iter_bvec_queue(&subreq->io_iter, ITER_DEST, subreq->content.bvecq,\n-\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n+\tstruct netfs_io_stream *stream = &rreq->io_streams[0];\n+\n+\t_enter(\"R=%08x[%x]\", rreq->debug_id, subreq->debug_index);\n \n \tswitch (subreq->source) {\n \tcase NETFS_DOWNLOAD_FROM_SERVER:\n-\t\trreq->netfs_ops->issue_read(subreq);\n-\t\tbreak;\n-\tcase NETFS_READ_FROM_CACHE:\n-\t\tnetfs_read_cache_to_pagecache(rreq, subreq);\n-\t\tbreak;\n+\t\treturn rreq->netfs_ops->issue_read(subreq);\n+\tcase NETFS_READ_FROM_CACHE: {\n+\t\tstruct netfs_cache_resources *cres = &rreq->cache_resources;\n+\n+\t\tnetfs_stat(&netfs_n_rh_read);\n+\t\tcres->ops->issue_read(subreq);\n+\t\treturn -EIOCBQUEUED;\n+\t}\n \tdefault:\n-\t\tbvecq_zero(&rreq->dispatch_cursor, subreq->len);\n+\t\tstream->issue_from = subreq->start + subreq->len;\n+\t\tstream->buffered = 0;\n+\t\tnetfs_mark_read_submission(subreq);\n+\t\tbvecq_zero(&stream->dispatch_cursor, subreq->len);\n \t\tsubreq->transferred = subreq->len;\n \t\tsubreq->error = 0;\n-\t\tiov_iter_zero(subreq->len, &subreq->io_iter);\n-\t\tsubreq->transferred = subreq->len;\n \t\tnetfs_read_subreq_terminated(subreq);\n-\t\tbreak;\n+\t\treturn 0;\n \t}\n }\n \n@@ -228,21 +267,18 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)\n \t\t.cached_to[1]\t= ULLONG_MAX,\n \t};\n \tstruct fscache_occupancy *occ = &_occ;\n+\tstruct 
netfs_io_stream *stream = &rreq->io_streams[0];\n \tstruct netfs_inode *ictx = netfs_inode(rreq->inode);\n-\tunsigned long long start = rreq->start;\n-\tssize_t size = rreq->len;\n \tint ret = 0;\n \n \t_enter(\"R=%08x\", rreq->debug_id);\n \n-\tbvecq_pos_set(&rreq->dispatch_cursor, &rreq->load_cursor);\n-\tbvecq_pos_set(&rreq->collect_cursor, &rreq->dispatch_cursor);\n+\tbvecq_pos_set(&stream->dispatch_cursor, &rreq->load_cursor);\n+\tbvecq_pos_set(&rreq->collect_cursor, &rreq->load_cursor);\n \n \tdo {\n-\t\tint (*prepare_read)(struct netfs_io_subrequest *subreq) = NULL;\n \t\tstruct netfs_io_subrequest *subreq;\n-\t\tunsigned long long hole_to, cache_to;\n-\t\tssize_t slice;\n+\t\tunsigned long long hole_to, cache_to, stop;\n \n \t\t/* If we don't have any, find out the next couple of data\n \t\t * extents from the cache, containing of following the\n@@ -251,7 +287,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)\n \t\t */\n \t\thole_to = occ->cached_from[0];\n \t\tcache_to = occ->cached_to[0];\n-\t\tif (start >= cache_to) {\n+\t\tif (stream->issue_from >= cache_to) {\n \t\t\t/* Extent exhausted; shuffle down. 
*/\n \t\t\tint i;\n \n@@ -279,36 +315,33 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)\n \t\t\tbreak;\n \t\t}\n \n-\t\tsubreq->start\t= start;\n-\t\tsubreq->len\t= size;\n+\t\tsubreq->start = stream->issue_from;\n+\t\tstop = stream->issue_from + stream->buffered;\n \n \t\t_debug(\"rsub %llx %llx-%llx\", subreq->start, hole_to, cache_to);\n \n-\t\tif (start >= hole_to && start < cache_to) {\n+\t\tif (stream->issue_from >= hole_to && stream->issue_from < cache_to) {\n \t\t\t/* Overlap with a cached region, where the cache may\n \t\t\t * record a block of zeroes.\n \t\t\t */\n-\t\t\t_debug(\"cached s=%llx c=%llx l=%zx\", start, cache_to, size);\n-\t\t\tsubreq->len = umin(cache_to - start, size);\n+\t\t\t_debug(\"cached s=%llx c=%llx l=%zx\",\n+\t\t\t       stream->issue_from, cache_to, stream->buffered);\n+\t\t\tsubreq->len = umin(cache_to - stream->issue_from, stream->buffered);\n \t\t\tsubreq->len = round_up(subreq->len, occ->granularity);\n \t\t\tif (occ->cached_type[0] == FSCACHE_EXTENT_ZERO) {\n \t\t\t\tsubreq->source = NETFS_FILL_WITH_ZEROES;\n \t\t\t\tnetfs_stat(&netfs_n_rh_zero);\n \t\t\t} else {\n \t\t\t\tsubreq->source = NETFS_READ_FROM_CACHE;\n-\t\t\t\tprepare_read = rreq->cache_resources.ops->prepare_read;\n \t\t\t}\n-\n-\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_prepare);\n-\n \t\t} else if ((subreq->start >= ictx->zero_point ||\n \t\t\t    subreq->start >= rreq->i_size) &&\n-\t\t\t   size > 0) {\n+\t\t\t   subreq->start < stop) {\n \t\t\t/* If this range lies beyond the zero-point, that part\n \t\t\t * can just be cleared locally.\n \t\t\t */\n-\t\t\t_debug(\"zero %llx-%llx\", start, start + size);\n-\t\t\tsubreq->len = size;\n+\t\t\t_debug(\"zero %llx-%llx\", subreq->start, stop);\n+\t\t\tsubreq->len = stream->buffered;\n \t\t\tsubreq->source = NETFS_FILL_WITH_ZEROES;\n \t\t\tif (rreq->cache_resources.ops)\n \t\t\t\t__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);\n@@ -319,10 +352,10 @@ static void 
netfs_read_to_pagecache(struct netfs_io_request *rreq)\n \t\t\t * that part can just be cleared locally.\n \t\t\t */\n \t\t\tunsigned long long zlimit = umin(rreq->i_size, ictx->zero_point);\n-\t\t\tunsigned long long limit = min3(zlimit, start + size, hole_to);\n+\t\t\tunsigned long long limit = min3(zlimit, stop, hole_to);\n \n \t\t\t_debug(\"limit %llx %llx\", rreq->i_size, ictx->zero_point);\n-\t\t\t_debug(\"download %llx-%llx\", start, start + size);\n+\t\t\t_debug(\"download %llx-%llx\", subreq->start, stop);\n \t\t\tsubreq->len = umin(limit - subreq->start, ULONG_MAX);\n \t\t\tsubreq->source = NETFS_DOWNLOAD_FROM_SERVER;\n \t\t\tif (rreq->cache_resources.ops)\n@@ -330,10 +363,10 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)\n \t\t\tnetfs_stat(&netfs_n_rh_download);\n \t\t}\n \n-\t\tif (size == 0) {\n+\t\tif (subreq->len == 0) {\n \t\t\tpr_err(\"ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx\",\n \t\t\t       rreq->debug_id, subreq->debug_index,\n-\t\t\t       subreq->len, size,\n+\t\t\t       subreq->len, stream->buffered,\n \t\t\t       subreq->start, ictx->zero_point, rreq->i_size);\n \t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_cancel);\n \t\t\t/* Not queued - release both refs. */\n@@ -342,24 +375,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)\n \t\t\tbreak;\n \t\t}\n \n-\t\trreq->io_streams[0].sreq_max_len = MAX_RW_COUNT;\n-\t\trreq->io_streams[0].sreq_max_segs = INT_MAX;\n-\n-\t\tif (prepare_read) {\n-\t\t\tret = prepare_read(subreq);\n-\t\t\tif (ret < 0) {\n-\t\t\t\tsubreq->error = ret;\n-\t\t\t\t/* Not queued - release both refs. 
*/\n-\t\t\t\tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);\n-\t\t\t\tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);\n-\t\t\t\tbreak;\n-\t\t\t}\n-\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_prepare);\n-\t\t}\n-\n-\t\tslice = netfs_prepare_read_iterator(subreq);\n-\t\tif (slice < 0) {\n-\t\t\tret = slice;\n+\t\tret = netfs_issue_read(rreq, subreq);\n+\t\tif (ret != 0 && ret != -EIOCBQUEUED) {\n \t\t\tsubreq->error = ret;\n \t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_cancel);\n \t\t\t/* Not queued - release both refs. */\n@@ -367,18 +384,12 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)\n \t\t\tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);\n \t\t\tbreak;\n \t\t}\n-\t\tsize -= slice;\n-\t\tstart += slice;\n+\t\tret = 0;\n \n-\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_submit);\n-\n-\t\tnetfs_queue_read(rreq, subreq, size <= 0);\n-\t\tnetfs_issue_read(rreq, subreq);\n-\t\tnetfs_maybe_bulk_drop_ra_refs(rreq);\n \t\tcond_resched();\n-\t} while (size > 0);\n+\t} while (stream->buffered > 0);\n \n-\tif (unlikely(size > 0)) {\n+\tif (unlikely(stream->buffered > 0)) {\n \t\tsmp_wmb(); /* Write lists before ALL_QUEUED. 
*/\n \t\tset_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);\n \t\tnetfs_wake_collector(rreq);\n@@ -388,7 +399,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)\n \tcmpxchg(&rreq->error, 0, ret);\n \n \tbvecq_pos_unset(&rreq->load_cursor);\n-\tbvecq_pos_unset(&rreq->dispatch_cursor);\n+\tbvecq_pos_unset(&stream->dispatch_cursor);\n }\n \n /**\n@@ -409,17 +420,22 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)\n void netfs_readahead(struct readahead_control *ractl)\n {\n \tstruct netfs_io_request *rreq;\n+\tstruct netfs_io_stream *stream;\n \tstruct netfs_inode *ictx = netfs_inode(ractl->mapping->host);\n \tunsigned long long start = readahead_pos(ractl);\n \tssize_t added;\n \tsize_t size = readahead_length(ractl);\n \tint ret;\n \n+\t_enter(\"\");\n+\n \trreq = netfs_alloc_request(ractl->mapping, ractl->file, start, size,\n \t\t\t\t   NETFS_READAHEAD);\n \tif (IS_ERR(rreq))\n \t\treturn;\n \n+\tstream = &rreq->io_streams[0];\n+\n \t__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags);\n \n \tret = netfs_begin_cache_read(rreq, ictx);\n@@ -446,6 +462,8 @@ void netfs_readahead(struct readahead_control *ractl)\n \trreq->submitted = rreq->start + added;\n \trreq->cleaned_to = rreq->start;\n \trreq->front_folio_order = get_order(rreq->load_cursor.bvecq->bv[0].bv_len);\n+\tstream->issue_from = rreq->start;\n+\tstream->buffered = added;\n \n \tnetfs_read_to_pagecache(rreq);\n \tnetfs_maybe_bulk_drop_ra_refs(rreq);\n@@ -461,6 +479,7 @@ EXPORT_SYMBOL(netfs_readahead);\n  */\n static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio)\n {\n+\tstruct netfs_io_stream *stream = &rreq->io_streams[0];\n \tstruct bvecq *bq;\n \tsize_t fsize = folio_size(folio);\n \n@@ -470,6 +489,8 @@ static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct fo\n \tbq = rreq->load_cursor.bvecq;\n \tbvec_set_folio(&bq->bv[bq->nr_slots++], folio, fsize, 0);\n \trreq->submitted = rreq->start + 
fsize;\n+\tstream->issue_from = rreq->start;\n+\tstream->buffered = fsize;\n \treturn 0;\n }\n \n@@ -479,6 +500,7 @@ static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct fo\n static int netfs_read_gaps(struct file *file, struct folio *folio)\n {\n \tstruct netfs_io_request *rreq;\n+\tstruct netfs_io_stream *stream;\n \tstruct address_space *mapping = folio->mapping;\n \tstruct netfs_folio *finfo = netfs_folio_info(folio);\n \tstruct netfs_inode *ctx = netfs_inode(mapping->host);\n@@ -499,6 +521,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)\n \t\tret = PTR_ERR(rreq);\n \t\tgoto alloc_error;\n \t}\n+\tstream = &rreq->io_streams[0];\n \n \tret = netfs_begin_cache_read(rreq, ctx);\n \tif (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)\n@@ -546,6 +569,8 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)\n \t}\n \n \trreq->submitted = rreq->start + flen;\n+\tstream->issue_from = rreq->start;\n+\tstream->buffered = flen;\n \n \tnetfs_read_to_pagecache(rreq);\n \n@@ -618,6 +643,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)\n \t\tgoto discard;\n \n \tnetfs_read_to_pagecache(rreq);\n+\n \tret = netfs_wait_for_read(rreq);\n \tnetfs_put_request(rreq, netfs_rreq_trace_put_return);\n \treturn ret < 0 ? ret : 0;\ndiff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c\nindex bce3e7109ec1..855c14118c53 100644\n--- a/fs/netfs/buffered_write.c\n+++ b/fs/netfs/buffered_write.c\n@@ -114,8 +114,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,\n \t\t.range_start\t= iocb->ki_pos,\n \t\t.range_end\t= iocb->ki_pos + iter->count,\n \t};\n-\tstruct netfs_io_request *wreq = NULL;\n-\tstruct folio *folio = NULL, *writethrough = NULL;\n+\tstruct netfs_writethrough *writethrough = NULL;\n+\tstruct folio *folio = NULL;\n \tunsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? 
BDP_ASYNC : 0;\n \tssize_t written = 0, ret, ret2;\n \tloff_t pos = iocb->ki_pos;\n@@ -132,15 +132,13 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,\n \t\t\tgoto out;\n \t\t}\n \n-\t\twreq = netfs_begin_writethrough(iocb, iter->count);\n-\t\tif (IS_ERR(wreq)) {\n+\t\twritethrough = netfs_begin_writethrough(iocb, iter->count);\n+\t\tif (IS_ERR(writethrough)) {\n \t\t\twbc_detach_inode(&wbc);\n-\t\t\tret = PTR_ERR(wreq);\n-\t\t\twreq = NULL;\n+\t\t\tret = PTR_ERR(writethrough);\n+\t\t\twritethrough = NULL;\n \t\t\tgoto out;\n \t\t}\n-\t\tif (!is_sync_kiocb(iocb))\n-\t\t\twreq->iocb = iocb;\n \t\tnetfs_stat(&netfs_n_wh_writethrough);\n \t} else {\n \t\tnetfs_stat(&netfs_n_wh_buffered_write);\n@@ -264,7 +262,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,\n \t\t * a file that's open for reading as ->read_folio() then has to\n \t\t * be able to flush it.\n \t\t */\n-\t\tif ((file->f_mode & FMODE_READ) ||\n+\t\tif (//(file->f_mode & FMODE_READ) ||\n \t\t    netfs_is_cache_enabled(ctx)) {\n \t\t\tif (finfo) {\n \t\t\t\tnetfs_stat(&netfs_n_wh_wstream_conflict);\n@@ -355,13 +353,12 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,\n \t\tpos += copied;\n \t\twritten += copied;\n \n-\t\tif (likely(!wreq)) {\n+\t\tif (likely(!writethrough)) {\n \t\t\tfolio_mark_dirty(folio);\n \t\t\tfolio_unlock(folio);\n \t\t} else {\n-\t\t\tnetfs_advance_writethrough(wreq, &wbc, folio, copied,\n-\t\t\t\t\t\t   offset + copied == flen,\n-\t\t\t\t\t\t   &writethrough);\n+\t\t\tnetfs_advance_writethrough(writethrough, &wbc, folio, copied,\n+\t\t\t\t\t\t   offset + copied == flen);\n \t\t\t/* Folio unlocked */\n \t\t}\n \tretry:\n@@ -385,8 +382,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,\n \t\t\tctx->ops->post_modify(inode);\n \t}\n \n-\tif (unlikely(wreq)) {\n-\t\tret2 = netfs_end_writethrough(wreq, &wbc, writethrough);\n+\tif (unlikely(writethrough)) {\n+\t\tret2 = 
netfs_end_writethrough(writethrough, &wbc);\n \t\twbc_detach_inode(&wbc);\n \t\tif (ret2 == -EIOCBQUEUED)\n \t\t\treturn ret2;\ndiff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c\nindex 05d09ba3d0d0..872e44227368 100644\n--- a/fs/netfs/direct_read.c\n+++ b/fs/netfs/direct_read.c\n@@ -16,6 +16,28 @@\n #include <linux/netfs.h>\n #include \"internal.h\"\n \n+int netfs_prepare_unbuffered_read_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t unsigned int max_segs)\n+{\n+\tstruct netfs_io_request *rreq = subreq->rreq;\n+\tstruct netfs_io_stream *stream = &rreq->io_streams[0];\n+\tsize_t len;\n+\n+\tbvecq_pos_set(&subreq->dispatch_pos, &stream->dispatch_cursor);\n+\tbvecq_pos_set(&subreq->content, &stream->dispatch_cursor);\n+\tlen = bvecq_slice(&stream->dispatch_cursor, subreq->len, max_segs,\n+\t\t\t  &subreq->nr_segs);\n+\n+\tif (len < subreq->len) {\n+\t\tsubreq->len = len;\n+\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_limited);\n+\t}\n+\n+\tstream->buffered   -= subreq->len;\n+\tstream->issue_from += subreq->len;\n+\treturn 0;\n+}\n+\n /*\n  * Perform a read to a buffer from the server, slicing up the region to be read\n  * according to the network rsize.\n@@ -23,11 +45,9 @@\n static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)\n {\n \tstruct netfs_io_stream *stream = &rreq->io_streams[0];\n-\tunsigned long long start = rreq->start;\n-\tssize_t size = rreq->len;\n \tint ret = 0;\n \n-\tbvecq_pos_set(&rreq->dispatch_cursor, &rreq->load_cursor);\n+\tbvecq_pos_transfer(&stream->dispatch_cursor, &rreq->load_cursor);\n \n \tdo {\n \t\tstruct netfs_io_subrequest *subreq;\n@@ -39,66 +59,36 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)\n \t\t}\n \n \t\tsubreq->source\t= NETFS_DOWNLOAD_FROM_SERVER;\n-\t\tsubreq->start\t= start;\n-\t\tsubreq->len\t= size;\n-\n-\t\t__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);\n-\n-\t\tspin_lock(&rreq->lock);\n-\t\tlist_add_tail(&subreq->rreq_link, 
&stream->subrequests);\n-\t\tif (list_is_first(&subreq->rreq_link, &stream->subrequests)) {\n-\t\t\tif (!stream->active) {\n-\t\t\t\tstream->collected_to = subreq->start;\n-\t\t\t\t/* Store list pointers before active flag */\n-\t\t\t\tsmp_store_release(&stream->active, true);\n-\t\t\t}\n-\t\t}\n-\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_added);\n-\t\tspin_unlock(&rreq->lock);\n+\t\tsubreq->start\t= stream->issue_from;\n+\t\tsubreq->len\t= stream->buffered;\n \n \t\tnetfs_stat(&netfs_n_rh_download);\n-\t\tif (rreq->netfs_ops->prepare_read) {\n-\t\t\tret = rreq->netfs_ops->prepare_read(subreq);\n-\t\t\tif (ret < 0) {\n-\t\t\t\tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);\n-\t\t\t\tbreak;\n-\t\t\t}\n-\t\t}\n \n-\t\tbvecq_pos_set(&subreq->dispatch_pos, &rreq->dispatch_cursor);\n-\t\tbvecq_pos_set(&subreq->content, &rreq->dispatch_cursor);\n-\t\tsubreq->len = bvecq_slice(&rreq->dispatch_cursor,\n-\t\t\t\t\t  umin(size, stream->sreq_max_len),\n-\t\t\t\t\t  stream->sreq_max_segs,\n-\t\t\t\t\t  &subreq->nr_segs);\n-\n-\t\tsize -= subreq->len;\n-\t\tstart += subreq->len;\n-\t\trreq->submitted += subreq->len;\n-\t\tif (size <= 0) {\n-\t\t\tsmp_wmb(); /* Write lists before ALL_QUEUED. */\n-\t\t\tset_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);\n+\t\tret = rreq->netfs_ops->issue_read(subreq);\n+\t\tif (ret != 0 && ret != -EIOCBQUEUED) {\n+\t\t\tsubreq->error = ret;\n+\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_cancel);\n+\t\t\t/* Not queued - release both refs. 
*/\n+\t\t\tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);\n+\t\t\tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);\n+\t\t\tbreak;\n \t\t}\n \n-\t\tiov_iter_bvec_queue(&subreq->io_iter, ITER_DEST, subreq->content.bvecq,\n-\t\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n-\n-\t\trreq->netfs_ops->issue_read(subreq);\n-\n+\t\tret = 0;\n \t\tif (test_bit(NETFS_RREQ_PAUSE, &rreq->flags))\n \t\t\tnetfs_wait_for_paused_read(rreq);\n \t\tif (test_bit(NETFS_RREQ_FAILED, &rreq->flags))\n \t\t\tbreak;\n \t\tcond_resched();\n-\t} while (size > 0);\n+\t} while (stream->buffered > 0);\n \n-\tif (unlikely(size > 0)) {\n+\tif (unlikely(stream->buffered > 0)) {\n \t\tsmp_wmb(); /* Write lists before ALL_QUEUED. */\n \t\tset_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);\n \t\tnetfs_wake_collector(rreq);\n \t}\n \n-\tbvecq_pos_unset(&rreq->dispatch_cursor);\n+\tbvecq_pos_unset(&stream->dispatch_cursor);\n \treturn ret;\n }\n \n@@ -154,6 +144,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)\n ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)\n {\n \tstruct netfs_io_request *rreq;\n+\tstruct netfs_io_stream *stream;\n \tssize_t ret;\n \tsize_t orig_count = iov_iter_count(iter);\n \tbool sync = is_sync_kiocb(iocb);\n@@ -178,6 +169,8 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i\n \tnetfs_stat(&netfs_n_rh_dio_read);\n \ttrace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read);\n \n+\tstream = &rreq->io_streams[0];\n+\n \t/* If this is an async op, we have to keep track of the destination\n \t * buffer for ourselves as the caller's iterator will be trashed when\n \t * we return.\n@@ -192,6 +185,10 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i\n \tif (ret < 0)\n \t\tgoto error_put;\n \n+\trreq->len = ret;\n+\tstream->buffered = ret;\n+\tstream->issue_from = rreq->start;\n+\n \t// TODO: Set 
up bounce buffer if needed\n \n \tif (!sync) {\ndiff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c\nindex a61c6d6fd17f..b04b16d35c38 100644\n--- a/fs/netfs/direct_write.c\n+++ b/fs/netfs/direct_write.c\n@@ -9,6 +9,32 @@\n #include <linux/uio.h>\n #include \"internal.h\"\n \n+/*\n+ * Prepare the buffer for an unbuffered/DIO write.\n+ */\n+int netfs_prepare_unbuffered_write_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t  unsigned int max_segs)\n+{\n+\tstruct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr];\n+\tsize_t len;\n+\n+\tbvecq_pos_set(&subreq->dispatch_pos, &stream->dispatch_cursor);\n+\tbvecq_pos_set(&subreq->content, &stream->dispatch_cursor);\n+\tlen = bvecq_slice(&stream->dispatch_cursor, subreq->len, max_segs,\n+\t\t\t  &subreq->nr_segs);\n+\n+\tif (len < subreq->len) {\n+\t\tsubreq->len = len;\n+\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_limited);\n+\t}\n+\n+\t// TODO: Wait here for completion of prev subreq\n+\n+\tstream->issue_from += subreq->len;\n+\tstream->buffered   -= subreq->len;\n+\treturn 0;\n+}\n+\n /*\n  * Perform the cleanup rituals after an unbuffered write is complete.\n  */\n@@ -74,9 +100,9 @@ static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq,\n \n \twreq->transferred += subreq->transferred;\n \tif (subreq->transferred < subreq->len) {\n-\t\tbvecq_pos_unset(&wreq->dispatch_cursor);\n-\t\tbvecq_pos_transfer(&wreq->dispatch_cursor, &subreq->dispatch_pos);\n-\t\tbvecq_pos_advance(&wreq->dispatch_cursor, subreq->transferred);\n+\t\tbvecq_pos_unset(&stream->dispatch_cursor);\n+\t\tbvecq_pos_transfer(&stream->dispatch_cursor, &subreq->dispatch_pos);\n+\t\tbvecq_pos_advance(&stream->dispatch_cursor, subreq->transferred);\n \t}\n \n \tstream->collected_to = subreq->start + subreq->transferred;\n@@ -85,6 +111,7 @@ static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq,\n \n \ttrace_netfs_collect_stream(wreq, stream);\n \ttrace_netfs_collect_state(wreq, 
wreq->collected_to, 0);\n+\t/* TODO: Progressively clean up wreq->direct_bq */\n }\n \n /*\n@@ -103,60 +130,60 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq)\n \n \t_enter(\"%llx\", wreq->len);\n \n-\tbvecq_pos_set(&wreq->dispatch_cursor, &wreq->load_cursor);\n-\tbvecq_pos_set(&wreq->collect_cursor, &wreq->dispatch_cursor);\n+\tstream->issue_from = wreq->start;\n+\tstream->buffered = wreq->len;\n+\tbvecq_pos_set(&stream->dispatch_cursor, &wreq->load_cursor);\n+\tbvecq_pos_set(&wreq->collect_cursor, &stream->dispatch_cursor);\n \n \tif (wreq->origin == NETFS_DIO_WRITE)\n \t\tinode_dio_begin(wreq->inode);\n \n-\tstream->collected_to = wreq->start;\n-\n \tfor (;;) {\n \t\tbool retry = false;\n \n \t\tif (!subreq) {\n-\t\t\tnetfs_prepare_write(wreq, stream, wreq->start + wreq->transferred);\n-\t\t\tsubreq = stream->construct;\n-\t\t\tstream->construct = NULL;\n-\t\t} else {\n-\t\t\tbvecq_pos_set(&subreq->dispatch_pos, &wreq->dispatch_cursor);\n+\t\t\tsubreq = netfs_alloc_write_subreq(wreq, stream);\n+\t\t\tif (!subreq)\n+\t\t\t\treturn -ENOMEM;\n \t\t}\n \n-\t\t/* Check if (re-)preparation failed. */\n-\t\tif (unlikely(test_bit(NETFS_SREQ_FAILED, &subreq->flags))) {\n-\t\t\tnetfs_write_subrequest_terminated(subreq, subreq->error);\n-\t\t\twreq->error = subreq->error;\n+\t\tret = stream->issue_write(subreq);\n+\t\tswitch (ret) {\n+\t\tcase 0:\n+\t\t\t/* Already completed synchronously. */\n \t\t\tbreak;\n-\t\t}\n-\n-\t\tsubreq->len = bvecq_slice(&wreq->dispatch_cursor, stream->sreq_max_len,\n-\t\t\t\t\t  stream->sreq_max_segs, &subreq->nr_segs);\n-\t\tbvecq_pos_set(&subreq->content, &subreq->dispatch_pos);\n-\n-\t\tiov_iter_bvec_queue(&subreq->io_iter, ITER_SOURCE,\n-\t\t\t\t    subreq->content.bvecq, subreq->content.slot,\n-\t\t\t\t    subreq->content.offset,\n-\t\t\t\t    subreq->len);\n-\n-\t\tif (!iov_iter_count(&subreq->io_iter))\n+\t\tcase -EIOCBQUEUED:\n+\t\t\t/* Async, need to wait. 
*/\n+\t\t\tret = netfs_wait_for_in_progress_subreq(wreq, subreq);\n+\t\t\tif (ret < 0) {\n+\t\t\t\tif (ret == -EAGAIN) {\n+\t\t\t\t\tretry = true;\n+\t\t\t\t\tbreak;\n+\t\t\t\t}\n+\n+\t\t\t\tlist_del_init(&subreq->rreq_link);\n+\t\t\t\tret = subreq->error;\n+\t\t\t\tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_failed);\n+\t\t\t\tsubreq = NULL;\n+\t\t\t\tgoto failed;\n+\t\t\t}\n \t\t\tbreak;\n-\n-\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_submit);\n-\t\tstream->issue_write(subreq);\n-\n-\t\t/* Async, need to wait. */\n-\t\tnetfs_wait_for_in_progress_stream(wreq, stream);\n-\n-\t\tif (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {\n+\t\tcase -EAGAIN:\n+\t\t\t/* Need to retry. */\n+\t\t\t__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);\n \t\t\tretry = true;\n-\t\t} else if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {\n-\t\t\tret = subreq->error;\n+\t\t\tbreak;\n+\t\tdefault:\n+\t\t\t/* Probably failed before dispatch. */\n+\t\t\tsubreq->error = ret;\n \t\t\twreq->error = ret;\n-\t\t\tnetfs_see_subrequest(subreq, netfs_sreq_trace_see_failed);\n+\t\t\t__set_bit(NETFS_SREQ_FAILED, &subreq->flags);\n+\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_cancel);\n+\t\t\tlist_del_init(&subreq->rreq_link);\n+\t\t\tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);\n \t\t\tsubreq = NULL;\n-\t\t\tbreak;\n+\t\t\tgoto failed;\n \t\t}\n-\t\tret = 0;\n \n \t\tif (!retry) {\n \t\t\tnetfs_unbuffered_write_collect(wreq, stream, subreq);\n@@ -171,20 +198,21 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq)\n \t\t\tcontinue;\n \t\t}\n \n-\t\t/* We need to retry the last subrequest, so first reset the\n-\t\t * iterator, taking into account what, if anything, we managed\n-\t\t * to transfer.\n+\t\t/* We need to retry the last subrequest, so first wind back the\n+\t\t * buffer position.\n \t\t */\n \t\tsubreq->error = -EAGAIN;\n \t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_retry);\n \n 
\t\tbvecq_pos_unset(&subreq->content);\n-\t\tbvecq_pos_unset(&wreq->dispatch_cursor);\n-\t\tbvecq_pos_transfer(&wreq->dispatch_cursor, &subreq->dispatch_pos);\n+\t\tbvecq_pos_unset(&stream->dispatch_cursor);\n+\t\tbvecq_pos_transfer(&stream->dispatch_cursor, &subreq->dispatch_pos);\n \n \t\tif (subreq->transferred > 0) {\n-\t\t\twreq->transferred += subreq->transferred;\n-\t\t\tbvecq_pos_advance(&wreq->dispatch_cursor, subreq->transferred);\n+\t\t\twreq->transferred  += subreq->transferred;\n+\t\t\tstream->issue_from -= subreq->len - subreq->transferred;\n+\t\t\tstream->buffered   += subreq->len - subreq->transferred;\n+\t\t\tbvecq_pos_advance(&stream->dispatch_cursor, subreq->transferred);\n \t\t}\n \n \t\tif (stream->source == NETFS_UPLOAD_TO_SERVER &&\n@@ -192,25 +220,21 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq)\n \t\t\twreq->netfs_ops->retry_request(wreq, stream);\n \n \t\t__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);\n-\t\t__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);\n \t\t__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);\n-\t\tsubreq->start\t\t= wreq->start + wreq->transferred;\n-\t\tsubreq->len\t\t= wreq->len   - wreq->transferred;\n+\t\t__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);\n+\t\tsubreq->start\t\t= stream->issue_from;\n+\t\tsubreq->len\t\t= stream->buffered;\n \t\tsubreq->transferred\t= 0;\n \t\tsubreq->retry_count\t+= 1;\n-\t\tstream->sreq_max_len\t= UINT_MAX;\n-\t\tstream->sreq_max_segs\t= INT_MAX;\n \n \t\tnetfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);\n \n-\t\tif (stream->prepare_write)\n-\t\t\tstream->prepare_write(subreq);\n \t\t__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);\n \t\tnetfs_stat(&netfs_n_wh_retry_write_subreq);\n \t}\n \n-\tbvecq_pos_unset(&wreq->dispatch_cursor);\n-\tbvecq_pos_unset(&wreq->load_cursor);\n+failed:\n+\tbvecq_pos_unset(&stream->dispatch_cursor);\n \tnetfs_unbuffered_write_done(wreq);\n \t_leave(\" = %d\", ret);\n \treturn ret;\n@@ -254,6 +278,7 @@ ssize_t 
netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *\n \tif (IS_ERR(wreq))\n \t\treturn PTR_ERR(wreq);\n \n+\twreq->len = iov_iter_count(iter);\n \twreq->io_streams[0].avail = true;\n \ttrace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?\n \t\t\t\t netfs_write_trace_dio_write :\n@@ -264,9 +289,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *\n \t\t * we have to save the source buffer as the iterator is only\n \t\t * good until we return.  In such a case, extract an iterator\n \t\t * to represent as much of the the output buffer as we can\n-\t\t * manage.  Note that the extraction might not be able to\n-\t\t * allocate a sufficiently large bvec array and may shorten the\n-\t\t * request.\n+\t\t * manage.  Note that the extraction may shorten the request.\n \t\t */\n \t\tssize_t n = netfs_extract_iter(iter, len, INT_MAX, iocb->ki_pos,\n \t\t\t\t\t       &wreq->load_cursor.bvecq, 0);\n@@ -281,8 +304,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *\n \t\t       wreq->load_cursor.bvecq->max_slots);\n \t}\n \n-\t__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);\n-\n \t/* Copy the data into the bounce buffer and encrypt it. */\n \t// TODO\n \ndiff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c\nindex 37f05b4d3469..70b10ac23a27 100644\n--- a/fs/netfs/fscache_io.c\n+++ b/fs/netfs/fscache_io.c\n@@ -239,10 +239,6 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,\n \t\t\t\t    fscache_access_io_write) < 0)\n \t\tgoto abandon_free;\n \n-\tret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false);\n-\tif (ret < 0)\n-\t\tgoto abandon_end;\n-\n \t/* TODO: Consider clearing page bits now for space the write isn't\n \t * covering.  
This is more complicated than it appears when THPs are\n \t * taken into account.\n@@ -252,8 +248,6 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,\n \tfscache_write(cres, start, &iter, fscache_wreq_done, wreq);\n \treturn;\n \n-abandon_end:\n-\treturn fscache_wreq_done(wreq, ret);\n abandon_free:\n \tkfree(wreq);\n abandon:\ndiff --git a/fs/netfs/internal.h b/fs/netfs/internal.h\nindex ddae82f94ce0..ecf7cd5b5ca1 100644\n--- a/fs/netfs/internal.h\n+++ b/fs/netfs/internal.h\n@@ -34,6 +34,18 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,\n void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,\n \t\t\t loff_t pos, size_t copied);\n \n+/*\n+ * direct_read.c\n+ */\n+int netfs_prepare_unbuffered_read_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t unsigned int max_segs);\n+\n+/*\n+ * direct_write.c\n+ */\n+int netfs_prepare_unbuffered_write_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t  unsigned int max_segs);\n+\n /*\n  * main.c\n  */\n@@ -70,6 +82,8 @@ struct bvecq *netfs_buffer_make_space(struct netfs_io_request *rreq,\n \t\t\t\t      enum netfs_bvecq_trace trace);\n void netfs_wake_collector(struct netfs_io_request *rreq);\n void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq);\n+int netfs_wait_for_in_progress_subreq(struct netfs_io_request *rreq,\n+\t\t\t\t      struct netfs_io_subrequest *subreq);\n void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq,\n \t\t\t\t       struct netfs_io_stream *stream);\n ssize_t netfs_wait_for_read(struct netfs_io_request *rreq);\n@@ -113,16 +127,53 @@ void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error);\n /*\n  * read_pgpriv2.c\n  */\n+#ifdef CONFIG_NETFS_PGPRIV2\n+int netfs_prepare_pgpriv2_write_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t       unsigned int max_segs);\n void netfs_pgpriv2_copy_to_cache(struct netfs_io_request *rreq, struct folio *folio);\n void 
netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq);\n bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq);\n+static inline bool netfs_using_pgpriv2(const struct netfs_io_request *rreq)\n+{\n+\treturn test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);\n+}\n+#else\n+static inline int netfs_prepare_pgpriv2_write_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t\t     unsigned int max_segs)\n+{\n+\treturn -EIO;\n+}\n+static inline void netfs_pgpriv2_copy_to_cache(struct netfs_io_request *rreq, struct folio *folio)\n+{\n+}\n+static inline void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq)\n+{\n+}\n+static inline bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq)\n+{\n+\treturn true;\n+}\n+static inline bool netfs_using_pgpriv2(const struct netfs_io_request *rreq)\n+{\n+\treturn false;\n+}\n+#endif\n \n /*\n  * read_retry.c\n  */\n+int netfs_prepare_buffered_read_retry_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t     unsigned int max_segs);\n+int netfs_reset_for_read_retry(struct netfs_io_subrequest *subreq);\n void netfs_retry_reads(struct netfs_io_request *rreq);\n void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq);\n \n+/*\n+ * read_single.c\n+ */\n+int netfs_prepare_read_single_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t     unsigned int max_segs);\n+\n /*\n  * stats.c\n  */\n@@ -194,30 +245,25 @@ void netfs_write_collection_worker(struct work_struct *work);\n /*\n  * write_issue.c\n  */\n+struct netfs_writethrough;\n struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,\n \t\t\t\t\t\tstruct file *file,\n \t\t\t\t\t\tloff_t start,\n \t\t\t\t\t\tenum netfs_io_origin origin);\n-void netfs_prepare_write(struct netfs_io_request *wreq,\n-\t\t\t struct netfs_io_stream *stream,\n-\t\t\t loff_t start);\n-void netfs_reissue_write(struct netfs_io_stream *stream,\n-\t\t\t struct netfs_io_subrequest *subreq);\n-void netfs_issue_write(struct 
netfs_io_request *wreq,\n-\t\t       struct netfs_io_stream *stream);\n-size_t netfs_advance_write(struct netfs_io_request *wreq,\n-\t\t\t   struct netfs_io_stream *stream,\n-\t\t\t   loff_t start, size_t len, bool to_eof);\n-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);\n-int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,\n-\t\t\t       struct folio *folio, size_t copied, bool to_page_end,\n-\t\t\t       struct folio **writethrough_cache);\n-ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,\n-\t\t\t       struct folio *writethrough_cache);\n+struct netfs_io_subrequest *netfs_alloc_write_subreq(struct netfs_io_request *wreq,\n+\t\t\t\t\t\t     struct netfs_io_stream *stream);\n+struct netfs_writethrough *netfs_begin_writethrough(struct kiocb *iocb, size_t len);\n+int netfs_advance_writethrough(struct netfs_writethrough *wthru,\n+\t\t\t       struct writeback_control *wbc,\n+\t\t\t       struct folio *folio, size_t copied, bool to_page_end);\n+ssize_t netfs_end_writethrough(struct netfs_writethrough *wthru,\n+\t\t\t       struct writeback_control *wbc);\n \n /*\n  * write_retry.c\n  */\n+int netfs_prepare_write_retry_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t     unsigned int max_segs);\n void netfs_retry_writes(struct netfs_io_request *wreq);\n \n /*\ndiff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c\nindex 7969c0b1f9a9..69164e8b8e57 100644\n--- a/fs/netfs/iterator.c\n+++ b/fs/netfs/iterator.c\n@@ -102,14 +102,14 @@ ssize_t netfs_extract_iter(struct iov_iter *orig, size_t orig_len, size_t max_se\n \t\t\t}\n \n \t\t\tif (got == 0) {\n-\t\t\t\tpr_err(\"extract_pages gave nothing from %zu, %zu\\n\",\n+\t\t\t\tpr_err(\"extract_pages gave nothing from %zx, %zx\\n\",\n \t\t\t\t       extracted, orig_len);\n \t\t\t\tret = -EIO;\n \t\t\t\tgoto out;\n \t\t\t}\n \n-\t\t\tif (got > orig_len - extracted) {\n-\t\t\t\tpr_err(\"extract_pages 
rc=%zd more than %zu\\n\",\n+\t\t\tif (got > orig_len) {\n+\t\t\t\tpr_err(\"extract_pages rc=%zx more than %zx\\n\",\n \t\t\t\t       got, orig_len);\n \t\t\t\tgoto out;\n \t\t\t}\ndiff --git a/fs/netfs/misc.c b/fs/netfs/misc.c\nindex a19724389147..796dc227c2b2 100644\n--- a/fs/netfs/misc.c\n+++ b/fs/netfs/misc.c\n@@ -232,6 +232,37 @@ void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq)\n \t\tnetfs_wake_collector(rreq);\n }\n \n+/*\n+ * Wait for a subrequest to come to completion.\n+ */\n+int netfs_wait_for_in_progress_subreq(struct netfs_io_request *rreq,\n+\t\t\t\t      struct netfs_io_subrequest *subreq)\n+{\n+\tif (netfs_check_subreq_in_progress(subreq)) {\n+\t\tDEFINE_WAIT(myself);\n+\n+\t\ttrace_netfs_rreq(rreq, netfs_rreq_trace_wait_quiesce);\n+\t\tfor (;;) {\n+\t\t\tprepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);\n+\n+\t\t\tif (!netfs_check_subreq_in_progress(subreq))\n+\t\t\t\tbreak;\n+\n+\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_wait_for);\n+\t\t\tschedule();\n+\t\t}\n+\n+\t\ttrace_netfs_rreq(rreq, netfs_rreq_trace_waited_quiesce);\n+\t\tfinish_wait(&rreq->waitq, &myself);\n+\t}\n+\n+\tif (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))\n+\t\treturn -EAGAIN;\n+\tif (test_bit(NETFS_SREQ_FAILED, &subreq->flags))\n+\t\treturn subreq->error;\n+\treturn 0;\n+}\n+\n /*\n  * Wait for all outstanding I/O in a stream to quiesce.\n  */\n@@ -361,7 +392,7 @@ static ssize_t netfs_wait_for_in_progress(struct netfs_io_request *rreq,\n \t\tcase NETFS_UNBUFFERED_WRITE:\n \t\t\tbreak;\n \t\tdefault:\n-\t\t\tif (rreq->submitted < rreq->len) {\n+\t\t\tif (rreq->transferred < rreq->len) {\n \t\t\t\ttrace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);\n \t\t\t\tret = -EIO;\n \t\t\t}\ndiff --git a/fs/netfs/objects.c b/fs/netfs/objects.c\nindex eff431cd7d6a..3db79943762d 100644\n--- a/fs/netfs/objects.c\n+++ b/fs/netfs/objects.c\n@@ -46,8 +46,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,\n 
\trreq->i_size\t= i_size_read(inode);\n \trreq->debug_id\t= atomic_inc_return(&debug_ids);\n \trreq->wsize\t= INT_MAX;\n-\trreq->io_streams[0].sreq_max_len = ULONG_MAX;\n-\trreq->io_streams[0].sreq_max_segs = 0;\n \tspin_lock_init(&rreq->lock);\n \tINIT_LIST_HEAD(&rreq->io_streams[0].subrequests);\n \tINIT_LIST_HEAD(&rreq->io_streams[1].subrequests);\n@@ -134,8 +132,10 @@ static void netfs_deinit_request(struct netfs_io_request *rreq)\n \tif (rreq->cache_resources.ops)\n \t\trreq->cache_resources.ops->end_operation(&rreq->cache_resources);\n \tbvecq_pos_unset(&rreq->load_cursor);\n-\tbvecq_pos_unset(&rreq->dispatch_cursor);\n \tbvecq_pos_unset(&rreq->collect_cursor);\n+\tbvecq_pos_unset(&rreq->retry_cursor);\n+\tfor (int i = 0; i < NR_IO_STREAMS; i++)\n+\t\tbvecq_pos_unset(&rreq->io_streams[i].dispatch_cursor);\n \n \tif (atomic_dec_and_test(&ictx->io_count))\n \t\twake_up_var(&ictx->io_count);\n@@ -226,6 +226,7 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq)\n \tstruct netfs_io_request *rreq = subreq->rreq;\n \n \ttrace_netfs_sreq(subreq, netfs_sreq_trace_free);\n+\tWARN_ON_ONCE(!list_empty(&subreq->rreq_link));\n \tif (rreq->netfs_ops->free_subrequest)\n \t\trreq->netfs_ops->free_subrequest(subreq);\n \tbvecq_pos_unset(&subreq->dispatch_pos);\ndiff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c\nindex 6d49f9a6b1f0..fbb0425ecb89 100644\n--- a/fs/netfs/read_collect.c\n+++ b/fs/netfs/read_collect.c\n@@ -36,6 +36,7 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq)\n \n \tif (subreq->start + subreq->transferred >= subreq->rreq->i_size)\n \t\t__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);\n+\ttrace_netfs_rreq(subreq->rreq, netfs_rreq_trace_zero_unread);\n }\n \n /*\n@@ -58,7 +59,7 @@ static void netfs_unlock_read_folio(struct netfs_io_request *rreq,\n \tflush_dcache_folio(folio);\n \tfolio_mark_uptodate(folio);\n \n-\tif (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {\n+\tif (!netfs_using_pgpriv2(rreq)) {\n 
\t\tfinfo = netfs_folio_info(folio);\n \t\tif (finfo) {\n \t\t\ttrace_netfs_folio(folio, netfs_folio_trace_filled_gaps);\n@@ -264,8 +265,7 @@ static void netfs_collect_read_results(struct netfs_io_request *rreq)\n \t\t\t\ttransferred = front->len;\n \t\t\t\ttrace_netfs_rreq(rreq, netfs_rreq_trace_set_abandon);\n \t\t\t}\n-\t\t\tif (front->start + transferred >= rreq->cleaned_to + fsize ||\n-\t\t\t    test_bit(NETFS_SREQ_HIT_EOF, &front->flags))\n+\t\t\tif (front->start + transferred >= rreq->cleaned_to + fsize)\n \t\t\t\tnetfs_read_unlock_folios(rreq, &notes);\n \t\t} else {\n \t\t\tstream->collected_to = front->start + transferred;\n@@ -381,31 +381,6 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)\n \t\tinode_dio_end(rreq->inode);\n }\n \n-/*\n- * Do processing after reading a monolithic single object.\n- */\n-static void netfs_rreq_assess_single(struct netfs_io_request *rreq)\n-{\n-\tstruct netfs_io_stream *stream = &rreq->io_streams[0];\n-\n-\tif (!rreq->error && stream->source == NETFS_DOWNLOAD_FROM_SERVER &&\n-\t    fscache_resources_valid(&rreq->cache_resources)) {\n-\t\ttrace_netfs_rreq(rreq, netfs_rreq_trace_dirty);\n-\t\tnetfs_single_mark_inode_dirty(rreq->inode);\n-\t}\n-\n-\tif (rreq->iocb) {\n-\t\trreq->iocb->ki_pos += rreq->transferred;\n-\t\tif (rreq->iocb->ki_complete) {\n-\t\t\ttrace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete);\n-\t\t\trreq->iocb->ki_complete(\n-\t\t\t\trreq->iocb, rreq->error ? 
rreq->error : rreq->transferred);\n-\t\t}\n-\t}\n-\tif (rreq->netfs_ops->done)\n-\t\trreq->netfs_ops->done(rreq);\n-}\n-\n /*\n  * Perform the collection of subrequests and folios.\n  *\n@@ -441,7 +416,7 @@ bool netfs_read_collection(struct netfs_io_request *rreq)\n \t\tnetfs_rreq_assess_dio(rreq);\n \t\tbreak;\n \tcase NETFS_READ_SINGLE:\n-\t\tnetfs_rreq_assess_single(rreq);\n+\t\tWARN_ON_ONCE(1);\n \t\tbreak;\n \tdefault:\n \t\tbreak;\ndiff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c\nindex fb783318318e..5f4d1a21afc5 100644\n--- a/fs/netfs/read_pgpriv2.c\n+++ b/fs/netfs/read_pgpriv2.c\n@@ -13,8 +13,39 @@\n #include <linux/task_io_accounting_ops.h>\n #include \"internal.h\"\n \n+int netfs_prepare_pgpriv2_write_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t       unsigned int max_segs)\n+{\n+\tstruct netfs_io_request *creq = subreq->rreq;\n+\tstruct netfs_io_stream *stream = &creq->io_streams[1];\n+\tsize_t len;\n+\n+\tbvecq_pos_set(&subreq->dispatch_pos, &stream->dispatch_cursor);\n+\tbvecq_pos_set(&subreq->content, &stream->dispatch_cursor);\n+\tlen = bvecq_slice(&stream->dispatch_cursor, subreq->len, max_segs,\n+\t\t\t  &subreq->nr_segs);\n+\n+\tif (len < subreq->len) {\n+\t\tsubreq->len = len;\n+\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_limited);\n+\t}\n+\n+\t// TODO: Wait here for completion of prev subreq\n+\n+\tstream->issue_from += subreq->len;\n+\tstream->buffered   -= subreq->len;\n+\tif (stream->buffered == 0) {\n+\t\tsmp_wmb(); /* Write lists before ALL_QUEUED. */\n+\t\tset_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags);\n+\t}\n+\treturn 0;\n+}\n+\n /*\n- * [DEPRECATED] Copy a folio to the cache with PG_private_2 set.\n+ * [DEPRECATED] Copy a folio to the cache with PG_private_2 set.  
Note that the\n+ * folio won't necessarily be contiguous with the previous one as there might\n+ * be a mixture of folios read from the cache and downloaded from the server\n+ * (or just zeroed).\n  */\n static void netfs_pgpriv2_copy_folio(struct netfs_io_request *creq, struct folio *folio)\n {\n@@ -24,7 +55,6 @@ static void netfs_pgpriv2_copy_folio(struct netfs_io_request *creq, struct folio\n \tsize_t dio_size = PAGE_SIZE;\n \tsize_t fsize = folio_size(folio), flen = fsize;\n \tloff_t fpos = folio_pos(folio), i_size;\n-\tbool to_eof = false;\n \n \t_enter(\"\");\n \n@@ -44,12 +74,8 @@ static void netfs_pgpriv2_copy_folio(struct netfs_io_request *creq, struct folio\n \tif (fpos + fsize > creq->i_size)\n \t\tcreq->i_size = i_size;\n \n-\tif (flen > i_size - fpos) {\n+\tif (flen > i_size - fpos)\n \t\tflen = i_size - fpos;\n-\t\tto_eof = true;\n-\t} else if (flen == i_size - fpos) {\n-\t\tto_eof = true;\n-\t}\n \n \tflen = round_up(flen, dio_size);\n \n@@ -57,7 +83,6 @@ static void netfs_pgpriv2_copy_folio(struct netfs_io_request *creq, struct folio\n \n \ttrace_netfs_folio(folio, netfs_folio_trace_store_copy);\n \n-\n \t/* Institute a new bvec queue segment if the current one is full or if\n \t * we encounter a discontiguity.  The discontiguity break is important\n \t * when it comes to bulk unlocking folios by file range.\n@@ -79,40 +104,13 @@ static void netfs_pgpriv2_copy_folio(struct netfs_io_request *creq, struct folio\n \t/* Attach the folio to the rolling buffer. */\n \tslot = queue->nr_slots;\n \tbvec_set_folio(&queue->bv[slot], folio, fsize, 0);\n-\t/* Order incrementing the slot counter after the slot is filled. 
*/\n-\tsmp_store_release(&queue->nr_slots, slot + 1);\n+\tqueue->nr_slots = slot + 1;\n \tcreq->load_cursor.slot = slot + 1;\n \tcreq->load_cursor.offset = 0;\n \ttrace_netfs_bv_slot(queue, slot);\n+\ttrace_netfs_wback(creq, folio, 0);\n \n-\tcache->submit_off = 0;\n-\tcache->submit_len = flen;\n-\n-\t/* Attach the folio to one or more subrequests.  For a big folio, we\n-\t * could end up with thousands of subrequests if the wsize is small -\n-\t * but we might need to wait during the creation of subrequests for\n-\t * network resources (eg. SMB credits).\n-\t */\n-\tdo {\n-\t\tssize_t part;\n-\n-\t\tcreq->dispatch_cursor.offset = cache->submit_off;\n-\n-\t\tatomic64_set(&creq->issued_to, fpos + cache->submit_off);\n-\t\tpart = netfs_advance_write(creq, cache, fpos + cache->submit_off,\n-\t\t\t\t\t   cache->submit_len, to_eof);\n-\t\tcache->submit_off += part;\n-\t\tif (part > cache->submit_len)\n-\t\t\tcache->submit_len = 0;\n-\t\telse\n-\t\t\tcache->submit_len -= part;\n-\t} while (cache->submit_len > 0);\n-\n-\tbvecq_pos_step(&creq->dispatch_cursor);\n-\tatomic64_set(&creq->issued_to, fpos + fsize);\n-\n-\tif (flen < fsize)\n-\t\tnetfs_issue_write(creq, cache);\n+\tcache->buffered += flen;\n }\n \n /*\n@@ -122,6 +120,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache(\n \tstruct netfs_io_request *rreq, struct folio *folio)\n {\n \tstruct netfs_io_request *creq;\n+\tstruct netfs_io_stream *cache;\n \n \tif (!fscache_resources_valid(&rreq->cache_resources))\n \t\tgoto cancel;\n@@ -131,12 +130,15 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache(\n \tif (IS_ERR(creq))\n \t\tgoto cancel;\n \n-\tif (!creq->io_streams[1].avail)\n+\tcache = &creq->io_streams[1];\n+\tif (!cache->avail)\n+\t\tgoto cancel_put;\n+\n+\tif (bvecq_buffer_init(&creq->load_cursor, GFP_KERNEL) < 0)\n \t\tgoto cancel_put;\n \n-\tbvecq_buffer_init(&creq->load_cursor, GFP_KERNEL);\n-\tbvecq_pos_set(&creq->dispatch_cursor, 
&creq->load_cursor);\n-\tbvecq_pos_set(&creq->collect_cursor, &creq->dispatch_cursor);\n+\tbvecq_pos_set(&cache->dispatch_cursor, &creq->load_cursor);\n+\tbvecq_pos_set(&creq->collect_cursor, &creq->load_cursor);\n \n \t__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &creq->flags);\n \ttrace_netfs_copy2cache(rreq, creq);\n@@ -171,19 +173,43 @@ void netfs_pgpriv2_copy_to_cache(struct netfs_io_request *rreq, struct folio *fo\n \tnetfs_pgpriv2_copy_folio(creq, folio);\n }\n \n+/*\n+ * Issue all pending writes on the cache stream.\n+ */\n+static int netfs_pgpriv2_issue_stream(struct netfs_io_request *wreq,\n+\t\t\t\t      struct netfs_io_stream *stream)\n+{\n+\tint ret;\n+\n+\tatomic64_set_release(&stream->issued_to, wreq->start);\n+\n+\tdo {\n+\t\tstruct netfs_io_subrequest *subreq;\n+\n+\t\tsubreq = netfs_alloc_write_subreq(wreq, stream);\n+\t\tif (!subreq)\n+\t\t\treturn -ENOMEM;\n+\n+\t\tret = stream->issue_write(subreq);\n+\t\tif (ret < 0 && ret != -EIOCBQUEUED)\n+\t\t\tbreak;\n+\t} while (stream->buffered > 0);\n+\n+\treturn ret;\n+}\n+\n /*\n  * [DEPRECATED] End writing to the cache, flushing out any outstanding writes.\n  */\n void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq)\n {\n \tstruct netfs_io_request *creq = rreq->copy_to_cache;\n+\tstruct netfs_io_stream *stream = &creq->io_streams[1];\n \n \tif (IS_ERR_OR_NULL(creq))\n \t\treturn;\n \n-\tnetfs_issue_write(creq, &creq->io_streams[1]);\n-\tsmp_wmb(); /* Write lists before ALL_QUEUED. 
*/\n-\tset_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags);\n+\tnetfs_pgpriv2_issue_stream(creq, stream);\n \ttrace_netfs_rreq(rreq, netfs_rreq_trace_end_copy_to_cache);\n \tif (list_empty_careful(&creq->io_streams[1].subrequests))\n \t\tnetfs_wake_collector(creq);\ndiff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c\nindex 6f2eb14aac72..b3bc924ffe8e 100644\n--- a/fs/netfs/read_retry.c\n+++ b/fs/netfs/read_retry.c\n@@ -9,19 +9,55 @@\n #include <linux/slab.h>\n #include \"internal.h\"\n \n-static void netfs_reissue_read(struct netfs_io_request *rreq,\n-\t\t\t       struct netfs_io_subrequest *subreq)\n+/*\n+ * Prepare the I/O buffer on a buffered read subrequest for the filesystem to\n+ * use as a bvec queue.\n+ */\n+int netfs_prepare_buffered_read_retry_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t     unsigned int max_segs)\n {\n+\tstruct netfs_io_request *rreq = subreq->rreq;\n+\tsize_t len;\n+\n+\tbvecq_pos_set(&subreq->dispatch_pos, &rreq->retry_cursor);\n \tbvecq_pos_set(&subreq->content, &subreq->dispatch_pos);\n-\tiov_iter_bvec_queue(&subreq->io_iter, ITER_DEST, subreq->content.bvecq,\n-\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n-\tiov_iter_advance(&subreq->io_iter, subreq->transferred);\n+\tlen = bvecq_slice(&rreq->retry_cursor, subreq->len, max_segs,\n+\t\t\t  &subreq->nr_segs);\n+\tif (len < subreq->len) {\n+\t\tsubreq->len = len;\n+\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_limited);\n+\t}\n+\trreq->retry_buffered -= subreq->len;\n+\trreq->retry_start    += subreq->len;\n+\treturn 0;\n+}\n \n-\tsubreq->error = 0;\n+/*\n+ * Reset the state of the subrequest and discard any buffering so that we can\n+ * retry (where this may include sending it to the server instead of the\n+ * cache).\n+ */\n+int netfs_reset_for_read_retry(struct netfs_io_subrequest *subreq)\n+{\n+\ttrace_netfs_sreq(subreq, netfs_sreq_trace_retry);\n+\n+\tif (subreq->retry_count > 3) {\n+\t\ttrace_netfs_sreq(subreq, 
netfs_sreq_trace_too_many_retries);\n+\t\treturn subreq->error;\n+\t}\n+\n+\tsubreq->retry_count++;\n \t__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);\n+\t__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);\n+\t__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);\n \t__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);\n-\tnetfs_stat(&netfs_n_rh_retry_read_subreq);\n-\tsubreq->rreq->netfs_ops->issue_read(subreq);\n+\tbvecq_pos_unset(&subreq->content);\n+\tbvecq_pos_unset(&subreq->dispatch_pos);\n+\tsubreq->error = 0;\n+\tsubreq->transferred = 0;\n+\tnetfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);\n+\tnetfs_stat(&netfs_n_wh_retry_write_subreq);\n+\treturn 0;\n }\n \n /*\n@@ -32,8 +68,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)\n {\n \tstruct netfs_io_subrequest *subreq;\n \tstruct netfs_io_stream *stream = &rreq->io_streams[0];\n-\tstruct bvecq_pos dispatch_cursor = {};\n \tstruct list_head *next;\n+\tint ret;\n \n \t_enter(\"R=%x\", rreq->debug_id);\n \n@@ -43,47 +79,19 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)\n \tif (rreq->netfs_ops->retry_request)\n \t\trreq->netfs_ops->retry_request(rreq, NULL);\n \n-\t/* If there's no renegotiation to do, just resend each retryable subreq\n-\t * up to the first permanently failed one.\n-\t */\n-\tif (!rreq->netfs_ops->prepare_read &&\n-\t    !rreq->cache_resources.ops) {\n-\t\tlist_for_each_entry(subreq, &stream->subrequests, rreq_link) {\n-\t\t\tif (test_bit(NETFS_SREQ_FAILED, &subreq->flags))\n-\t\t\t\tbreak;\n-\t\t\tif (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {\n-\t\t\t\t__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);\n-\t\t\t\tsubreq->retry_count++;\n-\t\t\t\tnetfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);\n-\t\t\t\tnetfs_reissue_read(rreq, subreq);\n-\t\t\t}\n-\t\t}\n-\t\treturn;\n-\t}\n-\n \t/* Okay, we need to renegotiate all the download requests and flip any\n \t * failed cache reads over to being 
download requests and negotiate\n-\t * those also.  All fully successful subreqs have been removed from the\n-\t * list and any spare data from those has been donated.\n-\t *\n-\t * What we do is decant the list and rebuild it one subreq at a time so\n-\t * that we don't end up with donations jumping over a gap we're busy\n-\t * populating with smaller subrequests.  In the event that the subreq\n-\t * we just launched finishes before we insert the next subreq, it'll\n-\t * fill in rreq->prev_donated instead.\n-\t *\n-\t * Note: Alternatively, we could split the tail subrequest right before\n-\t * we reissue it and fix up the donations under lock.\n+\t * those also.\n \t */\n \tnext = stream->subrequests.next;\n \n \tdo {\n \t\tstruct netfs_io_subrequest *from, *to, *tmp;\n-\t\tunsigned long long start, len;\n-\t\tsize_t part;\n-\t\tbool boundary = false, subreq_superfluous = false;\n+\t\tunsigned long long start;\n+\t\tsize_t len;\n+\t\tbool subreq_superfluous = false;\n \n-\t\tbvecq_pos_unset(&dispatch_cursor);\n+\t\tbvecq_pos_unset(&rreq->retry_cursor);\n \n \t\t/* Go through the subreqs and find the next span of contiguous\n \t\t * buffer that we then rejig (cifs, for example, needs the\n@@ -98,8 +106,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)\n \t\t       rreq->debug_id, from->debug_index,\n \t\t       from->start, from->transferred, from->len);\n \n-\t\tif (test_bit(NETFS_SREQ_FAILED, &from->flags) ||\n-\t\t    !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) {\n+\t\tif (!test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) {\n \t\t\tsubreq = from;\n \t\t\tgoto abandon;\n \t\t}\n@@ -107,68 +114,53 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)\n \t\tlist_for_each_continue(next, &stream->subrequests) {\n \t\t\tsubreq = list_entry(next, struct netfs_io_subrequest, rreq_link);\n \t\t\tif (subreq->start + subreq->transferred != start + len ||\n-\t\t\t    test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||\n 
\t\t\t    !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))\n \t\t\t\tbreak;\n \t\t\tto = subreq;\n \t\t\tlen += to->len;\n \t\t}\n \n-\t\t_debug(\" - range: %llx-%llx %llx\", start, start + len - 1, len);\n+\t\t_debug(\" - range: %llx-%llx %zx\", start, start + len - 1, len);\n \n \t\t/* Determine the set of buffers we're going to use.  Each\n-\t\t * subreq gets a subset of a single overall contiguous buffer.\n+\t\t * subreq takes a subset of a single overall contiguous buffer.\n \t\t */\n-\t\tbvecq_pos_transfer(&dispatch_cursor, &from->dispatch_pos);\n-\t\tbvecq_pos_advance(&dispatch_cursor, from->transferred);\n+\t\tbvecq_pos_transfer(&rreq->retry_cursor, &from->dispatch_pos);\n+\t\tbvecq_pos_advance(&rreq->retry_cursor, from->transferred);\n+\t\trreq->retry_start = start;\n+\t\trreq->retry_buffered = len;\n \n \t\t/* Work through the sublist. */\n \t\tsubreq = from;\n \t\tlist_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {\n-\t\t\tif (!len) {\n+\t\t\tif (rreq->retry_buffered == 0) {\n \t\t\t\tsubreq_superfluous = true;\n \t\t\t\tbreak;\n \t\t\t}\n \t\t\tsubreq->source\t= NETFS_DOWNLOAD_FROM_SERVER;\n-\t\t\tsubreq->start\t= start - subreq->transferred;\n-\t\t\tsubreq->len\t= len   + subreq->transferred;\n-\t\t\t__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);\n-\t\t\t__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);\n-\t\t\tsubreq->retry_count++;\n+\t\t\tsubreq->start\t= rreq->retry_start;\n+\t\t\tsubreq->len\t= rreq->retry_buffered;\n \n-\t\t\tbvecq_pos_unset(&subreq->dispatch_pos);\n-\t\t\tbvecq_pos_unset(&subreq->content);\n-\n-\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_retry);\n-\n-\t\t\t/* Renegotiate max_len (rsize) */\n-\t\t\tstream->sreq_max_len = subreq->len;\n-\t\t\tstream->sreq_max_segs = INT_MAX;\n-\t\t\tif (rreq->netfs_ops->prepare_read &&\n-\t\t\t    rreq->netfs_ops->prepare_read(subreq) < 0) {\n-\t\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed);\n+\t\t\tret = 
netfs_reset_for_read_retry(subreq);\n+\t\t\tif (ret < 0) {\n \t\t\t\t__set_bit(NETFS_SREQ_FAILED, &subreq->flags);\n+\t\t\t\trreq->error = ret;\n \t\t\t\tgoto abandon;\n \t\t\t}\n \n-\t\t\tbvecq_pos_set(&subreq->dispatch_pos, &dispatch_cursor);\n-\t\t\tpart = bvecq_slice(&dispatch_cursor,\n-\t\t\t\t\t   umin(len, stream->sreq_max_len),\n-\t\t\t\t\t   stream->sreq_max_segs,\n-\t\t\t\t\t   &subreq->nr_segs);\n-\t\t\tsubreq->len = subreq->transferred + part;\n-\n-\t\t\tlen -= part;\n-\t\t\tstart += part;\n-\t\t\tif (!len) {\n-\t\t\t\tif (boundary)\n-\t\t\t\t\t__set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);\n-\t\t\t} else {\n-\t\t\t\t__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);\n+\t\t\tnetfs_stat(&netfs_n_rh_download);\n+\t\t\tret = rreq->netfs_ops->issue_read(subreq);\n+\t\t\tif (ret < 0 && ret != -EIOCBQUEUED) {\n+\t\t\t\tif (ret == -ENOMEM)\n+\t\t\t\t\tgoto abandon;\n+\t\t\t\tsubreq->error = ret;\n+\t\t\t\tif (ret != -EAGAIN) {\n+\t\t\t\t\t__set_bit(NETFS_SREQ_FAILED, &subreq->flags);\n+\t\t\t\t\tgoto abandon_after;\n+\t\t\t\t}\n+\t\t\t\t__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);\n+\t\t\t\tnetfs_read_subreq_terminated(subreq);\n \t\t\t}\n-\n-\t\t\tnetfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);\n-\t\t\tnetfs_reissue_read(rreq, subreq);\n \t\t\tif (subreq == to) {\n \t\t\t\tsubreq_superfluous = false;\n \t\t\t\tbreak;\n@@ -178,7 +170,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)\n \t\t/* If we managed to use fewer subreqs, we can discard the\n \t\t * excess; if we used the same number, then we're done.\n \t\t */\n-\t\tif (!len) {\n+\t\tif (rreq->retry_buffered == 0) {\n \t\t\tif (!subreq_superfluous)\n \t\t\t\tcontinue;\n \t\t\tlist_for_each_entry_safe_from(subreq, tmp,\n@@ -194,7 +186,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)\n \t\t}\n \n \t\t/* We ran out of subrequests, so we need to allocate some more\n-\t\t * and insert them after.\n+\t\t * and insert them after.  
They must start with being marked\n+\t\t * for retry to switch to the retry cursor.\n \t\t */\n \t\tdo {\n \t\t\tsubreq = netfs_alloc_subrequest(rreq);\n@@ -203,8 +196,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)\n \t\t\t\tgoto abandon_after;\n \t\t\t}\n \t\t\tsubreq->source\t\t= NETFS_DOWNLOAD_FROM_SERVER;\n-\t\t\tsubreq->start\t\t= start;\n-\t\t\tsubreq->len\t\t= len;\n+\t\t\tsubreq->start\t\t= rreq->retry_start;\n+\t\t\tsubreq->len\t\t= rreq->retry_buffered;\n \t\t\tsubreq->stream_nr\t= stream->stream_nr;\n \t\t\tsubreq->retry_count\t= 1;\n \n@@ -216,37 +209,26 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)\n \t\t\tto = list_next_entry(to, rreq_link);\n \t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_retry);\n \n-\t\t\tstream->sreq_max_len\t= umin(len, rreq->rsize);\n-\t\t\tstream->sreq_max_segs\t= INT_MAX;\n-\n \t\t\tnetfs_stat(&netfs_n_rh_download);\n-\t\t\tif (rreq->netfs_ops->prepare_read(subreq) < 0) {\n-\t\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed);\n-\t\t\t\t__set_bit(NETFS_SREQ_FAILED, &subreq->flags);\n-\t\t\t\tgoto abandon;\n+\t\t\tret = rreq->netfs_ops->issue_read(subreq);\n+\t\t\tif (ret < 0 && ret != -EIOCBQUEUED) {\n+\t\t\t\tif (ret == -ENOMEM)\n+\t\t\t\t\tgoto abandon;\n+\t\t\t\tsubreq->error = ret;\n+\t\t\t\tif (ret != -EAGAIN) {\n+\t\t\t\t\t__set_bit(NETFS_SREQ_FAILED, &subreq->flags);\n+\t\t\t\t\tgoto abandon_after;\n+\t\t\t\t}\n+\t\t\t\t__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);\n+\t\t\t\tnetfs_read_subreq_terminated(subreq);\n \t\t\t}\n \n-\t\t\tbvecq_pos_set(&subreq->dispatch_pos, &dispatch_cursor);\n-\t\t\tpart = bvecq_slice(&dispatch_cursor,\n-\t\t\t\t\t   umin(len, stream->sreq_max_len),\n-\t\t\t\t\t   stream->sreq_max_segs,\n-\t\t\t\t\t   &subreq->nr_segs);\n-\t\t\tsubreq->len = subreq->transferred + part;\n-\n-\t\t\tlen -= part;\n-\t\t\tstart += part;\n-\t\t\tif (!len && boundary) {\n-\t\t\t\t__set_bit(NETFS_SREQ_BOUNDARY, 
&to->flags);\n-\t\t\t\tboundary = false;\n-\t\t\t}\n-\n-\t\t\tnetfs_reissue_read(rreq, subreq);\n-\t\t} while (len);\n+\t\t} while (rreq->retry_buffered > 0);\n \n \t} while (!list_is_head(next, &stream->subrequests));\n \n out:\n-\tbvecq_pos_unset(&dispatch_cursor);\n+\tbvecq_pos_unset(&rreq->retry_cursor);\n \treturn;\n \n \t/* If we hit an error, fail all remaining incomplete subrequests */\n@@ -295,8 +277,6 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq)\n \tstruct bvecq *p;\n \n \tfor (p = rreq->collect_cursor.bvecq; p; p = p->next) {\n-\t\tif (!p->free)\n-\t\t\tcontinue;\n \t\tfor (int slot = 0; slot < p->nr_slots; slot++) {\n \t\t\tif (!p->bv[slot].bv_page)\n \t\t\t\tcontinue;\n@@ -310,6 +290,7 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq)\n \t\t\t}\n \t\t\ttrace_netfs_folio(folio, netfs_folio_trace_abandon);\n \t\t\tfolio_unlock(folio);\n+\t\t\tp->bv[slot].bv_page = NULL;\n \t\t}\n \t}\n }\ndiff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c\nindex b386cae77ece..52b9e12a820a 100644\n--- a/fs/netfs/read_single.c\n+++ b/fs/netfs/read_single.c\n@@ -16,6 +16,19 @@\n #include <linux/netfs.h>\n #include \"internal.h\"\n \n+int netfs_prepare_read_single_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t     unsigned int max_segs)\n+{\n+\tstruct netfs_io_request *rreq = subreq->rreq;\n+\tstruct netfs_io_stream *stream = &rreq->io_streams[0];\n+\n+\tbvecq_pos_set(&subreq->dispatch_pos, &rreq->load_cursor);\n+\tbvecq_pos_set(&subreq->content, &subreq->dispatch_pos);\n+\n+\tstream->issue_from += subreq->len;\n+\treturn 0;\n+}\n+\n /**\n  * netfs_single_mark_inode_dirty - Mark a single, monolithic object inode dirty\n  * @inode: The inode to mark\n@@ -58,24 +71,12 @@ static int netfs_single_begin_cache_read(struct netfs_io_request *rreq, struct n\n \treturn fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));\n }\n \n-static void netfs_single_read_cache(struct netfs_io_request 
*rreq,\n-\t\t\t\t    struct netfs_io_subrequest *subreq)\n-{\n-\tstruct netfs_cache_resources *cres = &rreq->cache_resources;\n-\n-\t_enter(\"R=%08x[%x]\", rreq->debug_id, subreq->debug_index);\n-\tnetfs_stat(&netfs_n_rh_read);\n-\tcres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_FAIL,\n-\t\t\tnetfs_cache_read_terminated, subreq);\n-}\n-\n /*\n  * Perform a read to a buffer from the cache or the server.  Only a single\n  * subreq is permitted as the object must be fetched in a single transaction.\n  */\n static int netfs_single_dispatch_read(struct netfs_io_request *rreq)\n {\n-\tstruct netfs_io_stream *stream = &rreq->io_streams[0];\n \tstruct fscache_occupancy occ = {\n \t\t.query_from\t= 0,\n \t\t.query_to\t= rreq->len,\n@@ -85,76 +86,79 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq)\n \t\t.cached_to[1]\t= ULLONG_MAX,\n \t};\n \tstruct netfs_io_subrequest *subreq;\n-\tint ret = 0;\n+\tint ret;\n+\n+\tret = netfs_read_query_cache(rreq, &occ);\n+\tif (ret < 0)\n+\t\treturn ret;\n \n \tsubreq = netfs_alloc_subrequest(rreq);\n \tif (!subreq)\n \t\treturn -ENOMEM;\n \n-\tsubreq->source\t= NETFS_DOWNLOAD_FROM_SERVER;\n \tsubreq->start\t= 0;\n \tsubreq->len\t= rreq->len;\n \n-\tbvecq_pos_set(&subreq->dispatch_pos, &rreq->dispatch_cursor);\n-\tbvecq_pos_set(&subreq->content, &rreq->dispatch_cursor);\n-\n-\tiov_iter_bvec_queue(&subreq->io_iter, ITER_DEST, subreq->content.bvecq,\n-\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n+\ttrace_netfs_sreq(subreq, netfs_sreq_trace_prepare);\n \n \t/* Try to use the cache if the cache content matches the size of the\n \t * remote file.\n \t */\n-\tnetfs_read_query_cache(rreq, &occ);\n \tif (occ.cached_from[0] == 0 &&\n-\t    occ.cached_to[0] == rreq->len)\n-\t\tsubreq->source = NETFS_READ_FROM_CACHE;\n+\t    occ.cached_to[0] == rreq->len) {\n+\t\tstruct netfs_cache_resources *cres = &rreq->cache_resources;\n \n-\t__set_bit(NETFS_SREQ_IN_PROGRESS, 
&subreq->flags);\n+\t\tsubreq->source = NETFS_READ_FROM_CACHE;\n+\t\tnetfs_stat(&netfs_n_rh_read);\n+\t\tret = cres->ops->issue_read(subreq);\n+\t\tif (ret == -EIOCBQUEUED)\n+\t\t\tret = netfs_wait_for_in_progress_subreq(rreq, subreq);\n+\t\tif (ret == -ENOMEM)\n+\t\t\tgoto cancel;\n+\t\tif (ret == 0)\n+\t\t\tgoto success;\n+\n+\t\t/* Didn't manage to retrieve from the cache, so toss it to the\n+\t\t * server instead.\n+\t\t */\n+\t\tif (netfs_reset_for_read_retry(subreq) < 0)\n+\t\t\tgoto cancel;\n+\t}\n \n-\tspin_lock(&rreq->lock);\n-\tlist_add_tail(&subreq->rreq_link, &stream->subrequests);\n-\ttrace_netfs_sreq(subreq, netfs_sreq_trace_added);\n-\t/* Store list pointers before active flag */\n-\tsmp_store_release(&stream->active, true);\n-\tspin_unlock(&rreq->lock);\n+\t__set_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);\n \n-\tswitch (subreq->source) {\n-\tcase NETFS_DOWNLOAD_FROM_SERVER:\n+\t/* Try to send it to the cache. */\n+\tfor (;;) {\n+\t\tsubreq->source = NETFS_DOWNLOAD_FROM_SERVER;\n \t\tnetfs_stat(&netfs_n_rh_download);\n-\t\tif (rreq->netfs_ops->prepare_read) {\n-\t\t\tret = rreq->netfs_ops->prepare_read(subreq);\n-\t\t\tif (ret < 0)\n-\t\t\t\tgoto cancel;\n-\t\t}\n-\n-\t\trreq->netfs_ops->issue_read(subreq);\n-\t\trreq->submitted += subreq->len;\n-\t\tbreak;\n-\tcase NETFS_READ_FROM_CACHE:\n-\t\tif (rreq->cache_resources.ops->prepare_read) {\n-\t\t\tret = rreq->cache_resources.ops->prepare_read(subreq);\n-\t\t\tif (ret < 0)\n-\t\t\t\tgoto cancel;\n-\t\t}\n-\n-\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_submit);\n-\t\tnetfs_single_read_cache(rreq, subreq);\n-\t\trreq->submitted += subreq->len;\n-\t\tret = 0;\n-\t\tbreak;\n-\tdefault:\n-\t\tpr_warn(\"Unexpected single-read source %u\\n\", subreq->source);\n-\t\tWARN_ON_ONCE(true);\n-\t\tret = -EIO;\n-\t\tbreak;\n+\t\tret = rreq->netfs_ops->issue_read(subreq);\n+\t\tif (ret == -EIOCBQUEUED)\n+\t\t\tret = netfs_wait_for_in_progress_subreq(rreq, subreq);\n+\t\tif (ret == 0)\n+\t\t\tgoto 
success;\n+\t\tif (ret == -ENOMEM)\n+\t\t\tgoto cancel;\n+\t\tif (ret != -EAGAIN)\n+\t\t\tgoto failed;\n+\t\tif (netfs_reset_for_read_retry(subreq) < 0)\n+\t\t\tgoto cancel;\n \t}\n \n-\tsmp_wmb(); /* Write lists before ALL_QUEUED. */\n-\tset_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);\n-\treturn ret;\n+success:\n+\trreq->transferred = subreq->transferred;\n+\tlist_del_init(&subreq->rreq_link);\n+\tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_consumed);\n+\treturn 0;\n cancel:\n+\trreq->error = ret;\n+\tlist_del_init(&subreq->rreq_link);\n \tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);\n \treturn ret;\n+failed:\n+\trreq->error = ret;\n+\tlist_del_init(&subreq->rreq_link);\n+\tnetfs_put_subrequest(subreq, netfs_sreq_trace_put_failed);\n+\treturn ret;\n }\n \n /**\n@@ -185,7 +189,7 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite\n \tif (IS_ERR(rreq))\n \t\treturn PTR_ERR(rreq);\n \n-\tret = netfs_extract_iter(iter, rreq->len, INT_MAX, 0, &rreq->dispatch_cursor.bvecq, 0);\n+\tret = netfs_extract_iter(iter, rreq->len, INT_MAX, 0, &rreq->load_cursor.bvecq, 0);\n \tif (ret < 0)\n \t\tgoto cleanup_free;\n \n@@ -196,9 +200,29 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite\n \tnetfs_stat(&netfs_n_rh_read_single);\n \ttrace_netfs_read(rreq, 0, rreq->len, netfs_read_trace_read_single);\n \n-\tnetfs_single_dispatch_read(rreq);\n+\tret = netfs_single_dispatch_read(rreq);\n+\n+\ttrace_netfs_rreq(rreq, netfs_rreq_trace_complete);\n+\tif (ret == 0) {\n+\t\ttask_io_account_read(rreq->transferred);\n+\n+\t\tif (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags) &&\n+\t\t    fscache_resources_valid(&rreq->cache_resources)) {\n+\t\t\ttrace_netfs_rreq(rreq, netfs_rreq_trace_dirty);\n+\t\t\tnetfs_single_mark_inode_dirty(rreq->inode);\n+\t\t}\n+\t\tret = rreq->transferred;\n+\t}\n+\n+\tif (rreq->netfs_ops->done)\n+\t\trreq->netfs_ops->done(rreq);\n+\n+\tnetfs_wake_rreq_flag(rreq, 
NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);\n+\t/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */\n+\tnetfs_put_request(rreq, netfs_rreq_trace_put_work_ip);\n+\n+\ttrace_netfs_rreq(rreq, netfs_rreq_trace_done);\n \n-\tret = netfs_wait_for_read(rreq);\n \tnetfs_put_request(rreq, netfs_rreq_trace_put_return);\n \treturn ret;\n \ndiff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c\nindex fb8daf50c86d..bfca6d48361f 100644\n--- a/fs/netfs/write_collect.c\n+++ b/fs/netfs/write_collect.c\n@@ -28,8 +28,8 @@ static void netfs_dump_request(const struct netfs_io_request *rreq)\n \t       rreq->origin, rreq->error);\n \tpr_err(\"  st=%llx tsl=%zx/%llx/%llx\\n\",\n \t       rreq->start, rreq->transferred, rreq->submitted, rreq->len);\n-\tpr_err(\"  cci=%llx/%llx/%llx\\n\",\n-\t       rreq->cleaned_to, rreq->collected_to, atomic64_read(&rreq->issued_to));\n+\tpr_err(\"  cci=%llx/%llx\\n\",\n+\t       rreq->cleaned_to, rreq->collected_to);\n \tpr_err(\"  iw=%pSR\\n\", rreq->netfs_ops->issue_write);\n \tfor (int i = 0; i < NR_IO_STREAMS; i++) {\n \t\tconst struct netfs_io_subrequest *sreq;\n@@ -38,8 +38,9 @@ static void netfs_dump_request(const struct netfs_io_request *rreq)\n \t\tpr_err(\"  str[%x] s=%x e=%d acnf=%u,%u,%u,%u\\n\",\n \t\t       s->stream_nr, s->source, s->error,\n \t\t       s->avail, s->active, s->need_retry, s->failed);\n-\t\tpr_err(\"  str[%x] ct=%llx t=%zx\\n\",\n-\t\t       s->stream_nr, s->collected_to, s->transferred);\n+\t\tpr_err(\"  str[%x] it=%llx ct=%llx t=%zx\\n\",\n+\t\t       s->stream_nr, atomic64_read(&s->issued_to),\n+\t\t       s->collected_to, s->transferred);\n \t\tlist_for_each_entry(sreq, &s->subrequests, rreq_link) {\n \t\t\tpr_err(\"  sreq[%x:%x] sc=%u s=%llx t=%zx/%zx r=%d f=%lx\\n\",\n \t\t\t       sreq->stream_nr, sreq->debug_index, sreq->source,\n@@ -56,7 +57,7 @@ static void netfs_dump_request(const struct netfs_io_request *rreq)\n  */\n int netfs_folio_written_back(struct folio *folio)\n {\n-\tenum 
netfs_folio_trace why = netfs_folio_trace_clear;\n+\tenum netfs_folio_trace why = netfs_folio_trace_endwb;\n \tstruct netfs_inode *ictx = netfs_inode(folio->mapping->host);\n \tstruct netfs_folio *finfo;\n \tstruct netfs_group *group = NULL;\n@@ -76,13 +77,13 @@ int netfs_folio_written_back(struct folio *folio)\n \t\tgroup = finfo->netfs_group;\n \t\tgcount++;\n \t\tkfree(finfo);\n-\t\twhy = netfs_folio_trace_clear_s;\n+\t\twhy = netfs_folio_trace_endwb_s;\n \t\tgoto end_wb;\n \t}\n \n \tif ((group = netfs_folio_group(folio))) {\n \t\tif (group == NETFS_FOLIO_COPY_TO_CACHE) {\n-\t\t\twhy = netfs_folio_trace_clear_cc;\n+\t\t\twhy = netfs_folio_trace_endwb_cc;\n \t\t\tfolio_detach_private(folio);\n \t\t\tgoto end_wb;\n \t\t}\n@@ -95,7 +96,7 @@ int netfs_folio_written_back(struct folio *folio)\n \t\tif (!folio_test_dirty(folio)) {\n \t\t\tfolio_detach_private(folio);\n \t\t\tgcount++;\n-\t\t\twhy = netfs_folio_trace_clear_g;\n+\t\t\twhy = netfs_folio_trace_endwb_g;\n \t\t}\n \t}\n \n@@ -222,9 +223,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)\n \ttrace_netfs_rreq(wreq, netfs_rreq_trace_collect);\n \n reassess_streams:\n-\t/* Order reading the issued_to point before reading the queue it refers to. 
*/\n-\tissued_to = atomic64_read_acquire(&wreq->issued_to);\n-\tsmp_rmb();\n+\tissued_to = ULLONG_MAX;\n \tcollected_to = ULLONG_MAX;\n \tif (wreq->origin == NETFS_WRITEBACK ||\n \t    wreq->origin == NETFS_WRITETHROUGH ||\n@@ -239,14 +238,26 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)\n \t * to the tail whilst we're doing this.\n \t */\n \tfor (s = 0; s < NR_IO_STREAMS; s++) {\n+\t\tunsigned long long s_issued_to;\n+\n \t\tstream = &wreq->io_streams[s];\n-\t\t/* Read active flag before list pointers */\n+\t\t/* Read active flag before issued_to */\n \t\tif (!smp_load_acquire(&stream->active))\n \t\t\tcontinue;\n \n-\t\tfront = list_first_entry_or_null(&stream->subrequests,\n-\t\t\t\t\t\t struct netfs_io_subrequest, rreq_link);\n-\t\twhile (front) {\n+\t\tfor (;;) {\n+\t\t\t/* Order reading the issued_to point before reading the\n+\t\t\t * queue it refers to.\n+\t\t\t */\n+\t\t\ts_issued_to = atomic64_read_acquire(&stream->issued_to);\n+\t\t\tif (s_issued_to < issued_to)\n+\t\t\t\tissued_to = s_issued_to;\n+\n+\t\t\tfront = list_first_entry_or_null(&stream->subrequests,\n+\t\t\t\t\t\t\t struct netfs_io_subrequest, rreq_link);\n+\t\t\tif (!front)\n+\t\t\t\tbreak;\n+\n \t\t\ttrace_netfs_collect_sreq(wreq, front);\n \t\t\t//_debug(\"sreq [%x] %llx %zx/%zx\",\n \t\t\t//       front->debug_index, front->start, front->transferred, front->len);\ndiff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c\nindex d4c4bee4299e..ec84d2bcabeb 100644\n--- a/fs/netfs/write_issue.c\n+++ b/fs/netfs/write_issue.c\n@@ -36,6 +36,39 @@\n #include <linux/pagemap.h>\n #include \"internal.h\"\n \n+#define NOTE_UPLOAD_AVAIL\t0x001\t/* Upload is available */\n+#define NOTE_CACHE_AVAIL\t0x002\t/* Local cache is available */\n+#define NOTE_CACHE_COPY\t\t0x004\t/* Copy folio to cache */\n+#define NOTE_UPLOAD\t\t0x008\t/* Upload folio to server */\n+#define NOTE_UPLOAD_STARTED\t0x010\t/* Upload started */\n+#define NOTE_STREAMW\t\t0x020\t/* Folio is from a 
streaming write */\n+#define NOTE_DISCONTIG_BEFORE\t0x040\t/* Folio discontiguous with the previous folio */\n+#define NOTE_DISCONTIG_AFTER\t0x080\t/* Folio discontiguous with the next folio */\n+#define NOTE_TO_EOF\t\t0x100\t/* Data in folio ends at EOF */\n+#define NOTE_FLUSH_ANYWAY\t0x200\t/* Flush data, even if not hit estimated limit */\n+\n+#define NOTES__KEEP_MASK (NOTE_UPLOAD_AVAIL | NOTE_CACHE_AVAIL | NOTE_UPLOAD_STARTED)\n+\n+struct netfs_wb_params {\n+\tunsigned long long\tlast_end;\t/* End file pos of previous folio */\n+\tunsigned long long\tfolio_start;\t/* File pos of folio */\n+\tunsigned int\t\tfolio_len;\t/* Length of folio */\n+\tunsigned int\t\tdirty_offset;\t/* Offset of dirty region in folio */\n+\tunsigned int\t\tdirty_len;\t/* Length of dirty region in folio */\n+\tunsigned int\t\tnotes;\t\t/* Notes on applicability */\n+\tstruct bvecq_pos\tdispatch_cursor; /* Folio queue anchor for issue_at */\n+\tstruct netfs_write_estimate estimates[2];\n+};\n+\n+struct netfs_writethrough {\n+\tstruct netfs_wb_params\tparams;\n+\tstruct netfs_io_request\t*wreq;\n+\tstruct folio\t\t*in_progress;\n+};\n+\n+static int netfs_prepare_write_single_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t     unsigned int max_segs);\n+\n /*\n  * Kill all dirty folios in the event of an unrecoverable error, starting with\n  * a locked folio we've already obtained from writeback_iter().\n@@ -115,65 +148,48 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,\n \n \twreq->io_streams[0].stream_nr\t\t= 0;\n \twreq->io_streams[0].source\t\t= NETFS_UPLOAD_TO_SERVER;\n-\twreq->io_streams[0].prepare_write\t= ictx->ops->prepare_write;\n+\twreq->io_streams[0].applicable\t\t= NOTE_UPLOAD;\n+\twreq->io_streams[0].estimate_write\t= ictx->ops->estimate_write;\n \twreq->io_streams[0].issue_write\t\t= ictx->ops->issue_write;\n \twreq->io_streams[0].collected_to\t= start;\n \twreq->io_streams[0].transferred\t\t= 0;\n \n 
\twreq->io_streams[1].stream_nr\t\t= 1;\n \twreq->io_streams[1].source\t\t= NETFS_WRITE_TO_CACHE;\n+\twreq->io_streams[1].applicable\t\t= NOTE_CACHE_COPY;\n \twreq->io_streams[1].collected_to\t= start;\n \twreq->io_streams[1].transferred\t\t= 0;\n \tif (fscache_resources_valid(&wreq->cache_resources)) {\n \t\twreq->io_streams[1].avail\t= true;\n \t\twreq->io_streams[1].active\t= true;\n-\t\twreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq;\n+\t\twreq->io_streams[1].estimate_write = wreq->cache_resources.ops->estimate_write;\n \t\twreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write;\n \t}\n \n \treturn wreq;\n }\n \n-/**\n- * netfs_prepare_write_failed - Note write preparation failed\n- * @subreq: The subrequest to mark\n- *\n- * Mark a subrequest to note that preparation for write failed.\n- */\n-void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)\n-{\n-\t__set_bit(NETFS_SREQ_FAILED, &subreq->flags);\n-\ttrace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);\n-}\n-EXPORT_SYMBOL(netfs_prepare_write_failed);\n-\n /*\n- * Prepare a write subrequest.  
We need to allocate a new subrequest\n- * if we don't have one.\n+ * Allocate and prepare a write subrequest.\n  */\n-void netfs_prepare_write(struct netfs_io_request *wreq,\n-\t\t\t struct netfs_io_stream *stream,\n-\t\t\t loff_t start)\n+struct netfs_io_subrequest *netfs_alloc_write_subreq(struct netfs_io_request *wreq,\n+\t\t\t\t\t\t     struct netfs_io_stream *stream)\n {\n \tstruct netfs_io_subrequest *subreq;\n \n \tsubreq = netfs_alloc_subrequest(wreq);\n \tsubreq->source\t\t= stream->source;\n-\tsubreq->start\t\t= start;\n+\tsubreq->start\t\t= stream->issue_from;\n+\tsubreq->len\t\t= stream->buffered;\n \tsubreq->stream_nr\t= stream->stream_nr;\n \n-\tbvecq_pos_set(&subreq->dispatch_pos, &wreq->dispatch_cursor);\n-\n \t_enter(\"R=%x[%x]\", wreq->debug_id, subreq->debug_index);\n \n \ttrace_netfs_sreq(subreq, netfs_sreq_trace_prepare);\n \n-\tstream->sreq_max_len\t= UINT_MAX;\n-\tstream->sreq_max_segs\t= INT_MAX;\n \tswitch (stream->source) {\n \tcase NETFS_UPLOAD_TO_SERVER:\n \t\tnetfs_stat(&netfs_n_wh_upload);\n-\t\tstream->sreq_max_len = wreq->wsize;\n \t\tbreak;\n \tcase NETFS_WRITE_TO_CACHE:\n \t\tnetfs_stat(&netfs_n_wh_write);\n@@ -183,9 +199,6 @@ void netfs_prepare_write(struct netfs_io_request *wreq,\n \t\tbreak;\n \t}\n \n-\tif (stream->prepare_write)\n-\t\tstream->prepare_write(subreq);\n-\n \t__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);\n \n \t/* We add to the end of the list whilst the collector may be walking\n@@ -194,84 +207,46 @@ void netfs_prepare_write(struct netfs_io_request *wreq,\n \t */\n \tspin_lock(&wreq->lock);\n \tlist_add_tail(&subreq->rreq_link, &stream->subrequests);\n-\tif (list_is_first(&subreq->rreq_link, &stream->subrequests)) {\n-\t\tif (!stream->active) {\n-\t\t\tstream->collected_to = subreq->start;\n-\t\t\t/* Write list pointers before active flag */\n-\t\t\tsmp_store_release(&stream->active, true);\n-\t\t}\n-\t}\n+\tif (list_is_first(&subreq->rreq_link, &stream->subrequests) &&\n+\t    stream->collected_to == 
0)\n+\t\tstream->collected_to = subreq->start;\n \n \tspin_unlock(&wreq->lock);\n-\n-\tstream->construct = subreq;\n+\treturn subreq;\n }\n \n /*\n- * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O\n- * operation.  The operation may be asynchronous and should call\n- * netfs_write_subrequest_terminated() when complete.\n+ * Prepare the buffer for a buffered write.\n  */\n-static void netfs_do_issue_write(struct netfs_io_stream *stream,\n-\t\t\t\t struct netfs_io_subrequest *subreq)\n+static int netfs_prepare_buffered_write_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t       unsigned int max_segs)\n {\n \tstruct netfs_io_request *wreq = subreq->rreq;\n+\tstruct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];\n+\tssize_t len;\n \n-\t_enter(\"R=%x[%x],%zx\", wreq->debug_id, subreq->debug_index, subreq->len);\n-\n-\tif (test_bit(NETFS_SREQ_FAILED, &subreq->flags))\n-\t\treturn netfs_write_subrequest_terminated(subreq, subreq->error);\n-\n-\ttrace_netfs_sreq(subreq, netfs_sreq_trace_submit);\n-\tstream->issue_write(subreq);\n-}\n-\n-void netfs_reissue_write(struct netfs_io_stream *stream,\n-\t\t\t struct netfs_io_subrequest *subreq)\n-{\n-\t// TODO: Use encrypted buffer\n-\tbvecq_pos_set(&subreq->content, &subreq->dispatch_pos);\n-\tiov_iter_bvec_queue(&subreq->io_iter, ITER_SOURCE,\n-\t\t\t    subreq->content.bvecq, subreq->content.slot,\n-\t\t\t    subreq->content.offset,\n-\t\t\t    subreq->len);\n-\tiov_iter_advance(&subreq->io_iter, subreq->transferred);\n-\n-\tsubreq->retry_count++;\n-\tsubreq->error = 0;\n-\t__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);\n-\t__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);\n-\tnetfs_stat(&netfs_n_wh_retry_write_subreq);\n-\tnetfs_do_issue_write(stream, subreq);\n-}\n-\n-void netfs_issue_write(struct netfs_io_request *wreq,\n-\t\t       struct netfs_io_stream *stream)\n-{\n-\tstruct netfs_io_subrequest *subreq = stream->construct;\n+\t_enter(\"%zx,{,%u,%u},%u\",\n+\t   
    subreq->len, stream->dispatch_cursor.slot, stream->dispatch_cursor.offset, max_segs);\n \n-\tif (!subreq)\n-\t\treturn;\n+\tbvecq_pos_set(&subreq->dispatch_pos, &stream->dispatch_cursor);\n \n \t/* If we have a write to the cache, we need to round out the first and\n \t * last entries (only those as the data will be on virtually contiguous\n \t * folios) to cache DIO boundaries.\n \t */\n \tif (subreq->source == NETFS_WRITE_TO_CACHE) {\n-\t\tstruct bvecq_pos tmp_pos;\n \t\tstruct bio_vec *bv;\n \t\tstruct bvecq *bq;\n \t\tsize_t dio_size = wreq->cache_resources.dio_size;\n-\t\tsize_t disp, len;\n-\t\tint ret;\n+\t\tsize_t disp, dlen;\n \n-\t\tbvecq_pos_set(&tmp_pos, &subreq->dispatch_pos);\n-\t\tret = bvecq_extract(&tmp_pos, subreq->len, INT_MAX, &subreq->content.bvecq);\n-\t\tbvecq_pos_unset(&tmp_pos);\n-\t\tif (ret < 0) {\n-\t\t\tnetfs_write_subrequest_terminated(subreq, -ENOMEM);\n-\t\t\treturn;\n-\t\t}\n+\t\tlen = bvecq_extract(&stream->dispatch_cursor, subreq->len, max_segs,\n+\t\t\t\t    &subreq->content.bvecq);\n+\t\tif (len < 0)\n+\t\t\treturn -ENOMEM;\n+\n+\t\t_debug(\"extract %zx/%zx\", len, subreq->len);\n+\t\tsubreq->len = len;\n \n \t\t/* Round the first entry down. 
*/\n \t\tbq = subreq->content.bvecq;\n@@ -289,96 +264,276 @@ void netfs_issue_write(struct netfs_io_request *wreq,\n \t\twhile (bq->next)\n \t\t\tbq = bq->next;\n \t\tbv = &bq->bv[bq->nr_slots - 1];\n-\t\tlen = round_up(bv->bv_len, dio_size);\n-\t\tif (len > bv->bv_len) {\n-\t\t\tsubreq->len += len - bv->bv_len;\n-\t\t\tbv->bv_len = len;\n+\t\tdlen = round_up(bv->bv_len, dio_size);\n+\t\tif (dlen > bv->bv_len) {\n+\t\t\tsubreq->len += dlen - bv->bv_len;\n+\t\t\tbv->bv_len = dlen;\n \t\t}\n \t} else {\n-\t\tbvecq_pos_set(&subreq->content, &subreq->dispatch_pos);\n+\t\tbvecq_pos_set(&subreq->content, &stream->dispatch_cursor);\n+\t\tlen = bvecq_slice(&stream->dispatch_cursor, subreq->len, max_segs,\n+\t\t\t\t  &subreq->nr_segs);\n+\n+\t\tif (len < subreq->len) {\n+\t\t\tsubreq->len = len;\n+\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_limited);\n+\t\t}\n \t}\n \n-\tiov_iter_bvec_queue(&subreq->io_iter, ITER_SOURCE,\n-\t\t\t    subreq->content.bvecq, subreq->content.slot,\n-\t\t\t    subreq->content.offset,\n-\t\t\t    subreq->len);\n+\tstream->issue_from += len;\n+\tstream->buffered   -= len;\n+\tif (stream->buffered == 0) {\n+\t\tstream->buffering = false;\n+\t\tbvecq_pos_unset(&stream->dispatch_cursor);\n+\t}\n+\t/* Order loading the queue before updating the issue_to point */\n+\tatomic64_set_release(&stream->issued_to, stream->issue_from);\n+\treturn 0;\n+}\n+\n+/**\n+ * netfs_prepare_write_buffer - Get the buffer for a subrequest\n+ * @subreq: The subrequest to get the buffer for\n+ * @max_segs: Maximum number of segments in buffer (or INT_MAX)\n+ *\n+ * Extract a slice of buffer from the stream and attach it to the subrequest as\n+ * a bio_vec queue.  
The maximum amount of data attached is set by\n+ * @subreq->len, but this may be shortened if @max_segs would be exceeded.\n+ */\n+int netfs_prepare_write_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t       unsigned int max_segs)\n+{\n+\tstruct netfs_io_request *rreq = subreq->rreq;\n+\n+\tswitch (rreq->origin) {\n+\tcase NETFS_WRITEBACK:\n+\tcase NETFS_WRITETHROUGH:\n+\t\tif (test_bit(NETFS_RREQ_RETRYING, &rreq->flags))\n+\t\t\treturn netfs_prepare_write_retry_buffer(subreq, max_segs);\n+\t\treturn netfs_prepare_buffered_write_buffer(subreq, max_segs);\n+\n+\tcase NETFS_UNBUFFERED_WRITE:\n+\tcase NETFS_DIO_WRITE:\n+\t\treturn netfs_prepare_unbuffered_write_buffer(subreq, max_segs);\n \n-\tstream->construct = NULL;\n-\tnetfs_do_issue_write(stream, subreq);\n+\tcase NETFS_WRITEBACK_SINGLE:\n+\t\treturn netfs_prepare_write_single_buffer(subreq, max_segs);\n+\n+\tcase NETFS_PGPRIV2_COPY_TO_CACHE:\n+\t\treturn netfs_prepare_pgpriv2_write_buffer(subreq, max_segs);\n+\n+\tdefault:\n+\t\tWARN_ON_ONCE(1);\n+\t\treturn -EIO;\n+\t}\n }\n+EXPORT_SYMBOL(netfs_prepare_write_buffer);\n \n /*\n- * Add data to the write subrequest, dispatching each as we fill it up or if it\n- * is discontiguous with the previous.  
We only fill one part at a time so that\n- * we can avoid overrunning the credits obtained (cifs) and try to parallelise\n- * content-crypto preparation with network writes.\n+ * Issue writes for a stream.\n  */\n-size_t netfs_advance_write(struct netfs_io_request *wreq,\n-\t\t\t   struct netfs_io_stream *stream,\n-\t\t\t   loff_t start, size_t len, bool to_eof)\n+static int netfs_issue_writes(struct netfs_io_request *wreq,\n+\t\t\t      struct netfs_io_stream *stream,\n+\t\t\t      struct netfs_wb_params *params)\n {\n-\tstruct netfs_io_subrequest *subreq = stream->construct;\n-\tsize_t part;\n+\tstruct netfs_write_estimate *estimate = &params->estimates[stream->stream_nr];\n+\n+\tfor (;;) {\n+\t\tstruct netfs_io_subrequest *subreq;\n+\t\tint ret;\n+\n+\t\tsubreq = netfs_alloc_write_subreq(wreq, stream);\n+\t\tif (!subreq)\n+\t\t\treturn -ENOMEM;\n \n-\tif (!stream->avail) {\n-\t\t_leave(\"no write\");\n-\t\treturn len;\n+\t\tret = stream->issue_write(subreq);\n+\t\tif (ret < 0 && ret != -EIOCBQUEUED)\n+\t\t\treturn ret;\n+\n+\t\tif (stream->buffered == 0) {\n+\t\t\tif (stream->stream_nr == 0)\n+\t\t\t\tparams->notes &= ~NOTE_UPLOAD_STARTED;\n+\t\t\treturn 0;\n+\t\t}\n+\n+\t\tif (!(params->notes & NOTE_FLUSH_ANYWAY)) {\n+\t\t\testimate->issue_at = ULLONG_MAX;\n+\t\t\testimate->max_segs = INT_MAX;\n+\t\t\tstream->estimate_write(wreq, stream, estimate);\n+\t\t\tif (stream->issue_from + stream->buffered < estimate->issue_at &&\n+\t\t\t    estimate->max_segs > 0)\n+\t\t\t\treturn 0;\n+\t\t}\n+\t}\n+}\n+\n+/*\n+ * Issue pending writes on a stream.\n+ */\n+static int netfs_issue_stream(struct netfs_io_request *wreq,\n+\t\t\t      struct netfs_wb_params *params, int s)\n+{\n+\tstruct netfs_write_estimate *estimate = &params->estimates[s];\n+\tstruct netfs_io_stream *stream = &wreq->io_streams[s];\n+\tunsigned long long dirty_start;\n+\tbool discontig_before = params->notes & NOTE_DISCONTIG_BEFORE;\n+\tint ret;\n+\n+\t_enter(\"%x\", params->notes);\n+\n+\t/* If the 
current folio doesn't contribute to this stream, see if we\n+\t * need to flush it.\n+\t */\n+\tif (!(params->notes & stream->applicable)) {\n+\t\tif (!stream->buffering) {\n+\t\t\tatomic64_set_release(&stream->issued_to,\n+\t\t\t\t\t     params->folio_start + params->folio_len);\n+\t\t\treturn 0;\n+\t\t}\n+\t\tdiscontig_before = true;\n+\t}\n+\n+\t/* Issue writes if we meet a discontiguity before the current folio.\n+\t * Even if the filesystem can do sparse/vectored writes, we still\n+\t * generate a subreq per contiguous region rather than generating\n+\t * separate extent lists.\n+\t */\n+\tif (stream->buffering && discontig_before) {\n+\t\tparams->notes |= NOTE_FLUSH_ANYWAY;\n+\t\tret = netfs_issue_writes(wreq, stream, params);\n+\t\tif (ret < 0)\n+\t\t\treturn ret;\n+\t\tstream->buffering = false;\n+\t\tparams->notes &= ~NOTE_FLUSH_ANYWAY;\n+\t}\n+\n+\tif (!(params->notes & stream->applicable)) {\n+\t\tatomic64_set_release(&stream->issued_to,\n+\t\t\t\t     params->folio_start + params->folio_len);\n+\t\treturn 0;\n+\t}\n+\n+\t/* If we're not currently buffering on this stream, we need to get an\n+\t * estimate of when we need to issue a write.  
It might be within the\n+\t * starting folio.\n+\t */\n+\tdirty_start = params->folio_start + params->dirty_offset;\n+\tif (!stream->buffering) {\n+\t\tstream->buffering = true;\n+\t\tstream->issue_from = dirty_start;\n+\t\tbvecq_pos_set(&stream->dispatch_cursor, &params->dispatch_cursor);\n+\t\testimate->issue_at = ULLONG_MAX;\n+\t\testimate->max_segs = INT_MAX;\n+\t\tstream->estimate_write(wreq, stream, estimate);\n+\t}\n+\n+\tstream->buffered += params->dirty_len;\n+\testimate->max_segs--;\n+\n+\t/* Poke the filesystem to issue writes when we hit the limit it set or\n+\t * if the data ends before the end of the page.\n+\t */\n+\tif (params->notes & NOTE_DISCONTIG_AFTER)\n+\t\tparams->notes |= NOTE_FLUSH_ANYWAY;\n+\t_debug(\"[%u] %llx + %zx >= %llx, %u %x\",\n+\t       s, stream->issue_from, stream->buffered, estimate->issue_at,\n+\t       estimate->max_segs, params->notes);\n+\tif (stream->issue_from + stream->buffered >= estimate->issue_at ||\n+\t    estimate->max_segs <= 0 ||\n+\t    (params->notes & NOTE_FLUSH_ANYWAY)) {\n+\t\tret = netfs_issue_writes(wreq, stream, params);\n+\t\tif (ret < 0)\n+\t\t\treturn ret;\n \t}\n \n-\t_enter(\"R=%x[%x]\", wreq->debug_id, subreq ? 
subreq->debug_index : 0);\n+\treturn 0;\n+}\n+\n+/*\n+ * See which streams need writes issuing and issue them.\n+ */\n+static int netfs_issue_streams(struct netfs_io_request *wreq,\n+\t\t\t       struct netfs_wb_params *params)\n+{\n+\tint ret = 0, ret2;\n+\n+\t_enter(\"%x\", params->notes);\n \n-\tif (subreq && start != subreq->start + subreq->len) {\n-\t\tnetfs_issue_write(wreq, stream);\n-\t\tsubreq = NULL;\n+\tfor (int s = 0; s < NR_IO_STREAMS; s++) {\n+\t\tret2 = netfs_issue_stream(wreq, params, s);\n+\t\tif (ret2 < 0)\n+\t\t\tret = ret2;\n \t}\n+\treturn ret;\n+}\n \n-\tif (!stream->construct)\n-\t\tnetfs_prepare_write(wreq, stream, start);\n-\tsubreq = stream->construct;\n+/*\n+ * End the issuing of writes, let the collector know we're done.\n+ */\n+static void netfs_end_issue_write(struct netfs_io_request *wreq,\n+\t\t\t\t  struct netfs_wb_params *params)\n+{\n+\tbool needs_poke = true;\n \n-\tpart = umin(stream->sreq_max_len - subreq->len, len);\n-\t_debug(\"part %zx/%zx %zx/%zx\", subreq->len, stream->sreq_max_len, part, len);\n-\tsubreq->len += part;\n-\tsubreq->nr_segs++;\n+\tparams->notes |= NOTE_FLUSH_ANYWAY;\n+\n+\tfor (int s = 0; s < NR_IO_STREAMS; s++) {\n+\t\tstruct netfs_io_stream *stream = &wreq->io_streams[s];\n+\t\tint ret;\n+\n+\t\tif (stream->buffering) {\n+\t\t\tret = netfs_issue_writes(wreq, stream, params);\n+\t\t\tif (ret < 0) {\n+\t\t\t\t/* Leave the error somewhere the completion\n+\t\t\t\t * path can pick it up if there isn't already\n+\t\t\t\t * another error logged.\n+\t\t\t\t */\n+\t\t\t\tcmpxchg(&wreq->error, 0, ret);\n+\t\t\t}\n+\t\t\tstream->buffering = false;\n+\t\t}\n+\t}\n+\n+\tsmp_wmb(); /* Write subreq lists before ALL_QUEUED. 
*/\n+\tset_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);\n+\n+\tfor (int s = 0; s < NR_IO_STREAMS; s++) {\n+\t\tstruct netfs_io_stream *stream = &wreq->io_streams[s];\n \n-\tif (subreq->len >= stream->sreq_max_len ||\n-\t    subreq->nr_segs >= stream->sreq_max_segs ||\n-\t    to_eof) {\n-\t\tnetfs_issue_write(wreq, stream);\n-\t\tsubreq = NULL;\n+\t\tif (!stream->active)\n+\t\t\tcontinue;\n+\t\tif (!list_empty(&stream->subrequests))\n+\t\t\tneeds_poke = false;\n \t}\n \n-\treturn part;\n+\tif (needs_poke)\n+\t\tnetfs_wake_collector(wreq);\n }\n \n /*\n- * Write some of a pending folio data back to the server.\n+ * Queue a folio for writeback.\n  */\n-static int netfs_write_folio(struct netfs_io_request *wreq,\n-\t\t\t     struct writeback_control *wbc,\n-\t\t\t     struct folio *folio)\n+static int netfs_queue_wb_folio(struct netfs_io_request *wreq,\n+\t\t\t\tstruct writeback_control *wbc,\n+\t\t\t\tstruct folio *folio,\n+\t\t\t\tstruct netfs_wb_params *params)\n {\n-\tstruct netfs_io_stream *upload = &wreq->io_streams[0];\n-\tstruct netfs_io_stream *cache  = &wreq->io_streams[1];\n-\tstruct netfs_io_stream *stream;\n \tstruct netfs_group *fgroup; /* TODO: Use this with ceph */\n \tstruct netfs_folio *finfo;\n \tstruct bvecq *queue = wreq->load_cursor.bvecq;\n \tunsigned int slot;\n \tsize_t fsize = folio_size(folio), flen = fsize, foff = 0;\n \tloff_t fpos = folio_pos(folio), i_size;\n-\tbool to_eof = false, streamw = false;\n-\tbool debug = false;\n \tint ret;\n \n-\t_enter(\"\");\n+\t_enter(\"%x\", params->notes);\n \n \t/* Institute a new bvec queue segment if the current one is full or if\n \t * we encounter a discontiguity.  
The discontiguity break is important\n \t * when it comes to bulk unlocking folios by file range.\n \t */\n \tif (bvecq_is_full(queue) ||\n-\t    (fpos != wreq->last_end && wreq->last_end > 0)) {\n+\t    (fpos != params->last_end && params->last_end > 0)) {\n \t\tret = bvecq_buffer_make_space(&wreq->load_cursor, GFP_NOFS);\n \t\tif (ret < 0) {\n \t\t\tfolio_unlock(folio);\n@@ -387,10 +542,10 @@ static int netfs_write_folio(struct netfs_io_request *wreq,\n \n \t\tqueue = wreq->load_cursor.bvecq;\n \t\tqueue->fpos = fpos;\n-\t\tif (fpos != wreq->last_end)\n+\t\tif (fpos != params->last_end)\n \t\t\tqueue->discontig = true;\n-\t\tbvecq_pos_move(&wreq->dispatch_cursor, queue);\n-\t\twreq->dispatch_cursor.slot = 0;\n+\t\tbvecq_pos_move(&params->dispatch_cursor, queue);\n+\t\tparams->dispatch_cursor.slot = 0;\n \t}\n \n \t/* netfs_perform_write() may shift i_size around the page or from out\n@@ -418,23 +573,36 @@ static int netfs_write_folio(struct netfs_io_request *wreq,\n \tif (finfo) {\n \t\tfoff = finfo->dirty_offset;\n \t\tflen = foff + finfo->dirty_len;\n-\t\tstreamw = true;\n+\t\tparams->notes |= NOTE_STREAMW;\n+\t\tif (foff > 0)\n+\t\t\tparams->notes |= NOTE_DISCONTIG_BEFORE;\n+\t\tif (flen < fsize)\n+\t\t\tparams->notes |= NOTE_DISCONTIG_AFTER;\n \t}\n \n+\tif (params->last_end && fpos != params->last_end)\n+\t\tparams->notes |= NOTE_DISCONTIG_BEFORE;\n+\tparams->last_end = fpos + fsize;\n+\n \tif (wreq->origin == NETFS_WRITETHROUGH) {\n-\t\tto_eof = false;\n \t\tif (flen > i_size - fpos)\n \t\t\tflen = i_size - fpos;\n+\t\t/* EOF may be changing. 
*/\n \t} else if (flen > i_size - fpos) {\n \t\tflen = i_size - fpos;\n-\t\tif (!streamw)\n+\t\tif (!(params->notes & NOTE_STREAMW))\n \t\t\tfolio_zero_segment(folio, flen, fsize);\n-\t\tto_eof = true;\n+\t\tparams->notes |= NOTE_TO_EOF;\n \t} else if (flen == i_size - fpos) {\n-\t\tto_eof = true;\n+\t\tparams->notes |= NOTE_TO_EOF;\n \t}\n \tflen -= foff;\n \n+\tparams->folio_start\t= fpos;\n+\tparams->folio_len\t= fsize;\n+\tparams->dirty_offset\t= foff;\n+\tparams->dirty_len\t= flen;\n+\n \t_debug(\"folio %zx %zx %zx\", foff, flen, fsize);\n \n \t/* Deal with discontinuities in the stream of dirty pages.  These can\n@@ -454,22 +622,31 @@ static int netfs_write_folio(struct netfs_io_request *wreq,\n \t *     write-back group.\n \t */\n \tif (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {\n-\t\tnetfs_issue_write(wreq, upload);\n+\t\tif (!(params->notes & NOTE_CACHE_AVAIL)) {\n+\t\t\ttrace_netfs_folio(folio, netfs_folio_trace_cancel_copy);\n+\t\t\tgoto cancel_folio;\n+\t\t}\n+\t\tparams->notes |= NOTE_CACHE_COPY;\n+\t\ttrace_netfs_folio(folio, netfs_folio_trace_store_copy);\n \t} else if (fgroup != wreq->group) {\n \t\t/* We can't write this page to the server yet. 
*/\n \t\tkdebug(\"wrong group\");\n-\t\tfolio_redirty_for_writepage(wbc, folio);\n-\t\tfolio_unlock(folio);\n-\t\tnetfs_issue_write(wreq, upload);\n-\t\tnetfs_issue_write(wreq, cache);\n-\t\treturn 0;\n+\t\tgoto skip_folio;\n+\t} else if (!(params->notes & (NOTE_UPLOAD_AVAIL | NOTE_CACHE_AVAIL))) {\n+\t\ttrace_netfs_folio(folio, netfs_folio_trace_cancel_store);\n+\t\tgoto cancel_folio_discard;\n+\t} else {\n+\t\tif (params->notes & NOTE_UPLOAD_STARTED) {\n+\t\t\tparams->notes |= NOTE_UPLOAD;\n+\t\t\ttrace_netfs_folio(folio, netfs_folio_trace_store_plus);\n+\t\t} else {\n+\t\t\tparams->notes |= NOTE_UPLOAD | NOTE_UPLOAD_STARTED;\n+\t\t\ttrace_netfs_folio(folio, netfs_folio_trace_store);\n+\t\t}\n+\t\tif (params->notes & NOTE_CACHE_AVAIL)\n+\t\t\tparams->notes |= NOTE_CACHE_COPY;\n \t}\n \n-\tif (foff > 0)\n-\t\tnetfs_issue_write(wreq, upload);\n-\tif (streamw)\n-\t\tnetfs_issue_write(wreq, cache);\n-\n \t/* Flip the page to the writeback state and unlock.  If we're called\n \t * from write-through, then the page has already been put into the wb\n \t * state.\n@@ -478,129 +655,37 @@ static int netfs_write_folio(struct netfs_io_request *wreq,\n \t\tfolio_start_writeback(folio);\n \tfolio_unlock(folio);\n \n-\tif (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {\n-\t\tif (!cache->avail) {\n-\t\t\ttrace_netfs_folio(folio, netfs_folio_trace_cancel_copy);\n-\t\t\tnetfs_issue_write(wreq, upload);\n-\t\t\tnetfs_folio_written_back(folio);\n-\t\t\treturn 0;\n-\t\t}\n-\t\ttrace_netfs_folio(folio, netfs_folio_trace_store_copy);\n-\t} else if (!upload->avail && !cache->avail) {\n-\t\ttrace_netfs_folio(folio, netfs_folio_trace_cancel_store);\n-\t\tnetfs_folio_written_back(folio);\n-\t\treturn 0;\n-\t} else if (!upload->construct) {\n-\t\ttrace_netfs_folio(folio, netfs_folio_trace_store);\n-\t} else {\n-\t\ttrace_netfs_folio(folio, netfs_folio_trace_store_plus);\n-\t}\n-\n \t/* Attach the folio to the rolling buffer. 
*/\n \tslot = queue->nr_slots;\n-\tbvec_set_folio(&queue->bv[slot], folio, flen, 0);\n+\tbvec_set_folio(&queue->bv[slot], folio, flen, foff);\n \tqueue->nr_slots = slot + 1;\n \twreq->load_cursor.slot = slot + 1;\n \twreq->load_cursor.offset = 0;\n-\twreq->last_end = fpos + foff + flen;\n \ttrace_netfs_bv_slot(queue, slot);\n+\ttrace_netfs_wback(wreq, folio, params->notes);\n \n-\t/* Move the submission point forward to allow for write-streaming data\n-\t * not starting at the front of the page.  We don't do write-streaming\n-\t * with the cache as the cache requires DIO alignment.\n-\t *\n-\t * Also skip uploading for data that's been read and just needs copying\n-\t * to the cache.\n-\t */\n-\tfor (int s = 0; s < NR_IO_STREAMS; s++) {\n-\t\tstream = &wreq->io_streams[s];\n-\t\tstream->submit_off = 0;\n-\t\tstream->submit_len = flen;\n-\t\tif (!stream->avail ||\n-\t\t    (stream->source == NETFS_WRITE_TO_CACHE && streamw) ||\n-\t\t    (stream->source == NETFS_UPLOAD_TO_SERVER &&\n-\t\t     fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {\n-\t\t\tstream->submit_off = UINT_MAX;\n-\t\t\tstream->submit_len = 0;\n-\t\t}\n-\t}\n-\n-\t/* Attach the folio to one or more subrequests.  For a big folio, we\n-\t * could end up with thousands of subrequests if the wsize is small -\n-\t * but we might need to wait during the creation of subrequests for\n-\t * network resources (eg. SMB credits).\n-\t */\n-\tfor (;;) {\n-\t\tssize_t part;\n-\t\tsize_t lowest_off = ULONG_MAX;\n-\t\tint choose_s = -1;\n-\n-\t\t/* Always add to the lowest-submitted stream first. */\n-\t\tfor (int s = 0; s < NR_IO_STREAMS; s++) {\n-\t\t\tstream = &wreq->io_streams[s];\n-\t\t\tif (stream->submit_len > 0 &&\n-\t\t\t    stream->submit_off < lowest_off) {\n-\t\t\t\tlowest_off = stream->submit_off;\n-\t\t\t\tchoose_s = s;\n-\t\t\t}\n-\t\t}\n-\n-\t\tif (choose_s < 0)\n-\t\t\tbreak;\n-\t\tstream = &wreq->io_streams[choose_s];\n-\n-\t\t/* Advance the cursor. 
*/\n-\t\twreq->dispatch_cursor.offset = stream->submit_off;\n-\n-\t\tatomic64_set(&wreq->issued_to, fpos + foff + stream->submit_off);\n-\t\tpart = netfs_advance_write(wreq, stream, fpos + foff + stream->submit_off,\n-\t\t\t\t\t   stream->submit_len, to_eof);\n-\t\tstream->submit_off += part;\n-\t\tif (part > stream->submit_len)\n-\t\t\tstream->submit_len = 0;\n-\t\telse\n-\t\t\tstream->submit_len -= part;\n-\t\tif (part > 0)\n-\t\t\tdebug = true;\n-\t}\n-\n-\tbvecq_pos_step(&wreq->dispatch_cursor);\n-\t/* Order loading the queue before updating the issue_to point */\n-\tatomic64_set_release(&wreq->issued_to, fpos + fsize);\n-\n-\tif (!debug)\n-\t\tkdebug(\"R=%x: No submit\", wreq->debug_id);\n-\n-\tif (foff + flen < fsize)\n-\t\tfor (int s = 0; s < NR_IO_STREAMS; s++)\n-\t\t\tnetfs_issue_write(wreq, &wreq->io_streams[s]);\n-\n-\t_leave(\" = 0\");\n+out:\n+\t_leave(\" = %x\", params->notes);\n \treturn 0;\n-}\n \n-/*\n- * End the issuing of writes, letting the collector know we're done.\n- */\n-static void netfs_end_issue_write(struct netfs_io_request *wreq)\n-{\n-\tbool needs_poke = true;\n-\n-\tsmp_wmb(); /* Write subreq lists before ALL_QUEUED. 
*/\n-\tset_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);\n-\n-\tfor (int s = 0; s < NR_IO_STREAMS; s++) {\n-\t\tstruct netfs_io_stream *stream = &wreq->io_streams[s];\n-\n-\t\tif (!stream->active)\n-\t\t\tcontinue;\n-\t\tif (!list_empty(&stream->subrequests))\n-\t\t\tneeds_poke = false;\n-\t\tnetfs_issue_write(wreq, stream);\n-\t}\n-\n-\tif (needs_poke)\n-\t\tnetfs_wake_collector(wreq);\n+skip_folio:\n+\tret = folio_redirty_for_writepage(wbc, folio);\n+\tfolio_unlock(folio);\n+\tif (ret < 0)\n+\t\treturn ret;\n+\tparams->notes |= NOTE_DISCONTIG_BEFORE;\n+\tgoto out;\n+cancel_folio_discard:\n+\tnetfs_put_group(fgroup);\n+cancel_folio:\n+\tfolio_detach_private(folio);\n+\tkfree(finfo);\n+\tfolio_unlock(folio);\n+\tfolio_cancel_dirty(folio);\n+\tif (wreq->origin == NETFS_WRITETHROUGH)\n+\t\tfolio_end_writeback(folio);\n+\tparams->notes |= NOTE_DISCONTIG_BEFORE;\n+\tgoto out;\n }\n \n /*\n@@ -611,6 +696,7 @@ int netfs_writepages(struct address_space *mapping,\n {\n \tstruct netfs_inode *ictx = netfs_inode(mapping->host);\n \tstruct netfs_io_request *wreq = NULL;\n+\tstruct netfs_wb_params params = {};\n \tstruct folio *folio;\n \tint error = 0;\n \n@@ -636,35 +722,48 @@ int netfs_writepages(struct address_space *mapping,\n \n \tif (bvecq_buffer_init(&wreq->load_cursor, GFP_NOFS) < 0)\n \t\tgoto nomem;\n-\tbvecq_pos_set(&wreq->dispatch_cursor, &wreq->load_cursor);\n-\tbvecq_pos_set(&wreq->collect_cursor, &wreq->dispatch_cursor);\n+\tbvecq_pos_set(&params.dispatch_cursor, &wreq->load_cursor);\n+\tbvecq_pos_set(&wreq->collect_cursor, &wreq->load_cursor);\n \n \t__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);\n \ttrace_netfs_write(wreq, netfs_write_trace_writeback);\n \tnetfs_stat(&netfs_n_wh_writepages);\n \n-\tdo {\n-\t\t_debug(\"wbiter %lx %llx\", folio->index, atomic64_read(&wreq->issued_to));\n+\tif (wreq->io_streams[1].avail)\n+\t\tparams.notes |= NOTE_CACHE_AVAIL;\n \n-\t\t/* It appears we don't have to handle cyclic writeback wrapping. 
*/\n-\t\tWARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to));\n+\tdo {\n+\t\t_debug(\"wbiter %lx\", folio->index);\n \n \t\tif (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&\n \t\t    unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {\n \t\t\tset_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);\n \t\t\twreq->netfs_ops->begin_writeback(wreq);\n+\t\t\tif (wreq->io_streams[0].avail) {\n+\t\t\t\tparams.notes |= NOTE_UPLOAD_AVAIL;\n+\t\t\t\t/* Order setting the active flag after other fields. */\n+\t\t\t\tsmp_store_release(&wreq->io_streams[0].active, true);\n+\t\t\t}\n \t\t}\n \n-\t\terror = netfs_write_folio(wreq, wbc, folio);\n+\t\tparams.notes &= NOTES__KEEP_MASK;\n+\t\terror = netfs_queue_wb_folio(wreq, wbc, folio, &params);\n+\t\tif (error < 0)\n+\t\t\tbreak;\n+\t\terror = netfs_issue_streams(wreq, &params);\n \t\tif (error < 0)\n \t\t\tbreak;\n+\n+\t\tbvecq_pos_step(&params.dispatch_cursor);\n \t} while ((folio = writeback_iter(mapping, wbc, folio, &error)));\n \n-\tnetfs_end_issue_write(wreq);\n+\tnetfs_end_issue_write(wreq, &params);\n \n \tmutex_unlock(&ictx->wb_lock);\n \tbvecq_pos_unset(&wreq->load_cursor);\n-\tbvecq_pos_unset(&wreq->dispatch_cursor);\n+\tbvecq_pos_unset(&params.dispatch_cursor);\n+\tfor (int i = 0; i < NR_IO_STREAMS; i++)\n+\t\tbvecq_pos_unset(&wreq->io_streams[i].dispatch_cursor);\n \tnetfs_wake_collector(wreq);\n \n \tnetfs_put_request(wreq, netfs_rreq_trace_put_return);\n@@ -686,32 +785,55 @@ EXPORT_SYMBOL(netfs_writepages);\n /*\n  * Begin a write operation for writing through the pagecache.\n  */\n-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)\n+struct netfs_writethrough *netfs_begin_writethrough(struct kiocb *iocb, size_t len)\n {\n+\tstruct netfs_writethrough *wthru = NULL;\n \tstruct netfs_io_request *wreq = NULL;\n \tstruct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));\n \n+\twthru = kzalloc_obj(struct netfs_writethrough);\n+\tif 
(!wthru)\n+\t\treturn ERR_PTR(-ENOMEM);\n+\n \tmutex_lock(&ictx->wb_lock);\n \n \twreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,\n \t\t\t\t      iocb->ki_pos, NETFS_WRITETHROUGH);\n \tif (IS_ERR(wreq)) {\n \t\tmutex_unlock(&ictx->wb_lock);\n-\t\treturn wreq;\n+\t\tkfree(wthru);\n+\t\treturn ERR_CAST(wreq);\n \t}\n+\twthru->wreq = wreq;\n \n \tif (bvecq_buffer_init(&wreq->load_cursor, GFP_NOFS) < 0) {\n \t\tnetfs_put_failed_request(wreq);\n \t\tmutex_unlock(&ictx->wb_lock);\n+\t\tkfree(wthru);\n \t\treturn ERR_PTR(-ENOMEM);\n \t}\n \n-\tbvecq_pos_set(&wreq->dispatch_cursor, &wreq->load_cursor);\n-\tbvecq_pos_set(&wreq->collect_cursor, &wreq->dispatch_cursor);\n+\tbvecq_pos_set(&wthru->params.dispatch_cursor, &wreq->load_cursor);\n+\tbvecq_pos_set(&wreq->collect_cursor, &wreq->load_cursor);\n+\n+\tif (wreq->io_streams[1].avail)\n+\t\twthru->params.notes |= NOTE_CACHE_AVAIL;\n \n \twreq->io_streams[0].avail = true;\n \ttrace_netfs_write(wreq, netfs_write_trace_writethrough);\n-\treturn wreq;\n+\tif (!is_sync_kiocb(iocb))\n+\t\twreq->iocb = iocb;\n+\n+\tif (unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {\n+\t\tset_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);\n+\t\t/* Don't call ->begin_writeback() as ->init_request() gets file*. */\n+\t\tif (wreq->io_streams[0].avail) {\n+\t\t\twthru->params.notes |= NOTE_UPLOAD_AVAIL;\n+\t\t\t/* Order setting the active flag after other fields. */\n+\t\t\tsmp_store_release(&wreq->io_streams[0].active, true);\n+\t\t}\n+\t}\n+\treturn wthru;\n }\n \n /*\n@@ -720,14 +842,17 @@ struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len\n  * to the request.  
If we've added more than wsize then we need to create a new\n  * subrequest.\n  */\n-int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,\n-\t\t\t       struct folio *folio, size_t copied, bool to_page_end,\n-\t\t\t       struct folio **writethrough_cache)\n+int netfs_advance_writethrough(struct netfs_writethrough *wthru,\n+\t\t\t       struct writeback_control *wbc,\n+\t\t\t       struct folio *folio, size_t copied, bool to_page_end)\n {\n+\tstruct netfs_io_request *wreq = wthru->wreq;\n+\tint ret;\n+\n \t_enter(\"R=%x ws=%u cp=%zu tp=%u\",\n \t       wreq->debug_id, wreq->wsize, copied, to_page_end);\n \n-\tif (!*writethrough_cache) {\n+\tif (!wthru->in_progress) {\n \t\tif (folio_test_dirty(folio))\n \t\t\t/* Sigh.  mmap. */\n \t\t\tfolio_clear_dirty_for_io(folio);\n@@ -738,63 +863,113 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c\n \t\t\ttrace_netfs_folio(folio, netfs_folio_trace_wthru);\n \t\telse\n \t\t\ttrace_netfs_folio(folio, netfs_folio_trace_wthru_plus);\n-\t\t*writethrough_cache = folio;\n+\t\twthru->in_progress = folio;\n \t}\n \n \twreq->len += copied;\n \tif (!to_page_end)\n \t\treturn 0;\n \n-\t*writethrough_cache = NULL;\n-\treturn netfs_write_folio(wreq, wbc, folio);\n+\twthru->in_progress = NULL;\n+\twthru->params.notes &= NOTES__KEEP_MASK;\n+\tret = netfs_queue_wb_folio(wreq, wbc, folio, &wthru->params);\n+\tif (ret < 0)\n+\t\treturn ret;\n+\treturn netfs_issue_streams(wreq, &wthru->params);\n }\n \n /*\n  * End a write operation used when writing through the pagecache.\n  */\n-ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,\n-\t\t\t       struct folio *writethrough_cache)\n+ssize_t netfs_end_writethrough(struct netfs_writethrough *wthru,\n+\t\t\t       struct writeback_control *wbc)\n {\n+\tstruct netfs_io_request *wreq = wthru->wreq;\n \tstruct netfs_inode *ictx = netfs_inode(wreq->inode);\n \tssize_t ret;\n \n 
\t_enter(\"R=%x\", wreq->debug_id);\n \n-\tif (writethrough_cache)\n-\t\tnetfs_write_folio(wreq, wbc, writethrough_cache);\n+\tif (wthru->in_progress) {\n+\t\twthru->params.notes &= NOTES__KEEP_MASK;\n+\t\tret = netfs_queue_wb_folio(wreq, wbc, wthru->in_progress, &wthru->params);\n+\t\tif (ret == 0)\n+\t\t\tret = netfs_issue_streams(wreq, &wthru->params);\n+\t\twthru->in_progress = NULL;\n+\t}\n \n-\tnetfs_end_issue_write(wreq);\n+\tnetfs_end_issue_write(wreq, &wthru->params);\n \n \tmutex_unlock(&ictx->wb_lock);\n \n \tbvecq_pos_unset(&wreq->load_cursor);\n-\tbvecq_pos_unset(&wreq->dispatch_cursor);\n+\tbvecq_pos_unset(&wthru->params.dispatch_cursor);\n+\tfor (int i = 0; i < NR_IO_STREAMS; i++)\n+\t\tbvecq_pos_unset(&wreq->io_streams[i].dispatch_cursor);\n \n \tif (wreq->iocb)\n \t\tret = -EIOCBQUEUED;\n \telse\n \t\tret = netfs_wait_for_write(wreq);\n \tnetfs_put_request(wreq, netfs_rreq_trace_put_return);\n+\tkfree(wthru);\n \treturn ret;\n }\n \n+/*\n+ * Prepare a buffer for a single monolithic write.\n+ */\n+static int netfs_prepare_write_single_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t\t     unsigned int max_segs)\n+{\n+\tstruct netfs_io_request *wreq = subreq->rreq;\n+\tstruct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];\n+\tstruct bio_vec *bv;\n+\tstruct bvecq *bq;\n+\tsize_t dio_size = wreq->cache_resources.dio_size;\n+\tsize_t dlen;\n+\n+\tbvecq_pos_set(&subreq->dispatch_pos, &stream->dispatch_cursor);\n+\tbvecq_pos_set(&subreq->content, &subreq->dispatch_pos);\n+\n+\t/* Round the end of the last entry up. 
*/\n+\tbq = subreq->content.bvecq;\n+\twhile (bq->next)\n+\t\tbq = bq->next;\n+\tbv = &bq->bv[bq->nr_slots - 1];\n+\tdlen = round_up(bv->bv_len, dio_size);\n+\tif (dlen > bv->bv_len) {\n+\t\tsubreq->len += dlen - bv->bv_len;\n+\t\tbv->bv_len = dlen;\n+\t}\n+\n+\tstream->buffered   = 0;\n+\tstream->issue_from = subreq->len;\n+\twreq->submitted    = subreq->len;\n+\treturn 0;\n+}\n+\n /**\n  * netfs_writeback_single - Write back a monolithic payload\n  * @mapping: The mapping to write from\n  * @wbc: Hints from the VM\n- * @iter: Data to write.\n+ * @iter: Data to write\n+ * @len: Amount of data to write\n  *\n  * Write a monolithic, non-pagecache object back to the server and/or\n  * the cache.  There's a maximum of one subrequest per stream.\n  */\n int netfs_writeback_single(struct address_space *mapping,\n \t\t\t   struct writeback_control *wbc,\n-\t\t\t   struct iov_iter *iter)\n+\t\t\t   struct iov_iter *iter,\n+\t\t\t   size_t len)\n {\n \tstruct netfs_io_request *wreq;\n \tstruct netfs_inode *ictx = netfs_inode(mapping->host);\n \tint ret;\n \n+\t_enter(\"%zx,%zx\", iov_iter_count(iter), len);\n+\n \tif (!mutex_trylock(&ictx->wb_lock)) {\n \t\tif (wbc->sync_mode == WB_SYNC_NONE) {\n \t\t\tnetfs_stat(&netfs_n_wb_lock_skip);\n@@ -809,23 +984,24 @@ int netfs_writeback_single(struct address_space *mapping,\n \t\tret = PTR_ERR(wreq);\n \t\tgoto couldnt_start;\n \t}\n-\twreq->len = iov_iter_count(iter);\n \n-\tret = netfs_extract_iter(iter, wreq->len, INT_MAX, 0, &wreq->dispatch_cursor.bvecq, 0);\n+\twreq->len = len;\n+\n+\tret = netfs_extract_iter(iter, len, INT_MAX, 0, &wreq->load_cursor.bvecq, 0);\n \tif (ret < 0)\n \t\tgoto cleanup_free;\n-\tif (ret < wreq->len) {\n+\tif (ret < len) {\n \t\tret = -EIO;\n \t\tgoto cleanup_free;\n \t}\n \n-\tbvecq_pos_set(&wreq->collect_cursor, &wreq->dispatch_cursor);\n+\tbvecq_pos_set(&wreq->collect_cursor, &wreq->load_cursor);\n \n \t__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);\n \ttrace_netfs_write(wreq, 
netfs_write_trace_writeback_single);\n \tnetfs_stat(&netfs_n_wh_writepages);\n \n-\tif (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))\n+\tif (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))\n \t\twreq->netfs_ops->begin_writeback(wreq);\n \n \tfor (int s = 0; s < NR_IO_STREAMS; s++) {\n@@ -835,13 +1011,22 @@ int netfs_writeback_single(struct address_space *mapping,\n \t\tif (!stream->avail)\n \t\t\tcontinue;\n \n-\t\tnetfs_prepare_write(wreq, stream, 0);\n+\t\tstream->issue_from = 0;\n+\t\tstream->buffered   = len;\n+\n+\t\tsubreq = netfs_alloc_write_subreq(wreq, stream);\n+\t\tif (!subreq) {\n+\t\t\tret = -ENOMEM;\n+\t\t\tbreak;\n+\t\t}\n+\n+\t\tbvecq_pos_set(&stream->dispatch_cursor, &wreq->load_cursor);\n \n-\t\tsubreq = stream->construct;\n-\t\tsubreq->len = wreq->len;\n-\t\tstream->submit_len = subreq->len;\n+\t\tret = stream->issue_write(subreq);\n+\t\tif (ret < 0 && ret != -EIOCBQUEUED)\n+\t\t\tnetfs_write_subrequest_terminated(subreq, ret);\n \n-\t\tnetfs_issue_write(wreq, stream);\n+\t\tbvecq_pos_unset(&stream->dispatch_cursor);\n \t}\n \n \twreq->submitted = wreq->len;\ndiff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c\nindex 5df5c34d4610..096ddf7a2e5c 100644\n--- a/fs/netfs/write_retry.c\n+++ b/fs/netfs/write_retry.c\n@@ -12,12 +12,43 @@\n #include \"internal.h\"\n \n /*\n- * Perform retries on the streams that need it.\n+ * Prepare the write buffer for a retry.  We can't necessarily reuse the write\n+ * buffer from the previous run of a subrequest because the filesystem is\n+ * permitted to modify it (add headers/trailers, encrypt it).  Further, the\n+ * subrequest may now be a different size (e.g. cifs has to negotiate for\n+ * maximum transfer size).  
Also, we can't look at *stream as that may still\n+ * refer to the source material being broken up into original subrequests.\n+ */\n+int netfs_prepare_write_retry_buffer(struct netfs_io_subrequest *subreq,\n+\t\t\t\t     unsigned int max_segs)\n+{\n+\tstruct netfs_io_request *wreq = subreq->rreq;\n+\tstruct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];\n+\tsize_t len;\n+\n+\tbvecq_pos_set(&subreq->dispatch_pos, &wreq->retry_cursor);\n+\tbvecq_pos_set(&subreq->content, &wreq->retry_cursor);\n+\tlen = bvecq_slice(&wreq->retry_cursor, subreq->len, max_segs, &subreq->nr_segs);\n+\n+\tif (len < subreq->len) {\n+\t\tsubreq->len = len;\n+\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_limited);\n+\t}\n+\n+\tstream->issue_from += len;\n+\tstream->buffered   -= len;\n+\tif (stream->buffered == 0)\n+\t\tbvecq_pos_unset(&wreq->retry_cursor);\n+\treturn 0;\n+}\n+\n+/*\n+ * Perform retries on the streams that need it.  This only has to deal with\n+ * buffered writes; unbuffered write retry is handled in direct_write.c.\n  */\n static void netfs_retry_write_stream(struct netfs_io_request *wreq,\n \t\t\t\t     struct netfs_io_stream *stream)\n {\n-\tstruct bvecq_pos dispatch_cursor = {};\n \tstruct list_head *next;\n \n \t_enter(\"R=%x[%x:]\", wreq->debug_id, stream->stream_nr);\n@@ -32,30 +63,14 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,\n \tif (unlikely(stream->failed))\n \t\treturn;\n \n-\t/* If there's no renegotiation to do, just resend each failed subreq. 
*/\n-\tif (!stream->prepare_write) {\n-\t\tstruct netfs_io_subrequest *subreq;\n-\n-\t\tlist_for_each_entry(subreq, &stream->subrequests, rreq_link) {\n-\t\t\tif (test_bit(NETFS_SREQ_FAILED, &subreq->flags))\n-\t\t\t\tbreak;\n-\t\t\tif (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {\n-\t\t\t\tnetfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);\n-\t\t\t\tnetfs_reissue_write(stream, subreq);\n-\t\t\t}\n-\t\t}\n-\t\treturn;\n-\t}\n-\n \tnext = stream->subrequests.next;\n \n \tdo {\n \t\tstruct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;\n \t\tunsigned long long start, len;\n-\t\tsize_t part;\n-\t\tbool boundary = false;\n+\t\tint ret;\n \n-\t\tbvecq_pos_unset(&dispatch_cursor);\n+\t\tbvecq_pos_unset(&wreq->retry_cursor);\n \n \t\t/* Go through the stream and find the next span of contiguous\n \t\t * data that we then rejig (cifs, for example, needs the wsize\n@@ -73,7 +88,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,\n \t\tlist_for_each_continue(next, &stream->subrequests) {\n \t\t\tsubreq = list_entry(next, struct netfs_io_subrequest, rreq_link);\n \t\t\tif (subreq->start + subreq->transferred != start + len ||\n-\t\t\t    test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||\n \t\t\t    !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))\n \t\t\t\tbreak;\n \t\t\tto = subreq;\n@@ -83,43 +97,40 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,\n \t\t/* Determine the set of buffers we're going to use.  Each\n \t\t * subreq gets a subset of a single overall contiguous buffer.\n \t\t */\n-\t\tbvecq_pos_transfer(&dispatch_cursor, &from->dispatch_pos);\n-\t\tbvecq_pos_advance(&dispatch_cursor, from->transferred);\n+\t\tbvecq_pos_transfer(&wreq->retry_cursor, &from->dispatch_pos);\n+\t\tbvecq_pos_advance(&wreq->retry_cursor, from->transferred);\n+\t\twreq->retry_start = start;\n+\t\twreq->retry_buffered = len;\n \n \t\t/* Work through the sublist. 
*/\n \t\tsubreq = from;\n \t\tlist_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {\n-\t\t\tif (!len)\n+\t\t\tif (!wreq->retry_buffered)\n \t\t\t\tbreak;\n \n-\t\t\tsubreq->start\t= start;\n-\t\t\tsubreq->len\t= len;\n-\t\t\t__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);\n-\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_retry);\n-\n \t\t\tbvecq_pos_unset(&subreq->dispatch_pos);\n \t\t\tbvecq_pos_unset(&subreq->content);\n+\t\t\tsubreq->content.bvecq = NULL;\n+\t\t\tsubreq->content.slot = 0;\n+\t\t\tsubreq->content.offset = 0;\n \n-\t\t\t/* Renegotiate max_len (wsize) */\n-\t\t\tstream->sreq_max_len = len;\n-\t\t\tstream->sreq_max_segs = INT_MAX;\n-\t\t\tstream->prepare_write(subreq);\n-\n-\t\t\tbvecq_pos_set(&subreq->dispatch_pos, &dispatch_cursor);\n-\t\t\tpart = bvecq_slice(&dispatch_cursor,\n-\t\t\t\t\t   umin(len, stream->sreq_max_len),\n-\t\t\t\t\t   stream->sreq_max_segs,\n-\t\t\t\t\t   &subreq->nr_segs);\n-\t\t\tsubreq->len = subreq->transferred + part;\n-\t\t\tsubreq->transferred = 0;\n-\t\t\tlen -= part;\n-\t\t\tstart += part;\n-\t\t\tif (len && subreq == to &&\n-\t\t\t    __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags))\n-\t\t\t\tboundary = true;\n-\n+\t\t\t__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);\n+\t\t\t__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);\n+\t\t\t__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);\n+\t\t\t__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);\n+\t\t\tsubreq->start\t\t= wreq->retry_start;\n+\t\t\tsubreq->len\t\t= wreq->retry_buffered;\n+\t\t\tsubreq->transferred\t= 0;\n+\t\t\tsubreq->retry_count\t+= 1;\n+\t\t\tsubreq->error\t\t= 0;\n+\n+\t\t\tnetfs_stat(&netfs_n_wh_retry_write_subreq);\n+\t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_retry);\n \t\t\tnetfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);\n-\t\t\tnetfs_reissue_write(stream, subreq);\n+\t\t\tret = stream->issue_write(subreq);\n+\t\t\tif (ret < 0 && ret != -EIOCBQUEUED)\n+\t\t\t\tnetfs_write_subrequest_terminated(subreq, 
ret);\n+\n \t\t\tif (subreq == to)\n \t\t\t\tbreak;\n \t\t}\n@@ -160,12 +171,9 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,\n \t\t\tto = list_next_entry(to, rreq_link);\n \t\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_retry);\n \n-\t\t\tstream->sreq_max_len\t= len;\n-\t\t\tstream->sreq_max_segs\t= INT_MAX;\n \t\t\tswitch (stream->source) {\n \t\t\tcase NETFS_UPLOAD_TO_SERVER:\n \t\t\t\tnetfs_stat(&netfs_n_wh_upload);\n-\t\t\t\tstream->sreq_max_len = umin(len, wreq->wsize);\n \t\t\t\tbreak;\n \t\t\tcase NETFS_WRITE_TO_CACHE:\n \t\t\t\tnetfs_stat(&netfs_n_wh_write);\n@@ -174,32 +182,16 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,\n \t\t\t\tWARN_ON_ONCE(1);\n \t\t\t}\n \n-\t\t\tstream->prepare_write(subreq);\n-\n-\t\t\tbvecq_pos_set(&subreq->dispatch_pos, &dispatch_cursor);\n-\t\t\tpart = bvecq_slice(&dispatch_cursor,\n-\t\t\t\t\t   umin(len, stream->sreq_max_len),\n-\t\t\t\t\t   stream->sreq_max_segs,\n-\t\t\t\t\t   &subreq->nr_segs);\n-\t\t\tsubreq->len = subreq->transferred + part;\n-\n-\t\t\tlen -= part;\n-\t\t\tstart += part;\n-\t\t\tif (!len && boundary) {\n-\t\t\t\t__set_bit(NETFS_SREQ_BOUNDARY, &to->flags);\n-\t\t\t\tboundary = false;\n-\t\t\t}\n-\n-\t\t\tnetfs_reissue_write(stream, subreq);\n-\t\t\tif (!len)\n-\t\t\t\tbreak;\n+\t\t\tret = stream->issue_write(subreq);\n+\t\t\tif (ret < 0 && ret != -EIOCBQUEUED)\n+\t\t\t\tnetfs_write_subrequest_terminated(subreq, ret);\n \n \t\t} while (len);\n \n \t} while (!list_is_head(next, &stream->subrequests));\n \n out:\n-\tbvecq_pos_unset(&dispatch_cursor);\n+\tbvecq_pos_unset(&wreq->retry_cursor);\n }\n \n /*\n@@ -237,4 +229,6 @@ void netfs_retry_writes(struct netfs_io_request *wreq)\n \t\t\tnetfs_retry_write_stream(wreq, stream);\n \t\t}\n \t}\n+\n+\tpr_notice(\"Retrying\\n\");\n }\ndiff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig\nindex 12cb0ca738af..ae463867cf01 100644\n--- a/fs/nfs/Kconfig\n+++ b/fs/nfs/Kconfig\n@@ -173,6 +173,7 @@ config NFS_FSCACHE\n \tbool 
\"Provide NFS client caching support\"\n \tdepends on NFS_FS\n \tselect NETFS_SUPPORT\n+\tselect NETFS_PGPRIV2\n \tselect FSCACHE\n \thelp\n \t  Say Y here if you want NFS data to be cached locally on disc through\ndiff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c\nindex 9b7fdad4a920..bc82821d77a3 100644\n--- a/fs/nfs/fscache.c\n+++ b/fs/nfs/fscache.c\n@@ -273,8 +273,6 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi\n \trreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);\n \t/* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */\n \t__set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);\n-\trreq->io_streams[0].sreq_max_len = NFS_SB(rreq->inode->i_sb)->rsize;\n-\n \treturn 0;\n }\n \n@@ -296,8 +294,9 @@ static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sre\n \treturn netfs;\n }\n \n-static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)\n+static int nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)\n {\n+\tstruct netfs_io_request\t\t*rreq = sreq->rreq;\n \tstruct nfs_netfs_io_data\t*netfs;\n \tstruct nfs_pageio_descriptor\tpgio;\n \tstruct inode *inode = sreq->rreq->inode;\n@@ -307,6 +306,13 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)\n \tpgoff_t start, last;\n \tint err;\n \n+\tif (sreq->len > NFS_SB(rreq->inode->i_sb)->rsize)\n+\t\tsreq->len = NFS_SB(rreq->inode->i_sb)->rsize;\n+\n+\terr = netfs_prepare_read_buffer(sreq, INT_MAX);\n+\tif (err < 0)\n+\t\treturn err;\n+\n \tstart = (sreq->start + sreq->transferred) >> PAGE_SHIFT;\n \tlast = ((sreq->start + sreq->len - sreq->transferred - 1) >> PAGE_SHIFT);\n \n@@ -314,14 +320,15 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)\n \t\t\t     &nfs_async_read_completion_ops);\n \n \tnetfs = nfs_netfs_alloc(sreq);\n-\tif (!netfs) {\n-\t\tsreq->error = -ENOMEM;\n-\t\treturn netfs_read_subreq_terminated(sreq);\n-\t}\n+\tif (!netfs)\n+\t\treturn -ENOMEM;\n+\n+\t/* After this 
point, we're not allowed to return an error. */\n+\tnetfs_mark_read_submission(sreq);\n \n \tpgio.pg_netfs = netfs; /* used in completion */\n \n-\txa_for_each_range(&sreq->rreq->mapping->i_pages, idx, page, start, last) {\n+\txa_for_each_range(&rreq->mapping->i_pages, idx, page, start, last) {\n \t\t/* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs  */\n \t\terr = nfs_read_add_folio(&pgio, ctx, page_folio(page));\n \t\tif (err < 0) {\n@@ -332,6 +339,7 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)\n out:\n \tnfs_pageio_complete_read(&pgio);\n \tnfs_netfs_put(netfs);\n+\treturn -EIOCBQUEUED;\n }\n \n void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr)\ndiff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c\nindex 3990a9012264..dc9120802edb 100644\n--- a/fs/smb/client/cifssmb.c\n+++ b/fs/smb/client/cifssmb.c\n@@ -1466,8 +1466,7 @@ cifs_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid)\n \tstruct netfs_inode *ictx = netfs_inode(rdata->rreq->inode);\n \tstruct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);\n \tstruct smb_rqst rqst = { .rq_iov = rdata->iov,\n-\t\t\t\t .rq_nvec = 1,\n-\t\t\t\t .rq_iter = rdata->subreq.io_iter };\n+\t\t\t\t .rq_nvec = 1};\n \tstruct cifs_credits credits = {\n \t\t.value = 1,\n \t\t.instance = 0,\n@@ -1481,6 +1480,11 @@ cifs_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid)\n \t\t __func__, mid->mid, mid->mid_state, rdata->result,\n \t\t rdata->subreq.len);\n \n+\tif (rdata->got_bytes)\n+\t\tiov_iter_bvec_queue(&rqst.rq_iter, ITER_DEST,\n+\t\t\t\t    rdata->subreq.content.bvecq, rdata->subreq.content.slot,\n+\t\t\t\t    rdata->subreq.content.offset, rdata->subreq.len);\n+\n \tswitch (mid->mid_state) {\n \tcase MID_RESPONSE_RECEIVED:\n \t\t/* result already set, check signature */\n@@ -2002,7 +2006,10 @@ cifs_async_writev(struct cifs_io_subrequest *wdata)\n \n \trqst.rq_iov = iov;\n \trqst.rq_nvec = 1;\n-\trqst.rq_iter = 
wdata->subreq.io_iter;\n+\n+\tiov_iter_bvec_queue(&rqst.rq_iter, ITER_SOURCE,\n+\t\t\t    wdata->subreq.content.bvecq, wdata->subreq.content.slot,\n+\t\t\t    wdata->subreq.content.offset, wdata->subreq.len);\n \n \tcifs_dbg(FYI, \"async write at %llu %zu bytes\\n\",\n \t\t wdata->subreq.start, wdata->subreq.len);\ndiff --git a/fs/smb/client/file.c b/fs/smb/client/file.c\nindex cffcf82c1b69..a933c12b39ea 100644\n--- a/fs/smb/client/file.c\n+++ b/fs/smb/client/file.c\n@@ -44,18 +44,34 @@ static int cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush);\n  * Prepare a subrequest to upload to the server.  We need to allocate credits\n  * so that we know the maximum amount of data that we can include in it.\n  */\n-static void cifs_prepare_write(struct netfs_io_subrequest *subreq)\n+static int cifs_estimate_write(struct netfs_io_request *wreq,\n+\t\t\t       struct netfs_io_stream *stream,\n+\t\t\t       struct netfs_write_estimate *estimate)\n+{\n+\tstruct cifs_sb_info *cifs_sb = CIFS_SB(wreq->inode->i_sb);\n+\n+\testimate->issue_at = stream->issue_from + cifs_sb->ctx->wsize;\n+\treturn 0;\n+}\n+\n+/*\n+ * Issue a subrequest to upload to the server.\n+ */\n+static int cifs_issue_write(struct netfs_io_subrequest *subreq)\n {\n \tstruct cifs_io_subrequest *wdata =\n \t\tcontainer_of(subreq, struct cifs_io_subrequest, subreq);\n \tstruct cifs_io_request *req = wdata->req;\n-\tstruct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr];\n \tstruct TCP_Server_Info *server;\n \tstruct cifsFileInfo *open_file = req->cfile;\n-\tstruct cifs_sb_info *cifs_sb = CIFS_SB(wdata->rreq->inode->i_sb);\n-\tsize_t wsize = req->rreq.wsize;\n+\tstruct cifs_sb_info *cifs_sb = CIFS_SB(subreq->rreq->inode->i_sb);\n+\tunsigned int max_segs = INT_MAX;\n+\tsize_t len;\n \tint rc;\n \n+\tif (cifs_forced_shutdown(cifs_sb))\n+\t\treturn smb_EIO(smb_eio_trace_forced_shutdown);\n+\n \tif (!wdata->have_xid) {\n \t\twdata->xid = get_xid();\n \t\twdata->have_xid = true;\n@@ -74,18 
+90,16 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq)\n \t\tif (rc < 0) {\n \t\t\tif (rc == -EAGAIN)\n \t\t\t\tgoto retry;\n-\t\t\tsubreq->error = rc;\n-\t\t\treturn netfs_prepare_write_failed(subreq);\n+\t\t\treturn rc;\n \t\t}\n \t}\n \n-\trc = server->ops->wait_mtu_credits(server, wsize, &stream->sreq_max_len,\n-\t\t\t\t\t   &wdata->credits);\n-\tif (rc < 0) {\n-\t\tsubreq->error = rc;\n-\t\treturn netfs_prepare_write_failed(subreq);\n-\t}\n+\tlen = umin(subreq->len, cifs_sb->ctx->wsize);\n+\trc = server->ops->wait_mtu_credits(server, len, &len, &wdata->credits);\n+\tif (rc < 0)\n+\t\treturn rc;\n \n+\tsubreq->len = len;\n \twdata->credits.rreq_debug_id = subreq->rreq->debug_id;\n \twdata->credits.rreq_debug_index = subreq->debug_index;\n \twdata->credits.in_flight_check = 1;\n@@ -101,39 +115,29 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq)\n \t\tconst struct smbdirect_socket_parameters *sp =\n \t\t\tsmbd_get_parameters(server->smbd_conn);\n \n-\t\tstream->sreq_max_segs = sp->max_frmr_depth;\n+\t\tmax_segs = sp->max_frmr_depth;\n \t}\n #endif\n-}\n-\n-/*\n- * Issue a subrequest to upload to the server.\n- */\n-static void cifs_issue_write(struct netfs_io_subrequest *subreq)\n-{\n-\tstruct cifs_io_subrequest *wdata =\n-\t\tcontainer_of(subreq, struct cifs_io_subrequest, subreq);\n-\tstruct cifs_sb_info *sbi = CIFS_SB(subreq->rreq->inode->i_sb);\n-\tint rc;\n \n-\tif (cifs_forced_shutdown(sbi)) {\n-\t\trc = smb_EIO(smb_eio_trace_forced_shutdown);\n-\t\tgoto fail;\n+\trc = netfs_prepare_write_buffer(subreq, max_segs);\n+\tif (rc < 0) {\n+\t\tadd_credits_and_wake_if(wdata->server, &wdata->credits, 0);\n+\t\treturn rc;\n \t}\n \n-\trc = adjust_credits(wdata->server, wdata, cifs_trace_rw_credits_issue_write_adjust);\n+\trc = adjust_credits(server, wdata, cifs_trace_rw_credits_issue_write_adjust);\n \tif (rc)\n-\t\tgoto fail;\n+\t\tgoto fail_with_credits;\n \n \trc = -EAGAIN;\n \tif 
(wdata->req->cfile->invalidHandle)\n-\t\tgoto fail;\n+\t\tgoto fail_with_credits;\n \n \twdata->server->ops->async_writev(wdata);\n out:\n-\treturn;\n+\treturn -EIOCBQUEUED;\n \n-fail:\n+fail_with_credits:\n \tif (rc == -EAGAIN)\n \t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_retry);\n \telse\n@@ -149,17 +153,25 @@ static void cifs_netfs_invalidate_cache(struct netfs_io_request *wreq)\n }\n \n /*\n- * Negotiate the size of a read operation on behalf of the netfs library.\n+ * Issue a read operation on behalf of the netfs helper functions.  We're asked\n+ * to make a read of a certain size at a point in the file.  We are permitted\n+ * to only read a portion of that, but as long as we read something, the netfs\n+ * helper will call us again so that we can issue another read.\n  */\n-static int cifs_prepare_read(struct netfs_io_subrequest *subreq)\n+static int cifs_issue_read(struct netfs_io_subrequest *subreq)\n {\n \tstruct netfs_io_request *rreq = subreq->rreq;\n \tstruct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);\n \tstruct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);\n-\tstruct TCP_Server_Info *server;\n+\tstruct TCP_Server_Info *server = rdata->server;\n \tstruct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);\n-\tsize_t size;\n-\tint rc = 0;\n+\tunsigned int max_segs = INT_MAX;\n+\tsize_t len;\n+\tint rc;\n+\n+\tcifs_dbg(FYI, \"%s: op=%08x[%x] mapping=%p len=%zu/%zu\\n\",\n+\t\t __func__, rreq->debug_id, subreq->debug_index, rreq->mapping,\n+\t\t subreq->transferred, subreq->len);\n \n \tif (!rdata->have_xid) {\n \t\trdata->xid = get_xid();\n@@ -173,17 +185,15 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)\n \t\tcifs_negotiate_rsize(server, cifs_sb->ctx,\n \t\t\t\t     tlink_tcon(req->cfile->tlink));\n \n-\trc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,\n-\t\t\t\t\t   &size, &rdata->credits);\n+\tlen = umin(subreq->len, 
cifs_sb->ctx->rsize);\n+\trc = server->ops->wait_mtu_credits(server, len, &len, &rdata->credits);\n \tif (rc)\n \t\treturn rc;\n \n-\trreq->io_streams[0].sreq_max_len = size;\n-\n-\trdata->credits.in_flight_check = 1;\n+\tsubreq->len = len;\n \trdata->credits.rreq_debug_id = rreq->debug_id;\n \trdata->credits.rreq_debug_index = subreq->debug_index;\n-\n+\trdata->credits.in_flight_check = 1;\n \ttrace_smb3_rw_credits(rdata->rreq->debug_id,\n \t\t\t      rdata->subreq.debug_index,\n \t\t\t      rdata->credits.value,\n@@ -195,33 +205,17 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)\n \t\tconst struct smbdirect_socket_parameters *sp =\n \t\t\tsmbd_get_parameters(server->smbd_conn);\n \n-\t\trreq->io_streams[0].sreq_max_segs = sp->max_frmr_depth;\n+\t\tmax_segs = sp->max_frmr_depth;\n \t}\n #endif\n-\treturn 0;\n-}\n-\n-/*\n- * Issue a read operation on behalf of the netfs helper functions.  We're asked\n- * to make a read of a certain size at a point in the file.  We are permitted\n- * to only read a portion of that, but as long as we read something, the netfs\n- * helper will call us again so that we can issue another read.\n- */\n-static void cifs_issue_read(struct netfs_io_subrequest *subreq)\n-{\n-\tstruct netfs_io_request *rreq = subreq->rreq;\n-\tstruct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);\n-\tstruct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);\n-\tstruct TCP_Server_Info *server = rdata->server;\n-\tint rc = 0;\n \n-\tcifs_dbg(FYI, \"%s: op=%08x[%x] mapping=%p len=%zu/%zu\\n\",\n-\t\t __func__, rreq->debug_id, subreq->debug_index, rreq->mapping,\n-\t\t subreq->transferred, subreq->len);\n+\trc = netfs_prepare_read_buffer(subreq, max_segs);\n+\tif (rc < 0)\n+\t\tgoto fail_with_credits;\n \n \trc = adjust_credits(server, rdata, cifs_trace_rw_credits_issue_read_adjust);\n \tif (rc)\n-\t\tgoto failed;\n+\t\tgoto fail_with_credits;\n \n \tif 
(req->cfile->invalidHandle) {\n \t\tdo {\n@@ -235,15 +229,24 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq)\n \t    subreq->rreq->origin != NETFS_DIO_READ)\n \t\t__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);\n \n-\ttrace_netfs_sreq(subreq, netfs_sreq_trace_submit);\n+\t/* After this point, we're not allowed to return an error. */\n+\tnetfs_mark_read_submission(subreq);\n+\n \trc = rdata->server->ops->async_readv(rdata);\n-\tif (rc)\n-\t\tgoto failed;\n-\treturn;\n+\tif (rc) {\n+\t\tsubreq->error = rc;\n+\t\tnetfs_read_subreq_terminated(subreq);\n+\t}\n+\treturn -EIOCBQUEUED;\n \n+fail_with_credits:\n+\tif (rc == -EAGAIN)\n+\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_retry);\n+\telse\n+\t\ttrace_netfs_sreq(subreq, netfs_sreq_trace_fail);\n+\tadd_credits_and_wake_if(rdata->server, &rdata->credits, 0);\n failed:\n-\tsubreq->error = rc;\n-\tnetfs_read_subreq_terminated(subreq);\n+\treturn rc;\n }\n \n /*\n@@ -353,11 +356,10 @@ const struct netfs_request_ops cifs_req_ops = {\n \t.init_request\t\t= cifs_init_request,\n \t.free_request\t\t= cifs_free_request,\n \t.free_subrequest\t= cifs_free_subrequest,\n-\t.prepare_read\t\t= cifs_prepare_read,\n \t.issue_read\t\t= cifs_issue_read,\n \t.done\t\t\t= cifs_rreq_done,\n \t.begin_writeback\t= cifs_begin_writeback,\n-\t.prepare_write\t\t= cifs_prepare_write,\n+\t.estimate_write\t\t= cifs_estimate_write,\n \t.issue_write\t\t= cifs_issue_write,\n \t.invalidate_cache\t= cifs_netfs_invalidate_cache,\n };\ndiff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c\nindex 0d19c8fc4c3d..d15f196df1e7 100644\n--- a/fs/smb/client/smb2ops.c\n+++ b/fs/smb/client/smb2ops.c\n@@ -4705,6 +4705,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,\n \tunsigned int cur_page_idx;\n \tunsigned int pad_len;\n \tstruct cifs_io_subrequest *rdata = mid->callback_data;\n+\tstruct iov_iter iter;\n \tstruct smb2_hdr *shdr = (struct smb2_hdr *)buf;\n \tsize_t copied;\n \tbool use_rdma_mr = false;\n@@ 
-4777,6 +4778,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,\n \n \tpad_len = data_offset - server->vals->read_rsp_size;\n \n+\tiov_iter_bvec_queue(&iter, ITER_DEST,\n+\t\t\t    rdata->subreq.content.bvecq, rdata->subreq.content.slot,\n+\t\t\t    rdata->subreq.content.offset, rdata->subreq.len);\n+\n \tif (buf_len <= data_offset) {\n \t\t/* read response payload is in pages */\n \t\tcur_page_idx = pad_len / PAGE_SIZE;\n@@ -4806,7 +4811,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,\n \n \t\t/* Copy the data to the output I/O iterator. */\n \t\trdata->result = cifs_copy_bvecq_to_iter(buffer, buffer_len,\n-\t\t\t\t\t\t\tcur_off, &rdata->subreq.io_iter);\n+\t\t\t\t\t\t\tcur_off, &iter);\n \t\tif (rdata->result != 0) {\n \t\t\tif (is_offloaded)\n \t\t\t\tmid->mid_state = MID_RESPONSE_MALFORMED;\n@@ -4819,7 +4824,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,\n \t} else if (buf_len >= data_offset + data_len) {\n \t\t/* read response payload is in buf */\n \t\tWARN_ONCE(buffer, \"read data can be either in buf or in buffer\");\n-\t\tcopied = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter);\n+\t\tcopied = copy_to_iter(buf + data_offset, data_len, &iter);\n \t\tif (copied == 0)\n \t\t\treturn smb_EIO2(smb_eio_trace_rx_copy_to_iter, copied, data_len);\n \t\trdata->got_bytes = copied;\ndiff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c\nindex c43ca74e8704..717d65d32dd3 100644\n--- a/fs/smb/client/smb2pdu.c\n+++ b/fs/smb/client/smb2pdu.c\n@@ -4539,9 +4539,13 @@ smb2_new_read_req(void **buf, unsigned int *total_len,\n \t */\n \tif (rdata && smb3_use_rdma_offload(io_parms)) {\n \t\tstruct smbdirect_buffer_descriptor_v1 *v1;\n+\t\tstruct iov_iter iter;\n \t\tbool need_invalidate = server->dialect == SMB30_PROT_ID;\n \n-\t\trdata->mr = smbd_register_mr(server->smbd_conn, &rdata->subreq.io_iter,\n+\t\tiov_iter_bvec_queue(&iter, ITER_DEST,\n+\t\t\t\t    
rdata->subreq.content.bvecq, rdata->subreq.content.slot,\n+\t\t\t\t    rdata->subreq.content.offset, rdata->subreq.len);\n+\t\trdata->mr = smbd_register_mr(server->smbd_conn, &iter,\n \t\t\t\t\t     true, need_invalidate);\n \t\tif (!rdata->mr)\n \t\t\treturn -EAGAIN;\n@@ -4606,9 +4610,10 @@ smb2_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid)\n \tunsigned int rreq_debug_id = rdata->rreq->debug_id;\n \tunsigned int subreq_debug_index = rdata->subreq.debug_index;\n \n-\tif (rdata->got_bytes) {\n-\t\trqst.rq_iter\t  = rdata->subreq.io_iter;\n-\t}\n+\tif (rdata->got_bytes)\n+\t\tiov_iter_bvec_queue(&rqst.rq_iter, ITER_DEST,\n+\t\t\t\t    rdata->subreq.content.bvecq, rdata->subreq.content.slot,\n+\t\t\t\t    rdata->subreq.content.offset, rdata->subreq.len);\n \n \tWARN_ONCE(rdata->server != server,\n \t\t  \"rdata server %p != mid server %p\",\n@@ -5096,7 +5101,9 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)\n \t\tgoto out;\n \n \trqst.rq_iov = iov;\n-\trqst.rq_iter = wdata->subreq.io_iter;\n+\tiov_iter_bvec_queue(&rqst.rq_iter, ITER_SOURCE,\n+\t\t\t    wdata->subreq.content.bvecq, wdata->subreq.content.slot,\n+\t\t\t    wdata->subreq.content.offset, wdata->subreq.len);\n \n \trqst.rq_iov[0].iov_len = total_len - 1;\n \trqst.rq_iov[0].iov_base = (char *)req;\n@@ -5135,9 +5142,14 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)\n \t */\n \tif (smb3_use_rdma_offload(io_parms)) {\n \t\tstruct smbdirect_buffer_descriptor_v1 *v1;\n+\t\tstruct iov_iter iter;\n \t\tbool need_invalidate = server->dialect == SMB30_PROT_ID;\n \n-\t\twdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter,\n+\t\tiov_iter_bvec_queue(&iter, ITER_SOURCE,\n+\t\t\t\t    wdata->subreq.content.bvecq, wdata->subreq.content.slot,\n+\t\t\t\t    wdata->subreq.content.offset, wdata->subreq.len);\n+\n+\t\twdata->mr = smbd_register_mr(server->smbd_conn, &iter,\n \t\t\t\t\t     false, need_invalidate);\n \t\tif (!wdata->mr) {\n \t\t\trc = -EAGAIN;\n@@ 
-5176,8 +5188,8 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)\n \t\tsmb2_set_replay(server, &rqst);\n \t}\n \n-\tcifs_dbg(FYI, \"async write at %llu %u bytes iter=%zx\\n\",\n-\t\t io_parms->offset, io_parms->length, iov_iter_count(&wdata->subreq.io_iter));\n+\tcifs_dbg(FYI, \"async write at %llu %u bytes len=%zx\\n\",\n+\t\t io_parms->offset, io_parms->length, wdata->subreq.len);\n \n \tif (wdata->credits.value > 0) {\n \t\tshdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->subreq.len,\ndiff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c\nindex 05f8099047e1..dd1313736fcb 100644\n--- a/fs/smb/client/transport.c\n+++ b/fs/smb/client/transport.c\n@@ -1264,12 +1264,19 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)\n \t}\n \n #ifdef CONFIG_CIFS_SMB_DIRECT\n-\tif (rdata->mr)\n+\tif (rdata->mr) {\n \t\tlength = data_len; /* An RDMA read is already done. */\n-\telse\n+\t} else {\n+#endif\n+\t\tstruct iov_iter iter;\n+\n+\t\tiov_iter_bvec_queue(&iter, ITER_DEST, rdata->subreq.content.bvecq,\n+\t\t\t\t    rdata->subreq.content.slot, rdata->subreq.content.offset,\n+\t\t\t\t    data_len);\n+\t\tlength = cifs_read_iter_from_socket(server, &iter, data_len);\n+#ifdef CONFIG_CIFS_SMB_DIRECT\n+\t}\n #endif\n-\t\tlength = cifs_read_iter_from_socket(server, &rdata->subreq.io_iter,\n-\t\t\t\t\t\t    data_len);\n \tif (length > 0)\n \t\trdata->got_bytes += length;\n \tserver->total_read += length;\ndiff --git a/include/linux/netfs.h b/include/linux/netfs.h\nindex 65e39f9b0c10..51c021975f0d 100644\n--- a/include/linux/netfs.h\n+++ b/include/linux/netfs.h\n@@ -66,7 +66,7 @@ struct netfs_inode {\n #endif\n \tstruct mutex\t\twb_lock;\t/* Writeback serialisation */\n \tloff_t\t\t\tremote_i_size;\t/* Size of the remote file */\n-\tloff_t\t\t\tzero_point;\t/* Size after which we assume there's no data\n+\tunsigned long long\tzero_point;\t/* Size after which we assume there's no data\n \t\t\t\t\t\t * on the server */\n 
\tatomic_t\t\tio_count;\t/* Number of outstanding reqs */\n \tunsigned long\t\tflags;\n@@ -126,25 +126,39 @@ static inline struct netfs_group *netfs_folio_group(struct folio *folio)\n \treturn priv;\n }\n \n+/*\n+ * Estimate of maximum write subrequest for writeback.  The filesystem is\n+ * responsible for filling this in when called from ->estimate_write(), though\n+ * netfslib will preset infinite defaults.\n+ */\n+struct netfs_write_estimate {\n+\tunsigned long long\tissue_at;\t/* Point at which we must submit */\n+\tint\t\t\tmax_segs;\t/* Max number of segments in a single RPC */\n+};\n+\n /*\n  * Stream of I/O subrequests going to a particular destination, such as the\n  * server or the local cache.  This is mainly intended for writing where we may\n  * have to write to multiple destinations concurrently.\n  */\n struct netfs_io_stream {\n-\t/* Submission tracking */\n-\tstruct netfs_io_subrequest *construct;\t/* Op being constructed */\n-\tsize_t\t\t\tsreq_max_len;\t/* Maximum size of a subrequest */\n-\tunsigned int\t\tsreq_max_segs;\t/* 0 or max number of segments in an iterator */\n-\tunsigned int\t\tsubmit_off;\t/* Folio offset we're submitting from */\n-\tunsigned int\t\tsubmit_len;\t/* Amount of data left to submit */\n-\tvoid (*prepare_write)(struct netfs_io_subrequest *subreq);\n-\tvoid (*issue_write)(struct netfs_io_subrequest *subreq);\n+\t/* Submission tracking (main dispatch only; not retry) */\n+\tstruct bvecq_pos\tdispatch_cursor; /* Point from which buffers are dispatched */\n+\tunsigned long long\tissue_from;\t/* Current issue point */\n+\tsize_t\t\t\tbuffered;\t/* Amount in buffer */\n+\tu8\t\t\tapplicable;\t/* What sources are applicable (NOTE_* mask) */\n+\tbool\t\t\tbuffering;\t/* T if buffering on this stream */\n+\tint (*estimate_write)(struct netfs_io_request *wreq,\n+\t\t\t      struct netfs_io_stream *stream,\n+\t\t\t      struct netfs_write_estimate *estimate);\n+\tint (*issue_write)(struct netfs_io_subrequest 
*subreq);\n+\tatomic64_t\t\tissued_to;\t/* Point to which can be considered issued */\n+\n \t/* Collection tracking */\n \tstruct list_head\tsubrequests;\t/* Contributory I/O operations */\n \tunsigned long long\tcollected_to;\t/* Position we've collected results to */\n \tsize_t\t\t\ttransferred;\t/* The amount transferred from this stream */\n-\tunsigned short\t\terror;\t\t/* Aggregate error for the stream */\n+\tshort\t\t\terror;\t\t/* Aggregate error for the stream */\n \tenum netfs_io_source\tsource;\t\t/* Where to read from/write to */\n \tunsigned char\t\tstream_nr;\t/* Index of stream in parent table */\n \tbool\t\t\tavail;\t\t/* T if stream is available */\n@@ -180,14 +194,13 @@ struct netfs_io_subrequest {\n \tstruct list_head\trreq_link;\t/* Link in rreq->subrequests */\n \tstruct bvecq_pos\tdispatch_pos;\t/* Bookmark in the combined queue of the start */\n \tstruct bvecq_pos\tcontent;\t/* The (copied) content of the subrequest */\n-\tstruct iov_iter\t\tio_iter;\t/* Iterator for this subrequest */\n \tunsigned long long\tstart;\t\t/* Where to start the I/O */\n \tsize_t\t\t\tlen;\t\t/* Size of the I/O */\n \tsize_t\t\t\ttransferred;\t/* Amount of data transferred */\n+\tunsigned int\t\tnr_segs;\t/* Number of segments in content */\n \trefcount_t\t\tref;\n \tshort\t\t\terror;\t\t/* 0 or error that occurred */\n \tunsigned short\t\tdebug_index;\t/* Index in list (for debugging output) */\n-\tunsigned int\t\tnr_segs;\t/* Number of segs in io_iter */\n \tu8\t\t\tretry_count;\t/* The number of retries (0 on initial pass) */\n \tenum netfs_io_source\tsource;\t\t/* Where to read from/write to */\n \tunsigned char\t\tstream_nr;\t/* I/O stream this belongs to */\n@@ -196,7 +209,6 @@ struct netfs_io_subrequest {\n #define NETFS_SREQ_CLEAR_TAIL\t\t1\t/* Set if the rest of the read should be cleared */\n #define NETFS_SREQ_MADE_PROGRESS\t4\t/* Set if we transferred at least some data */\n #define NETFS_SREQ_ONDEMAND\t\t5\t/* Set if it's from on-demand read mode 
*/\n-#define NETFS_SREQ_BOUNDARY\t\t6\t/* Set if ends on hard boundary (eg. ceph object) */\n #define NETFS_SREQ_HIT_EOF\t\t7\t/* Set if short due to EOF */\n #define NETFS_SREQ_IN_PROGRESS\t\t8\t/* Unlocked when the subrequest completes */\n #define NETFS_SREQ_NEED_RETRY\t\t9\t/* Set if the filesystem requests a retry */\n@@ -243,22 +255,25 @@ struct netfs_io_request {\n \tstruct netfs_group\t*group;\t\t/* Writeback group being written back */\n \tstruct bvecq_pos\tcollect_cursor;\t/* Clear-up point of I/O buffer */\n \tstruct bvecq_pos\tload_cursor;\t/* Point at which new folios are loaded in */\n-\tstruct bvecq_pos\tdispatch_cursor; /* Point from which buffers are dispatched */\n+\tstruct bvecq_pos\tretry_cursor;\t/* Point from which retries are dispatched */\n \twait_queue_head_t\twaitq;\t\t/* Processor waiter */\n \tvoid\t\t\t*netfs_priv;\t/* Private data for the netfs */\n \tvoid\t\t\t*netfs_priv2;\t/* Private data for the netfs */\n-\tunsigned long long\tlast_end;\t/* End pos of last folio submitted */\n \tunsigned long long\tsubmitted;\t/* Amount submitted for I/O so far */\n \tunsigned long long\tlen;\t\t/* Length of the request */\n \tsize_t\t\t\ttransferred;\t/* Amount to be indicated as transferred */\n \tlong\t\t\terror;\t\t/* 0 or error that occurred */\n \tunsigned long long\ti_size;\t\t/* Size of the file */\n \tunsigned long long\tstart;\t\t/* Start position */\n-\tatomic64_t\t\tissued_to;\t/* Write issuer folio cursor */\n \tunsigned long long\tcollected_to;\t/* Point we've collected to */\n \tunsigned long long\tcache_coll_to;\t/* Point the cache has collected to */\n \tunsigned long long\tcleaned_to;\t/* Position we've cleaned folios to */\n \tunsigned long long\tabandon_to;\t/* Position to abandon folios to */\n+#ifdef CONFIG_NETFS_PGPRIV2\n+\tunsigned long long\tlast_end;\t/* End of last folio added */\n+#endif\n+\tunsigned long long\tretry_start;\t/* Position to retry from */\n+\tsize_t\t\t\tretry_buffered;\t/* Amount of data to retry */\n 
\tpgoff_t\t\t\tno_unlock_folio; /* Don't unlock this folio after read */\n \tunsigned int\t\tdebug_id;\n \tunsigned int\t\trsize;\t\t/* Maximum read size (0 for none) */\n@@ -282,8 +297,10 @@ struct netfs_io_request {\n #define NETFS_RREQ_UPLOAD_TO_SERVER\t11\t/* Need to write to the server */\n #define NETFS_RREQ_USE_IO_ITER\t\t12\t/* Use ->io_iter rather than ->i_pages */\n #define NETFS_RREQ_NEED_PUT_RA_REFS\t13\t/* Need to put the folio refs RA gave us */\n+#ifdef CONFIG_NETFS_PGPRIV2\n #define NETFS_RREQ_USE_PGPRIV2\t\t31\t/* [DEPRECATED] Use PG_private_2 to mark\n \t\t\t\t\t\t * write to cache on read */\n+#endif\n \tconst struct netfs_request_ops *netfs_ops;\n };\n \n@@ -299,8 +316,7 @@ struct netfs_request_ops {\n \n \t/* Read request handling */\n \tvoid (*expand_readahead)(struct netfs_io_request *rreq);\n-\tint (*prepare_read)(struct netfs_io_subrequest *subreq);\n-\tvoid (*issue_read)(struct netfs_io_subrequest *subreq);\n+\tint (*issue_read)(struct netfs_io_subrequest *subreq);\n \tbool (*is_still_valid)(struct netfs_io_request *rreq);\n \tint (*check_write_begin)(struct file *file, loff_t pos, unsigned len,\n \t\t\t\t struct folio **foliop, void **_fsdata);\n@@ -312,8 +328,10 @@ struct netfs_request_ops {\n \n \t/* Write request handling */\n \tvoid (*begin_writeback)(struct netfs_io_request *wreq);\n-\tvoid (*prepare_write)(struct netfs_io_subrequest *subreq);\n-\tvoid (*issue_write)(struct netfs_io_subrequest *subreq);\n+\tint (*estimate_write)(struct netfs_io_request *wreq,\n+\t\t\t      struct netfs_io_stream *stream,\n+\t\t\t      struct netfs_write_estimate *estimate);\n+\tint (*issue_write)(struct netfs_io_subrequest *subreq);\n \tvoid (*retry_request)(struct netfs_io_request *wreq, struct netfs_io_stream *stream);\n \tvoid (*invalidate_cache)(struct netfs_io_request *wreq);\n };\n@@ -348,8 +366,16 @@ struct netfs_cache_ops {\n \t\t     netfs_io_terminated_t term_func,\n \t\t     void *term_func_priv);\n \n+\t/* Estimate the amount of data that 
can be written in an op. */\n+\tint (*estimate_write)(struct netfs_io_request *wreq,\n+\t\t\t      struct netfs_io_stream *stream,\n+\t\t\t      struct netfs_write_estimate *estimate);\n+\n+\t/* Read data from the cache for a netfs subrequest. */\n+\tint (*issue_read)(struct netfs_io_subrequest *subreq);\n+\n \t/* Write data to the cache from a netfs subrequest. */\n-\tvoid (*issue_write)(struct netfs_io_subrequest *subreq);\n+\tint (*issue_write)(struct netfs_io_subrequest *subreq);\n \n \t/* Expand readahead request */\n \tvoid (*expand_readahead)(struct netfs_cache_resources *cres,\n@@ -357,25 +383,6 @@ struct netfs_cache_ops {\n \t\t\t\t unsigned long long *_len,\n \t\t\t\t unsigned long long i_size);\n \n-\t/* Prepare a read operation, shortening it to a cached/uncached\n-\t * boundary as appropriate.\n-\t */\n-\tint (*prepare_read)(struct netfs_io_subrequest *subreq);\n-\n-\t/* Prepare a write subrequest, working out if we're allowed to do it\n-\t * and finding out the maximum amount of data to gather before\n-\t * attempting to submit.  
If we're not permitted to do it, the\n-\t * subrequest should be marked failed.\n-\t */\n-\tvoid (*prepare_write_subreq)(struct netfs_io_subrequest *subreq);\n-\n-\t/* Prepare a write operation, working out what part of the write we can\n-\t * actually do.\n-\t */\n-\tint (*prepare_write)(struct netfs_cache_resources *cres,\n-\t\t\t     loff_t *_start, size_t *_len, size_t upper_len,\n-\t\t\t     loff_t i_size, bool no_space_allocated_yet);\n-\n \t/* Prepare an on-demand read operation, shortening it to a cached/uncached\n \t * boundary as appropriate.\n \t */\n@@ -418,10 +425,9 @@ void netfs_single_mark_inode_dirty(struct inode *inode);\n ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter);\n int netfs_writeback_single(struct address_space *mapping,\n \t\t\t   struct writeback_control *wbc,\n-\t\t\t   struct iov_iter *iter);\n+\t\t\t   struct iov_iter *iter, size_t len);\n \n /* Address operations API */\n-struct readahead_control;\n void netfs_readahead(struct readahead_control *);\n int netfs_read_folio(struct file *, struct folio *);\n int netfs_write_begin(struct netfs_inode *, struct file *,\n@@ -439,6 +445,7 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp);\n vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);\n \n /* (Sub)request management API. 
*/\n+void netfs_mark_read_submission(struct netfs_io_subrequest *subreq);\n void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq);\n void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq);\n void netfs_get_subrequest(struct netfs_io_subrequest *subreq,\n@@ -448,9 +455,8 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq,\n ssize_t netfs_extract_iter(struct iov_iter *orig, size_t orig_len, size_t max_segs,\n \t\t\t   unsigned long long fpos, struct bvecq **_bvecq_head,\n \t\t\t   iov_iter_extraction_t extraction_flags);\n-size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,\n-\t\t\tsize_t max_size, size_t max_segs);\n-void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq);\n+int netfs_prepare_read_buffer(struct netfs_io_subrequest *subreq, unsigned int max_segs);\n+int netfs_prepare_write_buffer(struct netfs_io_subrequest *subreq, unsigned int max_segs);\n void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error);\n \n int netfs_start_io_read(struct inode *inode);\ndiff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h\nindex 4bba6fda1f8b..c080167451ab 100644\n--- a/include/trace/events/cachefiles.h\n+++ b/include/trace/events/cachefiles.h\n@@ -70,6 +70,7 @@ enum cachefiles_coherency_trace {\n enum cachefiles_trunc_trace {\n \tcachefiles_trunc_clear_padding,\n \tcachefiles_trunc_dio_adjust,\n+\tcachefiles_trunc_discard_tail,\n \tcachefiles_trunc_expand_tmpfile,\n \tcachefiles_trunc_shrink,\n };\n@@ -160,6 +161,7 @@ enum cachefiles_error_trace {\n #define cachefiles_trunc_traces\t\t\t\t\t\t\\\n \tEM(cachefiles_trunc_clear_padding,\t\"CLRPAD\")\t\t\\\n \tEM(cachefiles_trunc_dio_adjust,\t\t\"DIOADJ\")\t\t\\\n+\tEM(cachefiles_trunc_discard_tail,\t\"DSCDTL\")\t\t\\\n \tEM(cachefiles_trunc_expand_tmpfile,\t\"EXPTMP\")\t\t\\\n \tE_(cachefiles_trunc_shrink,\t\t\"SHRINK\")\n \ndiff --git a/include/trace/events/netfs.h 
b/include/trace/events/netfs.h\nindex eeb8386e0709..ba38cc102bd7 100644\n--- a/include/trace/events/netfs.h\n+++ b/include/trace/events/netfs.h\n@@ -49,6 +49,7 @@\n \tE_(NETFS_PGPRIV2_COPY_TO_CACHE,\t\t\"2C\")\n \n #define netfs_rreq_traces\t\t\t\t\t\\\n+\tEM(netfs_rreq_trace_all_queued,\t\t\"ALL-Q  \")\t\\\n \tEM(netfs_rreq_trace_assess,\t\t\"ASSESS \")\t\\\n \tEM(netfs_rreq_trace_collect,\t\t\"COLLECT\")\t\\\n \tEM(netfs_rreq_trace_complete,\t\t\"COMPLET\")\t\\\n@@ -77,7 +78,8 @@\n \tEM(netfs_rreq_trace_waited_quiesce,\t\"DONE-QUIESCE\")\t\\\n \tEM(netfs_rreq_trace_wake_ip,\t\t\"WAKE-IP\")\t\\\n \tEM(netfs_rreq_trace_wake_queue,\t\t\"WAKE-Q \")\t\\\n-\tE_(netfs_rreq_trace_write_done,\t\t\"WR-DONE\")\n+\tEM(netfs_rreq_trace_write_done,\t\t\"WR-DONE\")\t\\\n+\tE_(netfs_rreq_trace_zero_unread,\t\"ZERO-UR\")\n \n #define netfs_sreq_sources\t\t\t\t\t\\\n \tEM(NETFS_SOURCE_UNKNOWN,\t\t\"----\")\t\t\\\n@@ -126,6 +128,7 @@\n \tEM(netfs_sreq_trace_superfluous,\t\"SPRFL\")\t\\\n \tEM(netfs_sreq_trace_terminated,\t\t\"TERM \")\t\\\n \tEM(netfs_sreq_trace_too_much,\t\t\"!TOOM\")\t\\\n+\tEM(netfs_sreq_trace_too_many_retries,\t\"!RETR\")\t\\\n \tEM(netfs_sreq_trace_wait_for,\t\t\"_WAIT\")\t\\\n \tEM(netfs_sreq_trace_write,\t\t\"WRITE\")\t\\\n \tEM(netfs_sreq_trace_write_skip,\t\t\"SKIP \")\t\\\n@@ -189,12 +192,12 @@\n \tEM(netfs_folio_trace_alloc_buffer,\t\"alloc-buf\")\t\\\n \tEM(netfs_folio_trace_cancel_copy,\t\"cancel-copy\")\t\\\n \tEM(netfs_folio_trace_cancel_store,\t\"cancel-store\")\t\\\n-\tEM(netfs_folio_trace_clear,\t\t\"clear\")\t\\\n-\tEM(netfs_folio_trace_clear_cc,\t\t\"clear-cc\")\t\\\n-\tEM(netfs_folio_trace_clear_g,\t\t\"clear-g\")\t\\\n-\tEM(netfs_folio_trace_clear_s,\t\t\"clear-s\")\t\\\n \tEM(netfs_folio_trace_copy_to_cache,\t\"mark-copy\")\t\\\n 
\tEM(netfs_folio_trace_end_copy,\t\t\"end-copy\")\t\\\n+\tEM(netfs_folio_trace_endwb,\t\t\"endwb\")\t\\\n+\tEM(netfs_folio_trace_endwb_cc,\t\t\"endwb-cc\")\t\\\n+\tEM(netfs_folio_trace_endwb_g,\t\t\"endwb-g\")\t\\\n+\tEM(netfs_folio_trace_endwb_s,\t\t\"endwb-s\")\t\\\n \tEM(netfs_folio_trace_filled_gaps,\t\"filled-gaps\")\t\\\n \tEM(netfs_folio_trace_kill,\t\t\"kill\")\t\t\\\n \tEM(netfs_folio_trace_kill_cc,\t\t\"kill-cc\")\t\\\n@@ -381,10 +384,10 @@ TRACE_EVENT(netfs_sreq,\n \t\t    __entry->len\t= sreq->len;\n \t\t    __entry->transferred = sreq->transferred;\n \t\t    __entry->start\t= sreq->start;\n-\t\t    __entry->slot\t= sreq->dispatch_pos.slot;\n+\t\t    __entry->slot\t= sreq->content.slot;\n \t\t\t   ),\n \n-\t    TP_printk(\"R=%08x[%x] %s %s f=%03x s=%llx %zx/%zx qs=%u e=%d\",\n+\t    TP_printk(\"R=%08x[%x] %s %s f=%03x s=%llx %zx/%zx bv=%u e=%d\",\n \t\t      __entry->rreq, __entry->index,\n \t\t      __print_symbolic(__entry->source, netfs_sreq_sources),\n \t\t      __print_symbolic(__entry->what, netfs_sreq_traces),\n@@ -492,6 +495,7 @@ TRACE_EVENT(netfs_folio,\n \t    TP_STRUCT__entry(\n \t\t    __field(ino_t,\t\t\tino)\n \t\t    __field(pgoff_t,\t\t\tindex)\n+\t\t    __field(unsigned long,\t\tpfn)\n \t\t    __field(unsigned int,\t\tnr)\n \t\t    __field(enum netfs_folio_trace,\twhy)\n \t\t\t     ),\n@@ -502,13 +506,40 @@ TRACE_EVENT(netfs_folio,\n \t\t    __entry->why = why;\n \t\t    __entry->index = folio->index;\n \t\t    __entry->nr = folio_nr_pages(folio);\n+\t\t    __entry->pfn = folio_pfn(folio);\n \t\t\t   ),\n \n-\t    TP_printk(\"i=%05lx ix=%05lx-%05lx %s\",\n+\t    TP_printk(\"p=%lx i=%05lx ix=%05lx-%05lx %s\",\n+\t\t      __entry->pfn,\n \t\t      __entry->ino, __entry->index, __entry->index + __entry->nr - 1,\n \t\t      __print_symbolic(__entry->why, netfs_folio_traces))\n \t    );\n \n+TRACE_EVENT(netfs_wback,\n+\t    TP_PROTO(struct netfs_io_request *wreq, struct folio *folio, unsigned int notes),\n+\n+\t    TP_ARGS(wreq, folio, 
notes),\n+\n+\t    TP_STRUCT__entry(\n+\t\t    __field(pgoff_t,\t\t\tindex)\n+\t\t    __field(unsigned int,\t\twreq)\n+\t\t    __field(unsigned int,\t\tnr)\n+\t\t    __field(unsigned int,\t\tnotes)\n+\t\t\t     ),\n+\n+\t    TP_fast_assign(\n+\t\t    __entry->wreq = wreq->debug_id;\n+\t\t    __entry->notes = notes;\n+\t\t    __entry->index = folio->index;\n+\t\t    __entry->nr = folio_nr_pages(folio);\n+\t\t\t   ),\n+\n+\t    TP_printk(\"R=%08x ix=%05lx-%05lx n=%02x\",\n+\t\t      __entry->wreq,\n+\t\t      __entry->index, __entry->index + __entry->nr - 1,\n+\t\t      __entry->notes)\n+\t    );\n+\n TRACE_EVENT(netfs_write_iter,\n \t    TP_PROTO(const struct kiocb *iocb, const struct iov_iter *from),\n \n@@ -751,7 +782,7 @@ TRACE_EVENT(netfs_collect_stream,\n \t\t    __entry->wreq\t= wreq->debug_id;\n \t\t    __entry->stream\t= stream->stream_nr;\n \t\t    __entry->collected_to = stream->collected_to;\n-\t\t    __entry->issued_to\t= atomic64_read(&wreq->issued_to);\n+\t\t    __entry->issued_to\t= atomic64_read(&stream->issued_to);\n \t\t\t   ),\n \n \t    TP_printk(\"R=%08x[%x:] cto=%llx ito=%llx\",\n@@ -775,7 +806,7 @@ TRACE_EVENT(netfs_bvecq,\n \t\t    __entry->trace\t= trace;\n \t\t\t   ),\n \n-\t    TP_printk(\"fq=%x %s\",\n+\t    TP_printk(\"bq=%x %s\",\n \t\t      __entry->id,\n \t\t      __print_symbolic(__entry->trace, netfs_bvecq_traces))\n \t    );\ndiff --git a/net/9p/client.c b/net/9p/client.c\nindex f0dcf252af7e..8d365c000553 100644\n--- a/net/9p/client.c\n+++ b/net/9p/client.c\n@@ -1561,6 +1561,7 @@ void\n p9_client_write_subreq(struct netfs_io_subrequest *subreq)\n {\n \tstruct netfs_io_request *wreq = subreq->rreq;\n+\tstruct iov_iter iter;\n \tstruct p9_fid *fid = wreq->netfs_priv;\n \tstruct p9_client *clnt = fid->clnt;\n \tstruct p9_req_t *req;\n@@ -1571,14 +1572,17 @@ p9_client_write_subreq(struct netfs_io_subrequest *subreq)\n \tp9_debug(P9_DEBUG_9P, \">>> TWRITE fid %d offset %llu len %d\\n\",\n \t\t fid->fid, start, len);\n 
\n+\tiov_iter_bvec_queue(&iter, ITER_SOURCE, subreq->content.bvecq,\n+\t\t\t    subreq->content.slot, subreq->content.offset, subreq->len);\n+\n \t/* Don't bother zerocopy for small IO (< 1024) */\n \tif (clnt->trans_mod->zc_request && len > 1024) {\n-\t\treq = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, &subreq->io_iter,\n+\t\treq = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, &iter,\n \t\t\t\t       0, wreq->len, P9_ZC_HDR_SZ, \"dqd\",\n \t\t\t\t       fid->fid, start, len);\n \t} else {\n \t\treq = p9_client_rpc(clnt, P9_TWRITE, \"dqV\", fid->fid,\n-\t\t\t\t    start, len, &subreq->io_iter);\n+\t\t\t\t    start, len, &iter);\n \t}\n \tif (IS_ERR(req)) {\n \t\tnetfs_write_subrequest_terminated(subreq, PTR_ERR(req));\n",
    "prefixes": [
        "26/26"
    ]
}