Patch Detail
get:
Show a patch.
patch:
Update a patch.
put:
Update a patch.
GET /api/patches/2197758/?format=api
{ "id": 2197758, "url": "http://patchwork.ozlabs.org/api/patches/2197758/?format=api", "web_url": "http://patchwork.ozlabs.org/project/qemu-devel/patch/20260218132633.29748-17-hreitz@redhat.com/", "project": { "id": 14, "url": "http://patchwork.ozlabs.org/api/projects/14/?format=api", "name": "QEMU Development", "link_name": "qemu-devel", "list_id": "qemu-devel.nongnu.org", "list_email": "qemu-devel@nongnu.org", "web_url": "", "scm_url": "", "webscm_url": "", "list_archive_url": "", "list_archive_url_format": "", "commit_url_format": "" }, "msgid": "<20260218132633.29748-17-hreitz@redhat.com>", "list_archive_url": null, "date": "2026-02-18T13:26:25", "name": "[v4,16/24] fuse: Manually process requests (without libfuse)", "commit_ref": null, "pull_url": null, "state": "new", "archived": false, "hash": "5a0f237297e0bd46c9e6823044361c80b9e036df", "submitter": { "id": 82279, "url": "http://patchwork.ozlabs.org/api/people/82279/?format=api", "name": "Hanna Czenczek", "email": "hreitz@redhat.com" }, "delegate": null, "mbox": "http://patchwork.ozlabs.org/project/qemu-devel/patch/20260218132633.29748-17-hreitz@redhat.com/mbox/", "series": [ { "id": 492547, "url": "http://patchwork.ozlabs.org/api/series/492547/?format=api", "web_url": "http://patchwork.ozlabs.org/project/qemu-devel/list/?series=492547", "date": "2026-02-18T13:26:09", "name": "export/fuse: Use coroutines and multi-threading", "version": 4, "mbox": "http://patchwork.ozlabs.org/series/492547/mbox/" } ], "comments": "http://patchwork.ozlabs.org/api/patches/2197758/comments/", "check": "pending", "checks": "http://patchwork.ozlabs.org/api/patches/2197758/checks/", "tags": {}, "related": [], "headers": { "Return-Path": "<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>", "X-Original-To": "incoming@patchwork.ozlabs.org", "Delivered-To": "patchwork-incoming@legolas.ozlabs.org", "Authentication-Results": [ "legolas.ozlabs.org;\n\tdkim=pass (1024-bit key;\n unprotected) header.d=redhat.com 
header.i=@redhat.com header.a=rsa-sha256\n header.s=mimecast20190719 header.b=ZKr8KnFj;\n\tdkim=pass (2048-bit key;\n unprotected) header.d=redhat.com header.i=@redhat.com header.a=rsa-sha256\n header.s=google header.b=Xar+vHFL;\n\tdkim-atps=neutral", "legolas.ozlabs.org;\n spf=pass (sender SPF authorized) smtp.mailfrom=nongnu.org\n (client-ip=209.51.188.17; helo=lists.gnu.org;\n envelope-from=qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org;\n receiver=patchwork.ozlabs.org)" ], "Received": [ "from lists.gnu.org (lists.gnu.org [209.51.188.17])\n\t(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))\n\t(No client certificate requested)\n\tby legolas.ozlabs.org (Postfix) with ESMTPS id 4fGHb55Btnz1xpY\n\tfor <incoming@patchwork.ozlabs.org>; Thu, 19 Feb 2026 00:36:01 +1100 (AEDT)", "from localhost ([::1] helo=lists1p.gnu.org)\n\tby lists.gnu.org with esmtp (Exim 4.90_1)\n\t(envelope-from <qemu-devel-bounces@nongnu.org>)\n\tid 1vshfj-0004JM-9T; Wed, 18 Feb 2026 08:32:55 -0500", "from eggs.gnu.org ([2001:470:142:3::10])\n by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256)\n (Exim 4.90_1) (envelope-from <hreitz@redhat.com>) id 1vshfg-0004Go-9n\n for qemu-devel@nongnu.org; Wed, 18 Feb 2026 08:32:52 -0500", "from us-smtp-delivery-124.mimecast.com ([170.10.133.124])\n by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256)\n (Exim 4.90_1) (envelope-from <hreitz@redhat.com>) id 1vshfT-0005hQ-81\n for qemu-devel@nongnu.org; Wed, 18 Feb 2026 08:32:51 -0500", "from mail-wm1-f70.google.com (mail-wm1-f70.google.com\n [209.85.128.70]) by relay.mimecast.com with ESMTP with STARTTLS\n (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id\n us-mta-617-In_BYx9ZMlO7iX1shLFmQA-1; Wed, 18 Feb 2026 08:27:26 -0500", "by mail-wm1-f70.google.com with SMTP id\n 5b1f17b1804b1-4803e8b6007so45186055e9.0\n for <qemu-devel@nongnu.org>; Wed, 18 Feb 2026 05:27:26 -0800 (PST)", "from localhost\n 
(p200300cfd737d029edef7b8da7441ac2.dip0.t-ipconnect.de.\n [2003:cf:d737:d029:edef:7b8d:a744:1ac2])\n by smtp.gmail.com with ESMTPSA id\n 5b1f17b1804b1-4839840f0a1sm17986285e9.32.2026.02.18.05.27.22\n (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);\n Wed, 18 Feb 2026 05:27:22 -0800 (PST)" ], "DKIM-Signature": [ "v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com;\n s=mimecast20190719; t=1771421553;\n h=from:from:reply-to:subject:subject:date:date:message-id:message-id:\n to:to:cc:cc:mime-version:mime-version:\n content-type:content-type:content-type:\n content-transfer-encoding:content-transfer-encoding:\n in-reply-to:in-reply-to:references:references;\n bh=UQRl9meOJuWdYVedpi7sUqHpEjV3sJFFswUPTEReFew=;\n b=ZKr8KnFjeE/ugpLzd3uTBToKUGKja3ovLKlMZVpt+OiZNdVfbRezbyk6rYYXgiIN+P8M/V\n Zyj7B8bK+klnoSU8vHSNzj1IP4Ng57WqC5hKMe7MWfQkB3wLTh3WhYr1oxA9sdlnSP+Rvt\n ecT/OA67c4Oza6WOAb32ySyrGe9PjF8=", "v=1; a=rsa-sha256; c=relaxed/relaxed;\n d=redhat.com; s=google; t=1771421245; x=1772026045; darn=nongnu.org;\n h=content-transfer-encoding:mime-version:references:in-reply-to\n :message-id:date:subject:cc:to:from:from:to:cc:subject:date\n :message-id:reply-to;\n bh=UQRl9meOJuWdYVedpi7sUqHpEjV3sJFFswUPTEReFew=;\n b=Xar+vHFLUeHZu95nkhdCE/N70TptnrhVRNJ0fOAejPGJ4tp7FJ+jIBBExdqGjEabOl\n eSGDhnyQtq52gkAXAnvM1EXRj1XuiBPAFvKBuyzfYjlPvW4tboS0eQbZOg2aevlktafC\n TtKPP9qPtjgL9uaqYDcmTtAE7iP1SsEu3DfWdZ3ExhycBM4q1+1WrgbS9RBdTnKI+EwK\n WjhsTJcPGOexQ80YrU5NY9QPqGefIyfUPeqO+05217ugUhbfyaCuBgzNF42JdvyDRmZp\n yIOU2DhpD/FN0mlrdtYuwoMuCX8yfe0lEFGa+LrxL7a6Y2tVHmlxXGL2CNLDjm2zcqQE\n 6b9w==" ], "X-MC-Unique": "In_BYx9ZMlO7iX1shLFmQA-1", "X-Mimecast-MFC-AGG-ID": "In_BYx9ZMlO7iX1shLFmQA_1771421245", "X-Google-DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed;\n d=1e100.net; s=20230601; t=1771421245; x=1772026045;\n h=content-transfer-encoding:mime-version:references:in-reply-to\n :message-id:date:subject:cc:to:from:x-gm-gg:x-gm-message-state:from\n :to:cc:subject:date:message-id:reply-to;\n 
bh=UQRl9meOJuWdYVedpi7sUqHpEjV3sJFFswUPTEReFew=;\n b=JETF7HCFCGVD2YjJeNsuKzwvOvK8pNmFXFT9+loau+7HobjiDe68tRIX4G2OAP+OcU\n Fh0HBQYRPNOwbF6HgJJtJbHgyGu5NjZ+PPQqcyhKIYB+r6QgXy4159hi1dkfMYpmMjCn\n BFQ3WWnx7xvP1MAONjOW3uMgkhVOxRH0lY62EvQwqu2/IcpnpMXODezCVHT0zZlBqQ5j\n lzQmasZ3UXGanFJ94KTeZ9i8VGUXuCvhCPvp3kp/j+FM7ayX9HIZ4GKma493Dc4x28RM\n RqL/JKg9rts24EBPYo1Gvof2HQYLkt5pKr9/Jfd2y5soMykvitZHRIR5EjY9pozI9/Ti\n hdNA==", "X-Gm-Message-State": "AOJu0YzoFzZC/vnAmziYyF5N8tWW4RcGpi/XaQixI9v2G+m8IxRiwia2\n JVtdv2sBeeTQ6HEahkz57V3d9QWnHSxV9d5GyUBYlxKXf3ZiVyXRvE9TQBSWMzwNfI2s7nxBpzG\n DbnddAn2sTT5RScpDLXOazdWGWuXPRlxt2xDJ3ixUZg2KR5HJA2Pb6zEv", "X-Gm-Gg": "AZuq6aLgfDOJPOwyUPQq4IuiYlWC4eqSp5nhmeo/KsJaja0N0HZPW0SW2IvXE82fUS9\n /p4lEkK1diW3uf2PilZCC8+eXEA229LqltMEt1NuUQfr3DocRviCw5SUoUUXWgKRnv22R9th7BB\n tSyDbj/YcEnlrBKaTnPOGfYRcOXrsR8dxE3eqh5cIci7zlLGJ75GJgdENsc0vnIBSXFFRgZK1gv\n RtpxkQZcOFfiv/MHGKS9uBBTLO8+vyh3gSXMdB+nG1nWUhzaVa4usoKZiiKet93Wln1aU+yOUDg\n zcJt/R0D0SDembAYK5PXPhOmQ0LsbL9+eixdDYHeLsCmN06PtKS8/vVdBh+orbUSj73K4wVOa5t\n j09GF86kT0V3QdMrpGIQAaTbcK5PyOw9WRlHogang2GN1mJgX0Z7J0IZuCvQRgSS8nEv3Y5FffC\n 6Z1+Eb", "X-Received": [ "by 2002:a05:600c:3f14:b0:483:78e1:784 with SMTP id\n 5b1f17b1804b1-48398a433abmr33612855e9.4.1771421244565;\n Wed, 18 Feb 2026 05:27:24 -0800 (PST)", "by 2002:a05:600c:3f14:b0:483:78e1:784 with SMTP id\n 5b1f17b1804b1-48398a433abmr33612205e9.4.1771421243781;\n Wed, 18 Feb 2026 05:27:23 -0800 (PST)" ], "From": "Hanna Czenczek <hreitz@redhat.com>", "To": "qemu-block@nongnu.org", "Cc": "qemu-devel@nongnu.org, Hanna Czenczek <hreitz@redhat.com>,\n Kevin Wolf <kwolf@redhat.com>, Brian Song <hibriansong@gmail.com>", "Subject": "[PATCH v4 16/24] fuse: Manually process requests (without libfuse)", "Date": "Wed, 18 Feb 2026 14:26:25 +0100", "Message-ID": "<20260218132633.29748-17-hreitz@redhat.com>", "X-Mailer": "git-send-email 2.53.0", "In-Reply-To": "<20260218132633.29748-1-hreitz@redhat.com>", "References": "<20260218132633.29748-1-hreitz@redhat.com>", 
"MIME-Version": "1.0", "Content-Type": [ "text/plain; charset=UTF-8", "text/plain; charset=UTF-8" ], "Content-Transfer-Encoding": "8bit", "Received-SPF": "pass client-ip=170.10.133.124; envelope-from=hreitz@redhat.com;\n helo=us-smtp-delivery-124.mimecast.com", "X-Spam_score_int": "-20", "X-Spam_score": "-2.1", "X-Spam_bar": "--", "X-Spam_report": "(-2.1 / 5.0 requ) BAYES_00=-1.9, DKIMWL_WL_HIGH=-0.043,\n DKIM_SIGNED=0.1, DKIM_VALID=-0.1, DKIM_VALID_AU=-0.1, DKIM_VALID_EF=-0.1,\n RCVD_IN_DNSWL_NONE=-0.0001, RCVD_IN_MSPIKE_H5=0.001, RCVD_IN_MSPIKE_WL=0.001,\n RCVD_IN_VALIDITY_RPBL_BLOCKED=0.001, RCVD_IN_VALIDITY_SAFE_BLOCKED=0.001,\n SPF_HELO_PASS=-0.001,\n SPF_PASS=-0.001 autolearn=unavailable autolearn_force=no", "X-Spam_action": "no action", "X-BeenThere": "qemu-devel@nongnu.org", "X-Mailman-Version": "2.1.29", "Precedence": "list", "List-Id": "qemu development <qemu-devel.nongnu.org>", "List-Unsubscribe": "<https://lists.nongnu.org/mailman/options/qemu-devel>,\n <mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>", "List-Archive": "<https://lists.nongnu.org/archive/html/qemu-devel>", "List-Post": "<mailto:qemu-devel@nongnu.org>", "List-Help": "<mailto:qemu-devel-request@nongnu.org?subject=help>", "List-Subscribe": "<https://lists.nongnu.org/mailman/listinfo/qemu-devel>,\n <mailto:qemu-devel-request@nongnu.org?subject=subscribe>", "Errors-To": "qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org", "Sender": "qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org" }, "content": "Manually read requests from the /dev/fuse FD and process them, without\nusing libfuse. 
This allows us to safely add parallel request processing\nin coroutines later, without having to worry about libfuse internals.\n(Technically, we already have exactly that problem with\nread_from_fuse_export()/read_from_fuse_fd() nesting.)\n\nWe will continue to use libfuse for mounting the filesystem; fusermount3\nis effectively a helper program of libfuse, so it should know best how\nto interact with it. (Doing it manually without libfuse, while doable,\nis a bit of a pain, and it is not clear to me how stable the \"protocol\"\nactually is.)\n\nTake this opportunity of quite a major rewrite to update the Copyright\nline with corrected information that has surfaced in the meantime.\n\nHere are some benchmarks from before this patch (4k, iodepth=16, libaio;\nexcept 'sync', which are iodepth=1 and pvsync2):\n\nfile:\n read:\n seq aio: 99.8k ±1.5k IOPS\n rand aio: 50.5k ±1.0k\n seq sync: 36.1k ±1.1k\n rand sync: 10.0k ±0.1k\n write:\n seq aio: 72.0k ±9.3k\n rand aio: 70.6k ±2.5k\n seq sync: 30.6k ±0.8k\n rand sync: 30.1k ±1.0k\nnull:\n read:\n seq aio: 157.9k ±4.7k\n rand aio: 158.7k ±4.8k\n seq sync: 80.2k ±2.8k\n rand sync: 77.5k ±3.8k\n write:\n seq aio: 154.3k ±3.6k\n rand aio: 154.3k ±4.2k\n seq sync: 76.1k ±5.2k\n rand sync: 72.9k ±4.0k\n\nAnd with this patch applied:\n\nfile:\n read:\n seq aio: 106.8k ±1.9k (+7%)\n rand aio: 48.3k ±8.8k (-4%)\n seq sync: 35.5k ±1.4k (-2%)\n rand sync: 10.0k ±0.2k (±0%)\n write:\n seq aio: 76.3k ±6.6k (+6%)\n rand aio: 76.4k ±1.5k (+8%)\n seq sync: 31.6k ±0.6k (+3%)\n rand sync: 30.9k ±0.8k (+3%)\nnull:\n read:\n seq aio: 161.7k ±6.0k (+2%)\n rand aio: 165.6k ±7.1k (+4%)\n seq sync: 80.5k ±3.0k (±0%)\n rand sync: 78.5k ±3.1k (+1%)\n write:\n seq aio: 185.1k ±3.3k (+20%)\n rand aio: 186.7k ±4.8k (+21%)\n seq sync: 82.5k ±4.2k (+8%)\n rand sync: 78.7k ±3.2k (+8%)\n\nSo not much difference, aside from write AIO to a null-co export getting\na bit better.\n\nSigned-off-by: Hanna Czenczek <hreitz@redhat.com>\n---\n 
block/export/fuse.c | 944 +++++++++++++++++++++++++++++++++-----------\n 1 file changed, 720 insertions(+), 224 deletions(-)", "diff": "diff --git a/block/export/fuse.c b/block/export/fuse.c\nindex af0a8de17b..c481fb72a2 100644\n--- a/block/export/fuse.c\n+++ b/block/export/fuse.c\n@@ -1,7 +1,7 @@\n /*\n * Present a block device as a raw image through FUSE\n *\n- * Copyright (c) 2020 Max Reitz <mreitz@redhat.com>\n+ * Copyright (c) 2020, 2025 Hanna Czenczek <hreitz@redhat.com>\n *\n * This program is free software; you can redistribute it and/or modify\n * it under the terms of the GNU General Public License as published by\n@@ -27,12 +27,15 @@\n #include \"block/qapi.h\"\n #include \"qapi/error.h\"\n #include \"qapi/qapi-commands-block.h\"\n+#include \"qemu/error-report.h\"\n #include \"qemu/main-loop.h\"\n #include \"system/block-backend.h\"\n \n #include <fuse.h>\n #include <fuse_lowlevel.h>\n \n+#include \"standard-headers/linux/fuse.h\"\n+\n #if defined(CONFIG_FALLOCATE_ZERO_RANGE)\n #include <linux/falloc.h>\n #endif\n@@ -42,17 +45,102 @@\n #endif\n \n /* Prevent overly long bounce buffer allocations */\n-#define FUSE_MAX_BOUNCE_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 64 * 1024 * 1024))\n+#define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 64 * 1024 * 1024))\n+/* Small enough to fit in the request buffer */\n+#define FUSE_MAX_WRITE_BYTES (64 * 1024)\n \n+/*\n+ * fuse_init_in structure before 7.36. 
We don't need the flags2 field added\n+ * there, so we can work with the smaller older structure to stay compatible\n+ * with older kernels.\n+ */\n+struct fuse_init_in_compat {\n+ uint32_t major;\n+ uint32_t minor;\n+ uint32_t max_readahead;\n+ uint32_t flags;\n+};\n+\n+typedef struct FuseRequestInHeader {\n+ struct fuse_in_header common;\n+ /* All supported requests */\n+ union {\n+ struct fuse_init_in_compat init;\n+ struct fuse_open_in open;\n+ struct fuse_setattr_in setattr;\n+ struct fuse_read_in read;\n+ struct fuse_write_in write;\n+ struct fuse_fallocate_in fallocate;\n+#ifdef CONFIG_FUSE_LSEEK\n+ struct fuse_lseek_in lseek;\n+#endif\n+ };\n+} FuseRequestInHeader;\n+\n+typedef struct FuseRequestOutHeader {\n+ struct fuse_out_header common;\n+ /* All supported requests */\n+ union {\n+ struct fuse_init_out init;\n+ struct fuse_statfs_out statfs;\n+ struct fuse_open_out open;\n+ struct fuse_attr_out attr;\n+ struct fuse_write_out write;\n+#ifdef CONFIG_FUSE_LSEEK\n+ struct fuse_lseek_out lseek;\n+#endif\n+ };\n+} FuseRequestOutHeader;\n+\n+typedef union FuseRequestInHeaderBuf {\n+ struct FuseRequestInHeader structured;\n+ struct {\n+ /*\n+ * Part of the request header that is filled for write requests\n+ * (Needed because we want the data to go into a different buffer, to\n+ * avoid having to use a bounce buffer)\n+ */\n+ char head[sizeof(struct fuse_in_header) +\n+ sizeof(struct fuse_write_in)];\n+ /*\n+ * Rest of the request header for requests that have a longer header\n+ * than write requests\n+ */\n+ char tail[sizeof(FuseRequestInHeader) -\n+ (sizeof(struct fuse_in_header) +\n+ sizeof(struct fuse_write_in))];\n+ };\n+} FuseRequestInHeaderBuf;\n+\n+QEMU_BUILD_BUG_ON(sizeof(FuseRequestInHeaderBuf) !=\n+ sizeof(FuseRequestInHeader));\n+QEMU_BUILD_BUG_ON(sizeof(((FuseRequestInHeaderBuf *)0)->head) +\n+ sizeof(((FuseRequestInHeaderBuf *)0)->tail) !=\n+ sizeof(FuseRequestInHeader));\n \n typedef struct FuseExport {\n BlockExport common;\n \n struct 
fuse_session *fuse_session;\n- struct fuse_buf fuse_buf;\n unsigned int in_flight; /* atomic */\n bool mounted, fd_handler_set_up;\n \n+ /*\n+ * Cached buffer to receive the data of WRITE requests. Cached because:\n+ * To read requests, we put a FuseRequestInHeaderBuf (FRIHB) object on the\n+ * stack, and a (WRITE data) buffer on the heap. We pass FRIHB.head and the\n+ * data buffer to readv(). This way, for WRITE requests, we get exactly\n+ * their data in the data buffer and can avoid bounce buffering.\n+ * However, for non-WRITE requests, some of the header may end up in the\n+ * data buffer, so we will need to copy that back into the FRIHB object, and\n+ * then we don't need the heap buffer anymore. That is why we cache it, so\n+ * we can trivially reuse it between non-WRITE requests.\n+ *\n+ * Note that these data buffers and thus req_write_data_cached are allocated\n+ * via blk_blockalign() and thus need to be freed via qemu_vfree().\n+ */\n+ void *req_write_data_cached;\n+\n /*\n * Set when there was an unrecoverable error and no requests should be read\n * from the device anymore (basically only in case of something we would\n@@ -60,6 +148,8 @@ typedef struct FuseExport {\n */\n bool halted;\n \n+ int fuse_fd;\n+\n char *mountpoint;\n bool writable;\n bool growable;\n@@ -71,20 +161,31 @@ typedef struct FuseExport {\n gid_t st_gid;\n } FuseExport;\n \n+/*\n+ * Verify that the size of FuseRequestInHeaderBuf.head plus the data\n+ * buffer are big enough to be accepted by the FUSE kernel driver.\n+ */\n+QEMU_BUILD_BUG_ON(sizeof(((FuseRequestInHeaderBuf *)0)->head) +\n+ FUSE_MAX_WRITE_BYTES <\n+ FUSE_MIN_READ_BUFFER);\n+\n static GHashTable *exports;\n-static const struct fuse_lowlevel_ops fuse_ops;\n \n static void fuse_export_shutdown(BlockExport *exp);\n static void fuse_export_delete(BlockExport *exp);\n-static void fuse_export_halt(FuseExport *exp) G_GNUC_UNUSED;\n+static void fuse_export_halt(FuseExport *exp);\n \n static void init_exports_table(void);\n 
\n static int mount_fuse_export(FuseExport *exp, Error **errp);\n-static void read_from_fuse_export(void *opaque);\n \n static bool is_regular_file(const char *path, Error **errp);\n \n+static void read_from_fuse_fd(void *opaque);\n+static void fuse_process_request(FuseExport *exp,\n+ const FuseRequestInHeader *in_hdr,\n+ const void *data_buffer);\n+static int fuse_write_err(int fd, const struct fuse_in_header *in_hdr, int err);\n \n static void fuse_inc_in_flight(FuseExport *exp)\n {\n@@ -105,22 +206,26 @@ static void fuse_dec_in_flight(FuseExport *exp)\n }\n }\n \n+/**\n+ * Attach FUSE FD read handler.\n+ */\n static void fuse_attach_handlers(FuseExport *exp)\n {\n if (qatomic_read(&exp->halted)) {\n return;\n }\n \n- aio_set_fd_handler(exp->common.ctx,\n- fuse_session_fd(exp->fuse_session),\n- read_from_fuse_export, NULL, NULL, NULL, exp);\n+ aio_set_fd_handler(exp->common.ctx, exp->fuse_fd,\n+ read_from_fuse_fd, NULL, NULL, NULL, exp);\n exp->fd_handler_set_up = true;\n }\n \n+/**\n+ * Detach FUSE FD read handler.\n+ */\n static void fuse_detach_handlers(FuseExport *exp)\n {\n- aio_set_fd_handler(exp->common.ctx,\n- fuse_session_fd(exp->fuse_session),\n+ aio_set_fd_handler(exp->common.ctx, exp->fuse_fd,\n NULL, NULL, NULL, NULL, NULL);\n exp->fd_handler_set_up = false;\n }\n@@ -247,6 +352,13 @@ static int fuse_export_create(BlockExport *blk_exp,\n \n g_hash_table_insert(exports, g_strdup(exp->mountpoint), NULL);\n \n+ exp->fuse_fd = fuse_session_fd(exp->fuse_session);\n+ ret = qemu_fcntl_addfl(exp->fuse_fd, O_NONBLOCK);\n+ if (ret < 0) {\n+ error_setg_errno(errp, -ret, \"Failed to make FUSE FD non-blocking\");\n+ goto fail;\n+ }\n+\n fuse_attach_handlers(exp);\n return 0;\n \n@@ -278,6 +390,17 @@ static int mount_fuse_export(FuseExport *exp, Error **errp)\n char *mount_opts;\n struct fuse_args fuse_args;\n int ret;\n+ /*\n+ * We just create the session for mounting/unmounting, no need to provide\n+ * any operations. 
However, since libfuse commit 52a633a5d, we have to\n+ * provide some op struct and cannot just pass NULL (even though the commit\n+ * message (\"allow passing ops as NULL\") seems to imply the exact opposite,\n+ * as does the comment added to fuse_session_new_fn() (\"To create a no-op\n+ * session just for mounting pass op as NULL.\").\n+ * This is how said libfuse commit implements a no-op session internally, so\n+ * do it the same way.\n+ */\n+ static const struct fuse_lowlevel_ops null_ops = { 0 };\n \n /*\n * Note that these mount options differ from what we would pass to a direct\n@@ -292,7 +415,7 @@ static int mount_fuse_export(FuseExport *exp, Error **errp)\n mount_opts = g_strdup_printf(\"%s,nosuid,nodev,noatime,max_read=%zu,\"\n \"default_permissions%s\",\n exp->writable ? \"rw\" : \"ro\",\n- FUSE_MAX_BOUNCE_BYTES,\n+ FUSE_MAX_READ_BYTES,\n exp->allow_other ? \",allow_other\" : \"\");\n \n fuse_argv[0] = \"\"; /* Dummy program name */\n@@ -301,8 +424,8 @@ static int mount_fuse_export(FuseExport *exp, Error **errp)\n fuse_argv[3] = NULL;\n fuse_args = (struct fuse_args)FUSE_ARGS_INIT(3, (char **)fuse_argv);\n \n- exp->fuse_session = fuse_session_new(&fuse_args, &fuse_ops,\n- sizeof(fuse_ops), exp);\n+ exp->fuse_session = fuse_session_new(&fuse_args, &null_ops,\n+ sizeof(null_ops), NULL);\n g_free(mount_opts);\n if (!exp->fuse_session) {\n error_setg(errp, \"Failed to set up FUSE session\");\n@@ -326,36 +449,163 @@ fail:\n }\n \n /**\n- * Callback to be invoked when the FUSE session FD can be read from.\n- * (This is basically the FUSE event loop.)\n+ * Allocate a buffer to receive WRITE data, or take the cached one.\n */\n-static void read_from_fuse_export(void *opaque)\n+static void *get_write_data_buffer(FuseExport *exp)\n {\n- FuseExport *exp = opaque;\n- int ret;\n+ if (exp->req_write_data_cached) {\n+ void *cached = exp->req_write_data_cached;\n+ exp->req_write_data_cached = NULL;\n+ return cached;\n+ } else {\n+ return blk_blockalign(exp->common.blk, 
FUSE_MAX_WRITE_BYTES);\n+ }\n+}\n \n- if (unlikely(qatomic_read(&exp->halted))) {\n+/**\n+ * Release a WRITE data buffer, possibly reusing it for a subsequent request.\n+ */\n+static void release_write_data_buffer(FuseExport *exp, void **buffer)\n+{\n+ if (!*buffer) {\n return;\n }\n \n+ if (!exp->req_write_data_cached) {\n+ exp->req_write_data_cached = *buffer;\n+ } else {\n+ qemu_vfree(*buffer);\n+ }\n+ *buffer = NULL;\n+}\n+\n+/**\n+ * Return the length of the specific operation's own in_header.\n+ * Return -ENOSYS if the operation is not supported.\n+ */\n+static ssize_t req_op_hdr_len(const FuseRequestInHeader *in_hdr)\n+{\n+ switch (in_hdr->common.opcode) {\n+ case FUSE_INIT:\n+ return sizeof(in_hdr->init);\n+ case FUSE_OPEN:\n+ return sizeof(in_hdr->open);\n+ case FUSE_SETATTR:\n+ return sizeof(in_hdr->setattr);\n+ case FUSE_READ:\n+ return sizeof(in_hdr->read);\n+ case FUSE_WRITE:\n+ return sizeof(in_hdr->write);\n+ case FUSE_FALLOCATE:\n+ return sizeof(in_hdr->fallocate);\n+#ifdef CONFIG_FUSE_LSEEK\n+ case FUSE_LSEEK:\n+ return sizeof(in_hdr->lseek);\n+#endif\n+ case FUSE_DESTROY:\n+ case FUSE_STATFS:\n+ case FUSE_RELEASE:\n+ case FUSE_LOOKUP:\n+ case FUSE_FORGET:\n+ case FUSE_BATCH_FORGET:\n+ case FUSE_GETATTR:\n+ case FUSE_FSYNC:\n+ case FUSE_FLUSH:\n+ /* These requests don't have their own header or we don't care */\n+ return 0;\n+ default:\n+ return -ENOSYS;\n+ }\n+}\n+\n+/**\n+ * Try to read and process a single request from the FUSE FD.\n+ */\n+static void read_from_fuse_fd(void *opaque)\n+{\n+ FuseExport *exp = opaque;\n+ int fuse_fd = exp->fuse_fd;\n+ ssize_t ret;\n+ FuseRequestInHeaderBuf in_hdr_buf;\n+ const FuseRequestInHeader *in_hdr;\n+ void *data_buffer = NULL;\n+ struct iovec iov[2];\n+ ssize_t op_hdr_len;\n+\n fuse_inc_in_flight(exp);\n \n- do {\n- ret = fuse_session_receive_buf(exp->fuse_session, &exp->fuse_buf);\n- } while (ret == -EINTR);\n- if (ret < 0) {\n- goto out;\n+ if (unlikely(qatomic_read(&exp->halted))) {\n+ goto no_request;\n+ 
}\n+\n+ data_buffer = get_write_data_buffer(exp);\n+\n+ /* Construct the I/O vector to hold the FUSE request */\n+ iov[0] = (struct iovec) { &in_hdr_buf.head, sizeof(in_hdr_buf.head) };\n+ iov[1] = (struct iovec) { data_buffer, FUSE_MAX_WRITE_BYTES };\n+ ret = RETRY_ON_EINTR(readv(fuse_fd, iov, ARRAY_SIZE(iov)));\n+ if (ret < 0 && errno == EAGAIN) {\n+ /* No request available */\n+ goto no_request;\n+ } else if (unlikely(ret < 0)) {\n+ error_report(\"Failed to read from FUSE device: %s\", strerror(errno));\n+ goto no_request;\n+ }\n+\n+ if (unlikely(ret < sizeof(in_hdr->common))) {\n+ error_report(\"Incomplete read from FUSE device, expected at least %zu \"\n+ \"bytes, read %zi bytes; cannot trust subsequent \"\n+ \"requests, halting the export\",\n+ sizeof(in_hdr->common), ret);\n+ fuse_export_halt(exp);\n+ goto no_request;\n+ }\n+ in_hdr = &in_hdr_buf.structured;\n+\n+ if (unlikely(ret != in_hdr->common.len)) {\n+ error_report(\"Number of bytes read from FUSE device does not match \"\n+ \"request size, expected %\" PRIu32 \" bytes, read %zi \"\n+ \"bytes; cannot trust subsequent requests, halting the \"\n+ \"export\",\n+ in_hdr->common.len, ret);\n+ fuse_export_halt(exp);\n+ goto no_request;\n+ }\n+\n+ op_hdr_len = req_op_hdr_len(in_hdr);\n+ if (op_hdr_len < 0) {\n+ fuse_write_err(fuse_fd, &in_hdr->common, op_hdr_len);\n+ goto no_request;\n+ }\n+\n+ if (unlikely(ret < sizeof(in_hdr->common) + op_hdr_len)) {\n+ error_report(\"FUSE request truncated, expected %zu bytes, read %zi \"\n+ \"bytes\",\n+ sizeof(in_hdr->common) + op_hdr_len, ret);\n+ fuse_write_err(fuse_fd, &in_hdr->common, -EINVAL);\n+ goto no_request;\n }\n \n /*\n- * Note that aio_poll() in any request-processing function can lead to a\n- * nested read_from_fuse_export() call, which will overwrite the contents of\n- * exp->fuse_buf. 
Anything that takes a buffer needs to take care that the\n- * content is copied before potentially polling via aio_poll().\n+ * Only WRITE uses the write data buffer, so for non-WRITE requests longer\n+ * than .head, we need to copy any data that spilled into data_buffer into\n+ * .tail. Then we can release the write data buffer.\n */\n- fuse_session_process_buf(exp->fuse_session, &exp->fuse_buf);\n+ if (in_hdr->common.opcode != FUSE_WRITE) {\n+ if (ret > sizeof(in_hdr_buf.head)) {\n+ size_t len;\n+ /* Limit size to prevent overflow */\n+ len = MIN(ret - sizeof(in_hdr_buf.head), sizeof(in_hdr_buf.tail));\n+ memcpy(in_hdr_buf.tail, data_buffer, len);\n+ }\n \n-out:\n+ release_write_data_buffer(exp, &data_buffer);\n+ }\n+\n+ fuse_process_request(exp, in_hdr, data_buffer);\n+\n+no_request:\n+ release_write_data_buffer(exp, &data_buffer);\n fuse_dec_in_flight(exp);\n }\n \n@@ -363,18 +613,14 @@ static void fuse_export_shutdown(BlockExport *blk_exp)\n {\n FuseExport *exp = container_of(blk_exp, FuseExport, common);\n \n- if (exp->fuse_session) {\n- fuse_session_exit(exp->fuse_session);\n-\n- if (exp->fd_handler_set_up) {\n- fuse_detach_handlers(exp);\n- }\n+ if (exp->fd_handler_set_up) {\n+ fuse_detach_handlers(exp);\n }\n \n if (exp->mountpoint) {\n /*\n- * Safe to drop now, because we will not handle any requests\n- * for this export anymore anyway.\n+ * Safe to drop now, because we will not handle any requests for this\n+ * export anymore anyway (at least not from the main thread).\n */\n g_hash_table_remove(exports, exp->mountpoint);\n }\n@@ -392,7 +638,7 @@ static void fuse_export_delete(BlockExport *blk_exp)\n fuse_session_destroy(exp->fuse_session);\n }\n \n- free(exp->fuse_buf.mem);\n+ qemu_vfree(exp->req_write_data_cached);\n g_free(exp->mountpoint);\n }\n \n@@ -434,46 +680,101 @@ static bool is_regular_file(const char *path, Error **errp)\n }\n \n /**\n- * A chance to set change some parameters supplied to FUSE_INIT.\n+ * Process FUSE INIT.\n+ * Return the 
number of bytes written to *out on success, and -errno on error.\n */\n-static void fuse_init(void *userdata, struct fuse_conn_info *conn)\n+static ssize_t fuse_init(FuseExport *exp, struct fuse_init_out *out,\n+ const struct fuse_init_in_compat *in)\n {\n+ const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;\n+\n+ if (in->major != 7) {\n+ error_report(\"FUSE major version mismatch: We have 7, but kernel has %\"\n+ PRIu32, in->major);\n+ return -EINVAL;\n+ }\n+\n+ /* 2007's 7.9 added fuse_attr.blksize; working around that would be hard */\n+ if (in->minor < 9) {\n+ error_report(\"FUSE minor version too old: 9 required, but kernel has %\"\n+ PRIu32, in->minor);\n+ return -EINVAL;\n+ }\n+\n+ *out = (struct fuse_init_out) {\n+ .major = 7,\n+ .minor = MIN(FUSE_KERNEL_MINOR_VERSION, in->minor),\n+ .max_readahead = in->max_readahead,\n+ .max_write = FUSE_MAX_WRITE_BYTES,\n+ .flags = in->flags & supported_flags,\n+ .flags2 = 0,\n+\n+ /* libfuse maximum: 2^16 - 1 */\n+ .max_background = UINT16_MAX,\n+\n+ /* libfuse default: max_background * 3 / 4 */\n+ .congestion_threshold = (int)UINT16_MAX * 3 / 4,\n+\n+ /* libfuse default: 1 */\n+ .time_gran = 1,\n+\n+ /*\n+ * probably unneeded without FUSE_MAX_PAGES, but this would be the\n+ * libfuse default\n+ */\n+ .max_pages = DIV_ROUND_UP(FUSE_MAX_WRITE_BYTES,\n+ qemu_real_host_page_size()),\n+\n+ /* Only needed for mappings (i.e. 
DAX) */\n+ .map_alignment = 0,\n+ };\n+\n /*\n- * MIN_NON_ZERO() would not be wrong here, but what we set here\n- * must equal what has been passed to fuse_session_new().\n- * Therefore, as long as max_read must be passed as a mount option\n- * (which libfuse claims will be changed at some point), we have\n- * to set max_read to a fixed value here.\n+ * Before 7.23, fuse_init_out is shorter.\n+ * Drop the tail (time_gran, max_pages, map_alignment).\n */\n- conn->max_read = FUSE_MAX_BOUNCE_BYTES;\n-\n- conn->max_write = MIN_NON_ZERO(BDRV_REQUEST_MAX_BYTES, conn->max_write);\n+ return out->minor >= 23 ? sizeof(*out) : FUSE_COMPAT_22_INIT_OUT_SIZE;\n }\n \n /**\n- * Let clients look up files. Always return ENOENT because we only\n- * care about the mountpoint itself.\n+ * Return some filesystem information, just to not break e.g. `df`.\n */\n-static void fuse_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)\n+static ssize_t fuse_statfs(FuseExport *exp, struct fuse_statfs_out *out)\n {\n- fuse_reply_err(req, ENOENT);\n+ BlockDriverState *root_bs;\n+ uint32_t opt_transfer = 512;\n+\n+ root_bs = blk_bs(exp->common.blk);\n+ if (root_bs) {\n+ opt_transfer = root_bs->bl.opt_transfer;\n+ if (!opt_transfer) {\n+ opt_transfer = root_bs->bl.request_alignment;\n+ }\n+ opt_transfer = MAX(opt_transfer, 512);\n+ }\n+\n+ *out = (struct fuse_statfs_out) {\n+ /* These are the fields libfuse sets by default */\n+ .st = {\n+ .namelen = 255,\n+ .bsize = opt_transfer,\n+ },\n+ };\n+ return sizeof(*out);\n }\n \n /**\n * Let clients get file attributes (i.e., stat() the file).\n+ * Return the number of bytes written to *out on success, and -errno on error.\n */\n-static void fuse_getattr(fuse_req_t req, fuse_ino_t inode,\n- struct fuse_file_info *fi)\n+static ssize_t fuse_getattr(FuseExport *exp, struct fuse_attr_out *out)\n {\n- struct stat statbuf;\n int64_t length, allocated_blocks;\n time_t now = time(NULL);\n- FuseExport *exp = fuse_req_userdata(req);\n \n length = 
blk_getlength(exp->common.blk);\n if (length < 0) {\n- fuse_reply_err(req, -length);\n- return;\n+ return length;\n }\n \n allocated_blocks = bdrv_get_allocated_file_size(blk_bs(exp->common.blk));\n@@ -483,21 +784,24 @@ static void fuse_getattr(fuse_req_t req, fuse_ino_t inode,\n allocated_blocks = DIV_ROUND_UP(allocated_blocks, 512);\n }\n \n- statbuf = (struct stat) {\n- .st_ino = 1,\n- .st_mode = exp->st_mode,\n- .st_nlink = 1,\n- .st_uid = exp->st_uid,\n- .st_gid = exp->st_gid,\n- .st_size = length,\n- .st_blksize = blk_bs(exp->common.blk)->bl.request_alignment,\n- .st_blocks = allocated_blocks,\n- .st_atime = now,\n- .st_mtime = now,\n- .st_ctime = now,\n+ *out = (struct fuse_attr_out) {\n+ .attr_valid = 1,\n+ .attr = {\n+ .ino = 1,\n+ .mode = exp->st_mode,\n+ .nlink = 1,\n+ .uid = exp->st_uid,\n+ .gid = exp->st_gid,\n+ .size = length,\n+ .blksize = blk_bs(exp->common.blk)->bl.request_alignment,\n+ .blocks = allocated_blocks,\n+ .atime = now,\n+ .mtime = now,\n+ .ctime = now,\n+ },\n };\n \n- fuse_reply_attr(req, &statbuf, 1.);\n+ return sizeof(*out);\n }\n \n static int fuse_do_truncate(const FuseExport *exp, int64_t size,\n@@ -550,101 +854,98 @@ static int fuse_do_truncate(const FuseExport *exp, int64_t size,\n * permit access: Read-only exports cannot be given +w, and exports\n * without allow_other cannot be given a different UID or GID, and\n * they cannot be given non-owner access.\n+ * Return the number of bytes written to *out on success, and -errno on error.\n */\n-static void fuse_setattr(fuse_req_t req, fuse_ino_t inode, struct stat *statbuf,\n- int to_set, struct fuse_file_info *fi)\n+static ssize_t fuse_setattr(FuseExport *exp, struct fuse_attr_out *out,\n+ uint32_t to_set, uint64_t size, uint32_t mode,\n+ uint32_t uid, uint32_t gid)\n {\n- FuseExport *exp = fuse_req_userdata(req);\n int supported_attrs;\n int ret;\n \n- supported_attrs = FUSE_SET_ATTR_SIZE | FUSE_SET_ATTR_MODE;\n+ /* SIZE and MODE are actually supported, the others can be safely 
ignored */\n+ supported_attrs = FATTR_SIZE | FATTR_MODE |\n+ FATTR_FH | FATTR_LOCKOWNER | FATTR_KILL_SUIDGID;\n if (exp->allow_other) {\n- supported_attrs |= FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID;\n+ supported_attrs |= FATTR_UID | FATTR_GID;\n }\n \n if (to_set & ~supported_attrs) {\n- fuse_reply_err(req, ENOTSUP);\n- return;\n+ return -ENOTSUP;\n }\n \n /* Do some argument checks first before committing to anything */\n- if (to_set & FUSE_SET_ATTR_MODE) {\n+ if (to_set & FATTR_MODE) {\n /*\n * Without allow_other, non-owners can never access the export, so do\n * not allow setting permissions for them\n */\n- if (!exp->allow_other &&\n- (statbuf->st_mode & (S_IRWXG | S_IRWXO)) != 0)\n- {\n- fuse_reply_err(req, EPERM);\n- return;\n+ if (!exp->allow_other && (mode & (S_IRWXG | S_IRWXO)) != 0) {\n+ return -EPERM;\n }\n \n /* +w for read-only exports makes no sense, disallow it */\n- if (!exp->writable &&\n- (statbuf->st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)\n- {\n- fuse_reply_err(req, EROFS);\n- return;\n+ if (!exp->writable && (mode & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) {\n+ return -EROFS;\n }\n }\n \n- if (to_set & FUSE_SET_ATTR_SIZE) {\n+ if (to_set & FATTR_SIZE) {\n if (!exp->writable) {\n- fuse_reply_err(req, EACCES);\n- return;\n+ return -EACCES;\n }\n \n- ret = fuse_do_truncate(exp, statbuf->st_size, true, PREALLOC_MODE_OFF);\n+ ret = fuse_do_truncate(exp, size, true, PREALLOC_MODE_OFF);\n if (ret < 0) {\n- fuse_reply_err(req, -ret);\n- return;\n+ return ret;\n }\n }\n \n- if (to_set & FUSE_SET_ATTR_MODE) {\n+ if (to_set & FATTR_MODE) {\n /* Ignore FUSE-supplied file type, only change the mode */\n- exp->st_mode = (statbuf->st_mode & 07777) | S_IFREG;\n+ exp->st_mode = (mode & 07777) | S_IFREG;\n }\n \n- if (to_set & FUSE_SET_ATTR_UID) {\n- exp->st_uid = statbuf->st_uid;\n+ if (to_set & FATTR_UID) {\n+ exp->st_uid = uid;\n }\n \n- if (to_set & FUSE_SET_ATTR_GID) {\n- exp->st_gid = statbuf->st_gid;\n+ if (to_set & FATTR_GID) {\n+ exp->st_gid = gid;\n }\n 
\n- fuse_getattr(req, inode, fi);\n+ return fuse_getattr(exp, out);\n }\n \n /**\n- * Let clients open a file (i.e., the exported image).\n+ * Open an inode. We only have a single inode in our exported filesystem, so we\n+ * just acknowledge the request.\n+ * Return the number of bytes written to *out on success, and -errno on error.\n */\n-static void fuse_open(fuse_req_t req, fuse_ino_t inode,\n- struct fuse_file_info *fi)\n+static ssize_t fuse_open(FuseExport *exp, struct fuse_open_out *out)\n {\n- fi->direct_io = true;\n- fi->parallel_direct_writes = true;\n- fuse_reply_open(req, fi);\n+ *out = (struct fuse_open_out) {\n+ .open_flags = FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES,\n+ };\n+ return sizeof(*out);\n }\n \n /**\n- * Handle client reads from the exported image.\n+ * Handle client reads from the exported image. Allocates *bufptr and reads\n+ * data from the block device into that buffer.\n+ * Returns the buffer (read) size on success, and -errno on error.\n+ * After use, *bufptr must be freed via qemu_vfree().\n */\n-static void fuse_read(fuse_req_t req, fuse_ino_t inode,\n- size_t size, off_t offset, struct fuse_file_info *fi)\n+static ssize_t fuse_read(FuseExport *exp, void **bufptr,\n+ uint64_t offset, uint32_t size)\n {\n- FuseExport *exp = fuse_req_userdata(req);\n int64_t blk_len;\n void *buf;\n int ret;\n \n /* Limited by max_read, should not happen */\n- if (size > FUSE_MAX_BOUNCE_BYTES) {\n- fuse_reply_err(req, EINVAL);\n- return;\n+ if (size > FUSE_MAX_READ_BYTES) {\n+ return -EINVAL;\n }\n \n /**\n@@ -653,18 +954,12 @@ static void fuse_read(fuse_req_t req, fuse_ino_t inode,\n */\n blk_len = blk_getlength(exp->common.blk);\n if (blk_len < 0) {\n- fuse_reply_err(req, -blk_len);\n- return;\n+ return blk_len;\n }\n \n if (offset >= blk_len) {\n- /*\n- * Technically libfuse does not allow returning a zero error code for\n- * read requests, but in practice this is a 0-length read (and a future\n- * commit will change this code anyway)\n- */\n- 
fuse_reply_err(req, 0);\n- return;\n+ *bufptr = NULL;\n+ return 0;\n }\n \n if (offset + size > blk_len) {\n@@ -673,108 +968,96 @@ static void fuse_read(fuse_req_t req, fuse_ino_t inode,\n \n buf = qemu_try_blockalign(blk_bs(exp->common.blk), size);\n if (!buf) {\n- fuse_reply_err(req, ENOMEM);\n- return;\n+ return -ENOMEM;\n }\n \n ret = blk_pread(exp->common.blk, offset, size, buf, 0);\n- if (ret >= 0) {\n- fuse_reply_buf(req, buf, size);\n- } else {\n- fuse_reply_err(req, -ret);\n+ if (ret < 0) {\n+ qemu_vfree(buf);\n+ return ret;\n }\n \n- qemu_vfree(buf);\n+ *bufptr = buf;\n+ return size;\n }\n \n /**\n- * Handle client writes to the exported image.\n+ * Handle client writes to the exported image. @buf has the data to be written.\n+ * Return the number of bytes written to *out on success, and -errno on error.\n */\n-static void fuse_write(fuse_req_t req, fuse_ino_t inode, const char *buf,\n- size_t size, off_t offset, struct fuse_file_info *fi)\n+static ssize_t fuse_write(FuseExport *exp, struct fuse_write_out *out,\n+ uint64_t offset, uint32_t size, const void *buf)\n {\n- FuseExport *exp = fuse_req_userdata(req);\n- QEMU_AUTO_VFREE void *copied = NULL;\n int64_t blk_len;\n int ret;\n \n+ QEMU_BUILD_BUG_ON(FUSE_MAX_WRITE_BYTES > BDRV_REQUEST_MAX_BYTES);\n /* Limited by max_write, should not happen */\n- if (size > BDRV_REQUEST_MAX_BYTES) {\n- fuse_reply_err(req, EINVAL);\n- return;\n+ if (size > FUSE_MAX_WRITE_BYTES) {\n+ return -EINVAL;\n }\n \n if (!exp->writable) {\n- fuse_reply_err(req, EACCES);\n- return;\n+ return -EACCES;\n }\n \n- /*\n- * Heed the note on read_from_fuse_export(): If we call aio_poll() (which\n- * any blk_*() I/O function may do), read_from_fuse_export() may be nested,\n- * overwriting the request buffer content. 
Therefore, we must copy it here.\n- */\n- copied = blk_blockalign(exp->common.blk, size);\n- memcpy(copied, buf, size);\n-\n /**\n * Clients will expect short writes at EOF, so we have to limit\n * offset+size to the image length.\n */\n blk_len = blk_getlength(exp->common.blk);\n if (blk_len < 0) {\n- fuse_reply_err(req, -blk_len);\n- return;\n+ return blk_len;\n }\n \n if (offset >= blk_len && !exp->growable) {\n- fuse_reply_write(req, 0);\n- return;\n+ *out = (struct fuse_write_out) {\n+ .size = 0,\n+ };\n+ return sizeof(*out);\n }\n \n if (offset + size < offset) {\n- fuse_reply_err(req, EINVAL);\n- return;\n+ return -EINVAL;\n } else if (offset + size > blk_len) {\n if (exp->growable) {\n ret = fuse_do_truncate(exp, offset + size, true, PREALLOC_MODE_OFF);\n if (ret < 0) {\n- fuse_reply_err(req, -ret);\n- return;\n+ return ret;\n }\n } else {\n size = blk_len - offset;\n }\n }\n \n- ret = blk_pwrite(exp->common.blk, offset, size, copied, 0);\n- if (ret >= 0) {\n- fuse_reply_write(req, size);\n- } else {\n- fuse_reply_err(req, -ret);\n+ ret = blk_pwrite(exp->common.blk, offset, size, buf, 0);\n+ if (ret < 0) {\n+ return ret;\n }\n+\n+ *out = (struct fuse_write_out) {\n+ .size = size,\n+ };\n+ return sizeof(*out);\n }\n \n /**\n * Let clients perform various fallocate() operations.\n+ * Return 0 on success (no 'out' object), and -errno on error.\n */\n-static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode,\n- off_t offset, off_t length,\n- struct fuse_file_info *fi)\n+static ssize_t fuse_fallocate(FuseExport *exp, uint64_t offset, uint64_t length,\n+ uint32_t mode)\n {\n- FuseExport *exp = fuse_req_userdata(req);\n int64_t blk_len;\n int ret;\n \n if (!exp->writable) {\n- fuse_reply_err(req, EACCES);\n- return;\n+ return -EACCES;\n }\n \n blk_len = blk_getlength(exp->common.blk);\n if (blk_len < 0) {\n- fuse_reply_err(req, -blk_len);\n- return;\n+ return blk_len;\n }\n \n #ifdef CONFIG_FALLOCATE_PUNCH_HOLE\n@@ -786,16 +1069,14 @@ static void 
fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode,\n if (!mode) {\n /* We can only fallocate at the EOF with a truncate */\n if (offset < blk_len) {\n- fuse_reply_err(req, EOPNOTSUPP);\n- return;\n+ return -EOPNOTSUPP;\n }\n \n if (offset > blk_len) {\n /* No preallocation needed here */\n ret = fuse_do_truncate(exp, offset, true, PREALLOC_MODE_OFF);\n if (ret < 0) {\n- fuse_reply_err(req, -ret);\n- return;\n+ return ret;\n }\n }\n \n@@ -805,8 +1086,7 @@ static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode,\n #ifdef CONFIG_FALLOCATE_PUNCH_HOLE\n else if (mode & FALLOC_FL_PUNCH_HOLE) {\n if (!(mode & FALLOC_FL_KEEP_SIZE)) {\n- fuse_reply_err(req, EINVAL);\n- return;\n+ return -EINVAL;\n }\n \n do {\n@@ -834,8 +1114,7 @@ static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode,\n ret = fuse_do_truncate(exp, offset + length, false,\n PREALLOC_MODE_OFF);\n if (ret < 0) {\n- fuse_reply_err(req, -ret);\n- return;\n+ return ret;\n }\n }\n \n@@ -853,44 +1132,38 @@ static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode,\n ret = -EOPNOTSUPP;\n }\n \n- fuse_reply_err(req, ret < 0 ? -ret : 0);\n+ return ret < 0 ? ret : 0;\n }\n \n /**\n * Let clients fsync the exported image.\n+ * Return 0 on success (no 'out' object), and -errno on error.\n */\n-static void fuse_fsync(fuse_req_t req, fuse_ino_t inode, int datasync,\n- struct fuse_file_info *fi)\n+static ssize_t fuse_fsync(FuseExport *exp)\n {\n- FuseExport *exp = fuse_req_userdata(req);\n- int ret;\n-\n- ret = blk_flush(exp->common.blk);\n- fuse_reply_err(req, ret < 0 ? -ret : 0);\n+ return blk_flush(exp->common.blk);\n }\n \n /**\n * Called before an FD to the exported image is closed. 
(libfuse\n * notes this to be a way to return last-minute errors.)\n+ * Return 0 on success (no 'out' object), and -errno on error.\n */\n-static void fuse_flush(fuse_req_t req, fuse_ino_t inode,\n- struct fuse_file_info *fi)\n+static ssize_t fuse_flush(FuseExport *exp)\n {\n- fuse_fsync(req, inode, 1, fi);\n+ return blk_flush(exp->common.blk);\n }\n \n #ifdef CONFIG_FUSE_LSEEK\n /**\n * Let clients inquire allocation status.\n+ * Return the number of bytes written to *out on success, and -errno on error.\n */\n-static void fuse_lseek(fuse_req_t req, fuse_ino_t inode, off_t offset,\n- int whence, struct fuse_file_info *fi)\n+static ssize_t fuse_lseek(FuseExport *exp, struct fuse_lseek_out *out,\n+ uint64_t offset, uint32_t whence)\n {\n- FuseExport *exp = fuse_req_userdata(req);\n-\n if (whence != SEEK_HOLE && whence != SEEK_DATA) {\n- fuse_reply_err(req, EINVAL);\n- return;\n+ return -EINVAL;\n }\n \n while (true) {\n@@ -900,8 +1173,7 @@ static void fuse_lseek(fuse_req_t req, fuse_ino_t inode, off_t offset,\n ret = bdrv_block_status_above(blk_bs(exp->common.blk), NULL,\n offset, INT64_MAX, &pnum, NULL, NULL);\n if (ret < 0) {\n- fuse_reply_err(req, -ret);\n- return;\n+ return ret;\n }\n \n if (!pnum && (ret & BDRV_BLOCK_EOF)) {\n@@ -918,34 +1190,38 @@ static void fuse_lseek(fuse_req_t req, fuse_ino_t inode, off_t offset,\n \n blk_len = blk_getlength(exp->common.blk);\n if (blk_len < 0) {\n- fuse_reply_err(req, -blk_len);\n- return;\n+ return blk_len;\n }\n \n if (offset > blk_len || whence == SEEK_DATA) {\n- fuse_reply_err(req, ENXIO);\n- } else {\n- fuse_reply_lseek(req, offset);\n+ return -ENXIO;\n }\n- return;\n+\n+ *out = (struct fuse_lseek_out) {\n+ .offset = offset,\n+ };\n+ return sizeof(*out);\n }\n \n if (ret & BDRV_BLOCK_DATA) {\n if (whence == SEEK_DATA) {\n- fuse_reply_lseek(req, offset);\n- return;\n+ *out = (struct fuse_lseek_out) {\n+ .offset = offset,\n+ };\n+ return sizeof(*out);\n }\n } else {\n if (whence == SEEK_HOLE) {\n- fuse_reply_lseek(req, 
offset);\n- return;\n+ *out = (struct fuse_lseek_out) {\n+ .offset = offset,\n+ };\n+ return sizeof(*out);\n }\n }\n \n /* Safety check against infinite loops */\n if (!pnum) {\n- fuse_reply_err(req, ENXIO);\n- return;\n+ return -ENXIO;\n }\n \n offset += pnum;\n@@ -953,21 +1229,241 @@ static void fuse_lseek(fuse_req_t req, fuse_ino_t inode, off_t offset,\n }\n #endif\n \n-static const struct fuse_lowlevel_ops fuse_ops = {\n- .init = fuse_init,\n- .lookup = fuse_lookup,\n- .getattr = fuse_getattr,\n- .setattr = fuse_setattr,\n- .open = fuse_open,\n- .read = fuse_read,\n- .write = fuse_write,\n- .fallocate = fuse_fallocate,\n- .flush = fuse_flush,\n- .fsync = fuse_fsync,\n+/**\n+ * Write a FUSE response to the given @fd.\n+ *\n+ * Effectively, writes out_hdr->common.len bytes of the buffer that is *out_hdr.\n+ *\n+ * @fd: FUSE file descriptor\n+ * @out_hdr: Request response header and request-specific response data\n+ */\n+static int fuse_write_response(int fd, FuseRequestOutHeader *out_hdr)\n+{\n+ size_t to_write = out_hdr->common.len;\n+ ssize_t ret;\n+\n+ /* Must at least write fuse_out_header */\n+ assert(to_write >= sizeof(out_hdr->common));\n+\n+ ret = RETRY_ON_EINTR(write(fd, out_hdr, to_write));\n+ if (ret < 0) {\n+ ret = -errno;\n+ error_report(\"Failed to write to FUSE device: %s\", strerror(-ret));\n+ return ret;\n+ }\n+\n+ /* Short writes are unexpected, treat them as errors */\n+ if (ret != to_write) {\n+ error_report(\"Short write to FUSE device, wrote %zi of %zu bytes\",\n+ ret, to_write);\n+ return -EIO;\n+ }\n+\n+ return 0;\n+}\n+\n+/**\n+ * Write a FUSE error response to @fd.\n+ *\n+ * @fd: FUSE file descriptor\n+ * @in_hdr: Incoming request header to which to respond\n+ * @err: Error code (-errno, must be negative!)\n+ */\n+static int fuse_write_err(int fd, const struct fuse_in_header *in_hdr, int err)\n+{\n+ FuseRequestOutHeader out_hdr = {\n+ .common = {\n+ .len = sizeof(out_hdr.common),\n+ /* FUSE expects negative error values */\n+ .error = 
err,\n+ .unique = in_hdr->unique,\n+ },\n+ };\n+\n+ return fuse_write_response(fd, &out_hdr);\n+}\n+\n+/**\n+ * Write a FUSE response to the given @fd, using separate buffers for the\n+ * response header and data.\n+ *\n+ * In contrast to fuse_write_response(), this function cannot return a full\n+ * FuseRequestOutHeader (i.e. including request-specific response structs),\n+ * but only FuseRequestOutHeader.common. The remaining data must be in\n+ * *buf.\n+ *\n+ * (Total length must be set in out_hdr->len.)\n+ *\n+ * @fd: FUSE file descriptor\n+ * @out_hdr: Request response header\n+ * @buf: Pointer to response data\n+ */\n+static int fuse_write_buf_response(int fd,\n+ const struct fuse_out_header *out_hdr,\n+ const void *buf)\n+{\n+ size_t to_write = out_hdr->len;\n+ struct iovec iov[2] = {\n+ { (void *)out_hdr, sizeof(*out_hdr) },\n+ { (void *)buf, to_write - sizeof(*out_hdr) },\n+ };\n+ ssize_t ret;\n+\n+ /* *buf length must not be negative */\n+ assert(to_write >= sizeof(*out_hdr));\n+\n+ ret = RETRY_ON_EINTR(writev(fd, iov, ARRAY_SIZE(iov)));\n+ if (ret < 0) {\n+ ret = -errno;\n+ error_report(\"Failed to write to FUSE device: %s\", strerror(-ret));\n+ return ret;\n+ }\n+\n+ /* Short writes are unexpected, treat them as errors */\n+ if (ret != to_write) {\n+ error_report(\"Short write to FUSE device, wrote %zi of %zu bytes\",\n+ ret, to_write);\n+ return -EIO;\n+ }\n+\n+ return 0;\n+}\n+\n+/**\n+ * Process a FUSE request, incl. 
writing the response.\n+ */\n+static void fuse_process_request(FuseExport *exp,\n+ const FuseRequestInHeader *in_hdr,\n+ const void *data_buffer)\n+{\n+ FuseRequestOutHeader out_hdr;\n+ /* For read requests: Data to be returned */\n+ void *out_data_buffer = NULL;\n+ ssize_t ret;\n+\n+ switch (in_hdr->common.opcode) {\n+ case FUSE_INIT:\n+ ret = fuse_init(exp, &out_hdr.init, &in_hdr->init);\n+ break;\n+\n+ case FUSE_DESTROY:\n+ ret = 0;\n+ break;\n+\n+ case FUSE_STATFS:\n+ ret = fuse_statfs(exp, &out_hdr.statfs);\n+ break;\n+\n+ case FUSE_OPEN:\n+ ret = fuse_open(exp, &out_hdr.open);\n+ break;\n+\n+ case FUSE_RELEASE:\n+ ret = 0;\n+ break;\n+\n+ case FUSE_LOOKUP:\n+ ret = -ENOENT; /* There is no node but the root node */\n+ break;\n+\n+ case FUSE_FORGET:\n+ case FUSE_BATCH_FORGET:\n+ /* These have no response, and there is nothing we need to do */\n+ return;\n+\n+ case FUSE_GETATTR:\n+ ret = fuse_getattr(exp, &out_hdr.attr);\n+ break;\n+\n+ case FUSE_SETATTR: {\n+ const struct fuse_setattr_in *in = &in_hdr->setattr;\n+ ret = fuse_setattr(exp, &out_hdr.attr,\n+ in->valid, in->size, in->mode, in->uid, in->gid);\n+ break;\n+ }\n+\n+ case FUSE_READ: {\n+ const struct fuse_read_in *in = &in_hdr->read;\n+ ret = fuse_read(exp, &out_data_buffer, in->offset, in->size);\n+ break;\n+ }\n+\n+ case FUSE_WRITE: {\n+ const struct fuse_write_in *in = &in_hdr->write;\n+ uint32_t req_len = in_hdr->common.len;\n+\n+ if (unlikely(req_len < sizeof(in_hdr->common) + sizeof(*in) +\n+ in->size)) {\n+ warn_report(\"FUSE WRITE truncated; received %zu bytes of %\" PRIu32,\n+ req_len - sizeof(in_hdr->common) - sizeof(*in),\n+ in->size);\n+ ret = -EINVAL;\n+ break;\n+ }\n+\n+ /*\n+ * read_from_fuse_fd() has checked that in_hdr->len matches the number\n+ * of bytes read, which cannot exceed the max_write value we set\n+ * (FUSE_MAX_WRITE_BYTES). 
So we know that FUSE_MAX_WRITE_BYTES >=\n+ * in_hdr->len >= in->size + X, so this assertion must hold.\n+ */\n+ assert(in->size <= FUSE_MAX_WRITE_BYTES);\n+\n+ ret = fuse_write(exp, &out_hdr.write,\n+ in->offset, in->size, data_buffer);\n+ break;\n+ }\n+\n+ case FUSE_FALLOCATE: {\n+ const struct fuse_fallocate_in *in = &in_hdr->fallocate;\n+ ret = fuse_fallocate(exp, in->offset, in->length, in->mode);\n+ break;\n+ }\n+\n+ case FUSE_FSYNC:\n+ ret = fuse_fsync(exp);\n+ break;\n+\n+ case FUSE_FLUSH:\n+ ret = fuse_flush(exp);\n+ break;\n+\n #ifdef CONFIG_FUSE_LSEEK\n- .lseek = fuse_lseek,\n+ case FUSE_LSEEK: {\n+ const struct fuse_lseek_in *in = &in_hdr->lseek;\n+ ret = fuse_lseek(exp, &out_hdr.lseek, in->offset, in->whence);\n+ break;\n+ }\n #endif\n-};\n+\n+ default:\n+ ret = -ENOSYS;\n+ }\n+\n+ if (ret >= 0) {\n+ out_hdr.common = (struct fuse_out_header) {\n+ .len = sizeof(out_hdr.common) + ret,\n+ .unique = in_hdr->common.unique,\n+ };\n+ } else {\n+ /* fuse_read() must not return a buffer in case of error */\n+ assert(out_data_buffer == NULL);\n+\n+ out_hdr.common = (struct fuse_out_header) {\n+ .len = sizeof(out_hdr.common),\n+ /* FUSE expects negative errno values */\n+ .error = ret,\n+ .unique = in_hdr->common.unique,\n+ };\n+ }\n+\n+ if (out_data_buffer) {\n+ fuse_write_buf_response(exp->fuse_fd, &out_hdr.common, out_data_buffer);\n+ qemu_vfree(out_data_buffer);\n+ } else {\n+ fuse_write_response(exp->fuse_fd, &out_hdr);\n+ }\n+}\n \n const BlockExportDriver blk_exp_fuse = {\n .type = BLOCK_EXPORT_TYPE_FUSE,\n", "prefixes": [ "v4", "16/24" ] }