Patch Detail
get:
Show a patch.
patch:
Update a patch.
put:
Update a patch.
GET /api/patches/1839914/?format=api
{ "id": 1839914, "url": "http://patchwork.ozlabs.org/api/patches/1839914/?format=api", "web_url": "http://patchwork.ozlabs.org/project/qemu-devel/patch/20230926185738.277351-17-david@redhat.com/", "project": { "id": 14, "url": "http://patchwork.ozlabs.org/api/projects/14/?format=api", "name": "QEMU Development", "link_name": "qemu-devel", "list_id": "qemu-devel.nongnu.org", "list_email": "qemu-devel@nongnu.org", "web_url": "", "scm_url": "", "webscm_url": "", "list_archive_url": "", "list_archive_url_format": "", "commit_url_format": "" }, "msgid": "<20230926185738.277351-17-david@redhat.com>", "list_archive_url": null, "date": "2023-09-26T18:57:36", "name": "[v4,16/18] virtio-mem: Expose device memory dynamically via multiple memslots if enabled", "commit_ref": null, "pull_url": null, "state": "new", "archived": false, "hash": "3cd2fea210b86b8fc90148937e06bcfdaac8f1ae", "submitter": { "id": 70402, "url": "http://patchwork.ozlabs.org/api/people/70402/?format=api", "name": "David Hildenbrand", "email": "david@redhat.com" }, "delegate": null, "mbox": "http://patchwork.ozlabs.org/project/qemu-devel/patch/20230926185738.277351-17-david@redhat.com/mbox/", "series": [ { "id": 374991, "url": "http://patchwork.ozlabs.org/api/series/374991/?format=api", "web_url": "http://patchwork.ozlabs.org/project/qemu-devel/list/?series=374991", "date": "2023-09-26T18:57:23", "name": "virtio-mem: Expose device memory through multiple memslots", "version": 4, "mbox": "http://patchwork.ozlabs.org/series/374991/mbox/" } ], "comments": "http://patchwork.ozlabs.org/api/patches/1839914/comments/", "check": "pending", "checks": "http://patchwork.ozlabs.org/api/patches/1839914/checks/", "tags": {}, "related": [], "headers": { "Return-Path": "<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>", "X-Original-To": "incoming@patchwork.ozlabs.org", "Delivered-To": "patchwork-incoming@legolas.ozlabs.org", "Authentication-Results": [ "legolas.ozlabs.org;\n\tdkim=pass (1024-bit key;\n 
unprotected) header.d=redhat.com header.i=@redhat.com header.a=rsa-sha256\n header.s=mimecast20190719 header.b=YSNCI/kk;\n\tdkim-atps=neutral", "legolas.ozlabs.org;\n spf=pass (sender SPF authorized) smtp.mailfrom=nongnu.org\n (client-ip=209.51.188.17; helo=lists.gnu.org;\n envelope-from=qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org;\n receiver=patchwork.ozlabs.org)" ], "Received": [ "from lists.gnu.org (lists.gnu.org [209.51.188.17])\n\t(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))\n\t(No client certificate requested)\n\tby legolas.ozlabs.org (Postfix) with ESMTPS id 4Rw8Hb18vCz1ynX\n\tfor <incoming@patchwork.ozlabs.org>; Wed, 27 Sep 2023 05:02:03 +1000 (AEST)", "from localhost ([::1] helo=lists1p.gnu.org)\n\tby lists.gnu.org with esmtp (Exim 4.90_1)\n\t(envelope-from <qemu-devel-bounces@nongnu.org>)\n\tid 1qlDIF-0002Dr-NZ; Tue, 26 Sep 2023 15:00:24 -0400", "from eggs.gnu.org ([2001:470:142:3::10])\n by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256)\n (Exim 4.90_1) (envelope-from <david@redhat.com>) id 1qlDHv-0001oC-PN\n for qemu-devel@nongnu.org; Tue, 26 Sep 2023 15:00:11 -0400", "from us-smtp-delivery-124.mimecast.com ([170.10.133.124])\n by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256)\n (Exim 4.90_1) (envelope-from <david@redhat.com>) id 1qlDHs-0003Fw-Fq\n for qemu-devel@nongnu.org; Tue, 26 Sep 2023 15:00:03 -0400", "from mimecast-mx02.redhat.com (mimecast-mx02.redhat.com\n [66.187.233.88]) by relay.mimecast.com with ESMTP with STARTTLS\n (version=TLSv1.2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id\n us-mta-80-9MIqkCUJMXW4b70zUgM2KQ-1; Tue, 26 Sep 2023 14:59:58 -0400", "from smtp.corp.redhat.com (int-mx04.intmail.prod.int.rdu2.redhat.com\n [10.11.54.4])\n (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))\n (No client certificate requested)\n by mimecast-mx02.redhat.com (Postfix) with ESMTPS id 929DA811E7D;\n Tue, 26 Sep 2023 18:59:57 +0000 (UTC)", 
"from t14s.fritz.box (unknown [10.39.192.33])\n by smtp.corp.redhat.com (Postfix) with ESMTP id B62362026D4B;\n Tue, 26 Sep 2023 18:59:52 +0000 (UTC)" ], "DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com;\n s=mimecast20190719; t=1695754799;\n h=from:from:reply-to:subject:subject:date:date:message-id:message-id:\n to:to:cc:cc:mime-version:mime-version:\n content-transfer-encoding:content-transfer-encoding:\n in-reply-to:in-reply-to:references:references;\n bh=vV4PQc9MX5cpMSAkjQdv/L1WcUiVtYlZmV5E59fKZWI=;\n b=YSNCI/kkJxhnJY5oxKMf+OU5YIHd5nd9zjju1sIUio1qaFcHJZxpEmYkM4gvt2K/EF2wsH\n FVhIIR2i7cmPaT0KNLbK3uENXlaMgc9xrTdCA2ujqnTUbeB3OVdMwakY5LP3cpRGr+tjZu\n uIcvPqCcTQ2JP1nk0PKrrqZC28/3oKA=", "X-MC-Unique": "9MIqkCUJMXW4b70zUgM2KQ-1", "From": "David Hildenbrand <david@redhat.com>", "To": "qemu-devel@nongnu.org", "Cc": "David Hildenbrand <david@redhat.com>, Paolo Bonzini <pbonzini@redhat.com>,\n Igor Mammedov <imammedo@redhat.com>,\n Xiao Guangrong <xiaoguangrong.eric@gmail.com>,\n \"Michael S. Tsirkin\" <mst@redhat.com>, Peter Xu <peterx@redhat.com>,\n\t=?utf-8?q?Philippe_Mathieu-Daud=C3=A9?= <philmd@linaro.org>,\n Eduardo Habkost <eduardo@habkost.net>,\n Marcel Apfelbaum <marcel.apfelbaum@gmail.com>,\n Yanan Wang <wangyanan55@huawei.com>, Michal Privoznik <mprivozn@redhat.com>,\n\t=?utf-8?q?Daniel_P_=2E_Berrang=C3=A9?= <berrange@redhat.com>,\n Gavin Shan <gshan@redhat.com>, Alex Williamson <alex.williamson@redhat.com>,\n Stefan Hajnoczi <stefanha@redhat.com>,\n \"Maciej S . 
Szmigiero\" <mail@maciej.szmigiero.name>, kvm@vger.kernel.org", "Subject": "[PATCH v4 16/18] virtio-mem: Expose device memory dynamically via\n multiple memslots if enabled", "Date": "Tue, 26 Sep 2023 20:57:36 +0200", "Message-ID": "<20230926185738.277351-17-david@redhat.com>", "In-Reply-To": "<20230926185738.277351-1-david@redhat.com>", "References": "<20230926185738.277351-1-david@redhat.com>", "MIME-Version": "1.0", "Content-Transfer-Encoding": "8bit", "X-Scanned-By": "MIMEDefang 3.1 on 10.11.54.4", "Received-SPF": "pass client-ip=170.10.133.124; envelope-from=david@redhat.com;\n helo=us-smtp-delivery-124.mimecast.com", "X-Spam_score_int": "-20", "X-Spam_score": "-2.1", "X-Spam_bar": "--", "X-Spam_report": "(-2.1 / 5.0 requ) BAYES_00=-1.9, DKIMWL_WL_HIGH=-0.001,\n DKIM_SIGNED=0.1, DKIM_VALID=-0.1, DKIM_VALID_AU=-0.1, DKIM_VALID_EF=-0.1,\n RCVD_IN_DNSWL_NONE=-0.0001, RCVD_IN_MSPIKE_H3=0.001, RCVD_IN_MSPIKE_WL=0.001,\n SPF_HELO_NONE=0.001, SPF_PASS=-0.001 autolearn=ham autolearn_force=no", "X-Spam_action": "no action", "X-BeenThere": "qemu-devel@nongnu.org", "X-Mailman-Version": "2.1.29", "Precedence": "list", "List-Id": "<qemu-devel.nongnu.org>", "List-Unsubscribe": "<https://lists.nongnu.org/mailman/options/qemu-devel>,\n <mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>", "List-Archive": "<https://lists.nongnu.org/archive/html/qemu-devel>", "List-Post": "<mailto:qemu-devel@nongnu.org>", "List-Help": "<mailto:qemu-devel-request@nongnu.org?subject=help>", "List-Subscribe": "<https://lists.nongnu.org/mailman/listinfo/qemu-devel>,\n <mailto:qemu-devel-request@nongnu.org?subject=subscribe>", "Errors-To": "qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org", "Sender": "qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org" }, "content": "Having large virtio-mem devices that only expose little memory to a VM\nis currently a problem: we map the whole sparse memory region into the\nguest using a single memslot, resulting in one gigantic memslot 
in KVM.\nKVM allocates metadata for the whole memslot, which can result in quite\nsome memory waste.\n\nAssuming we have a 1 TiB virtio-mem device and only expose little (e.g.,\n1 GiB) memory, we would create a single 1 TiB memslot and KVM has to\nallocate metadata for that 1 TiB memslot: on x86, this implies allocating\na significant amount of memory for metadata:\n\n(1) RMAP: 8 bytes per 4 KiB, 8 bytes per 2 MiB, 8 bytes per 1 GiB\n -> For 1 TiB: 2147483648 + 4194304 + 8192 = ~ 2 GiB (0.2 %)\n\n With the TDP MMU (cat /sys/module/kvm/parameters/tdp_mmu) this gets\n allocated lazily when required for nested VMs\n(2) gfn_track: 2 bytes per 4 KiB\n -> For 1 TiB: 536870912 = ~512 MiB (0.05 %)\n(3) lpage_info: 4 bytes per 2 MiB, 4 bytes per 1 GiB\n -> For 1 TiB: 2097152 + 4096 = ~2 MiB (0.0002 %)\n(4) 2x dirty bitmaps for tracking: 2x 1 bit per 4 KiB page\n -> For 1 TiB: 536870912 = 64 MiB (0.006 %)\n\nSo we primarily care about (1) and (2). The bad thing is, that the\nmemory consumption *doubles* once SMM is enabled, because we create the\nmemslot once for !SMM and once for SMM.\n\nHaving a 1 TiB memslot without the TDP MMU consumes around:\n* With SMM: 5 GiB\n* Without SMM: 2.5 GiB\nHaving a 1 TiB memslot with the TDP MMU consumes around:\n* With SMM: 1 GiB\n* Without SMM: 512 MiB\n\n... and that's really something we want to optimize, to be able to just\nstart a VM with small boot memory (e.g., 4 GiB) and a virtio-mem device\nthat can grow very large (e.g., 1 TiB).\n\nConsequently, using multiple memslots and only mapping the memslots we\nreally need can significantly reduce memory waste and speed up\nmemslot-related operations. 
Let's expose the sparse RAM memory region using\nmultiple memslots, mapping only the memslots we currently need into our\ndevice memory region container.\n\nThe feature can be enabled using \"dynamic-memslots=on\" and requires\n\"unplugged-inaccessible=on\", which is nowadays the default.\n\nOnce enabled, we'll auto-detect the number of memslots to use based on the\nmemslot limit provided by the core. We'll use at most 1 memslot per\ngigabyte. Note that our global limit of memslots across all memory devices\nis currently set to 256: even with multiple large virtio-mem devices,\nwe'd still have a sane limit on the number of memslots used.\n\nThe default is to not dynamically map memslots for now\n(\"dynamic-memslots=off\"). The optimization must be enabled manually,\nbecause some vhost setups (e.g., hotplug of vhost-user devices) might be\nproblematic until we support more memslots especially in vhost-user backends.\n\nNote that \"dynamic-memslots=on\" is just a hint that multiple memslots\n*may* be used for internal optimizations, not that multiple memslots\n*must* be used. The actual number of memslots that are used is an\ninternal detail: for example, once memslot metadata is no longer an\nissue, we could simply stop optimizing for that. 
Migration source and\ndestination can differ on the setting of \"dynamic-memslots\".\n\nSigned-off-by: David Hildenbrand <david@redhat.com>\n---\n hw/virtio/virtio-mem-pci.c | 21 +++\n hw/virtio/virtio-mem.c | 288 +++++++++++++++++++++++++++++++++\n include/hw/virtio/virtio-mem.h | 32 +++-\n 3 files changed, 340 insertions(+), 1 deletion(-)", "diff": "diff --git a/hw/virtio/virtio-mem-pci.c b/hw/virtio/virtio-mem-pci.c\nindex c4597e029e..1b4e9a3284 100644\n--- a/hw/virtio/virtio-mem-pci.c\n+++ b/hw/virtio/virtio-mem-pci.c\n@@ -48,6 +48,25 @@ static MemoryRegion *virtio_mem_pci_get_memory_region(MemoryDeviceState *md,\n return vmc->get_memory_region(vmem, errp);\n }\n \n+static void virtio_mem_pci_decide_memslots(MemoryDeviceState *md,\n+ unsigned int limit)\n+{\n+ VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);\n+ VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev);\n+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);\n+\n+ vmc->decide_memslots(vmem, limit);\n+}\n+\n+static unsigned int virtio_mem_pci_get_memslots(MemoryDeviceState *md)\n+{\n+ VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);\n+ VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev);\n+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);\n+\n+ return vmc->get_memslots(vmem);\n+}\n+\n static uint64_t virtio_mem_pci_get_plugged_size(const MemoryDeviceState *md,\n Error **errp)\n {\n@@ -150,6 +169,8 @@ static void virtio_mem_pci_class_init(ObjectClass *klass, void *data)\n mdc->set_addr = virtio_mem_pci_set_addr;\n mdc->get_plugged_size = virtio_mem_pci_get_plugged_size;\n mdc->get_memory_region = virtio_mem_pci_get_memory_region;\n+ mdc->decide_memslots = virtio_mem_pci_decide_memslots;\n+ mdc->get_memslots = virtio_mem_pci_get_memslots;\n mdc->fill_device_info = virtio_mem_pci_fill_device_info;\n mdc->get_min_alignment = virtio_mem_pci_get_min_alignment;\n \ndiff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c\nindex 0cf47df9cf..e1e4250e69 100644\n--- a/hw/virtio/virtio-mem.c\n+++ b/hw/virtio/virtio-mem.c\n@@ -66,6 
+66,13 @@ static uint32_t virtio_mem_default_thp_size(void)\n return default_thp_size;\n }\n \n+/*\n+ * The minimum memslot size depends on this setting (\"sane default\"), the\n+ * device block size, and the memory backend page size. The last (or single)\n+ * memslot might be smaller than this constant.\n+ */\n+#define VIRTIO_MEM_MIN_MEMSLOT_SIZE (1 * GiB)\n+\n /*\n * We want to have a reasonable default block size such that\n * 1. We avoid splitting THPs when unplugging memory, which degrades\n@@ -483,6 +490,96 @@ static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa,\n return true;\n }\n \n+static void virtio_mem_activate_memslot(VirtIOMEM *vmem, unsigned int idx)\n+{\n+ const uint64_t memslot_offset = idx * vmem->memslot_size;\n+\n+ assert(vmem->memslots);\n+\n+ /*\n+ * Instead of enabling/disabling memslots, we add/remove them. This should\n+ * make address space updates faster, because we don't have to loop over\n+ * many disabled subregions.\n+ */\n+ if (memory_region_is_mapped(&vmem->memslots[idx])) {\n+ return;\n+ }\n+ memory_region_add_subregion(vmem->mr, memslot_offset, &vmem->memslots[idx]);\n+}\n+\n+static void virtio_mem_deactivate_memslot(VirtIOMEM *vmem, unsigned int idx)\n+{\n+ assert(vmem->memslots);\n+\n+ if (!memory_region_is_mapped(&vmem->memslots[idx])) {\n+ return;\n+ }\n+ memory_region_del_subregion(vmem->mr, &vmem->memslots[idx]);\n+}\n+\n+static void virtio_mem_activate_memslots_to_plug(VirtIOMEM *vmem,\n+ uint64_t offset, uint64_t size)\n+{\n+ const unsigned int start_idx = offset / vmem->memslot_size;\n+ const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) /\n+ vmem->memslot_size;\n+ unsigned int idx;\n+\n+ if (!vmem->dynamic_memslots) {\n+ return;\n+ }\n+\n+ /* Activate all involved memslots in a single transaction. 
*/\n+ memory_region_transaction_begin();\n+ for (idx = start_idx; idx < end_idx; idx++) {\n+ virtio_mem_activate_memslot(vmem, idx);\n+ }\n+ memory_region_transaction_commit();\n+}\n+\n+static void virtio_mem_deactivate_unplugged_memslots(VirtIOMEM *vmem,\n+ uint64_t offset,\n+ uint64_t size)\n+{\n+ const uint64_t region_size = memory_region_size(&vmem->memdev->mr);\n+ const unsigned int start_idx = offset / vmem->memslot_size;\n+ const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) /\n+ vmem->memslot_size;\n+ unsigned int idx;\n+\n+ if (!vmem->dynamic_memslots) {\n+ return;\n+ }\n+\n+ /* Deactivate all memslots with unplugged blocks in a single transaction. */\n+ memory_region_transaction_begin();\n+ for (idx = start_idx; idx < end_idx; idx++) {\n+ const uint64_t memslot_offset = idx * vmem->memslot_size;\n+ uint64_t memslot_size = vmem->memslot_size;\n+\n+ /* The size of the last memslot might be smaller. */\n+ if (idx == vmem->nb_memslots - 1) {\n+ memslot_size = region_size - memslot_offset;\n+ }\n+\n+ /*\n+ * Partially covered memslots might still have some blocks plugged and\n+ * have to remain active if that's the case.\n+ */\n+ if (offset > memslot_offset ||\n+ offset + size < memslot_offset + memslot_size) {\n+ const uint64_t gpa = vmem->addr + memslot_offset;\n+\n+ if (!virtio_mem_is_range_unplugged(vmem, gpa, memslot_size)) {\n+ continue;\n+ }\n+ }\n+\n+ virtio_mem_deactivate_memslot(vmem, idx);\n+ }\n+ memory_region_transaction_commit();\n+}\n+\n static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,\n uint64_t size, bool plug)\n {\n@@ -500,6 +597,8 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,\n }\n virtio_mem_notify_unplug(vmem, offset, size);\n virtio_mem_set_range_unplugged(vmem, start_gpa, size);\n+ /* Deactivate completely unplugged memslots after updating the state. 
*/\n+ virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);\n return 0;\n }\n \n@@ -527,7 +626,20 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,\n }\n \n if (!ret) {\n+ /*\n+ * Activate before notifying and rollback in case of any errors.\n+ *\n+ * When activating a yet inactive memslot, memory notifiers will get\n+ * notified about the added memory region and can register with the\n+ * RamDiscardManager; this will traverse all plugged blocks and skip the\n+ * blocks we are plugging here. The following notification will inform\n+ * registered listeners about the blocks we're plugging.\n+ */\n+ virtio_mem_activate_memslots_to_plug(vmem, offset, size);\n ret = virtio_mem_notify_plug(vmem, offset, size);\n+ if (ret) {\n+ virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);\n+ }\n }\n if (ret) {\n /* Could be preallocation or a notifier populated memory. */\n@@ -620,6 +732,7 @@ static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,\n \n static int virtio_mem_unplug_all(VirtIOMEM *vmem)\n {\n+ const uint64_t region_size = memory_region_size(&vmem->memdev->mr);\n RAMBlock *rb = vmem->memdev->mr.ram_block;\n \n if (vmem->size) {\n@@ -634,6 +747,9 @@ static int virtio_mem_unplug_all(VirtIOMEM *vmem)\n bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);\n vmem->size = 0;\n notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);\n+\n+ /* Deactivate all memslots after updating the state. 
*/\n+ virtio_mem_deactivate_unplugged_memslots(vmem, 0, region_size);\n }\n \n trace_virtio_mem_unplugged_all();\n@@ -790,6 +906,43 @@ static void virtio_mem_system_reset(void *opaque)\n virtio_mem_unplug_all(vmem);\n }\n \n+static void virtio_mem_prepare_mr(VirtIOMEM *vmem)\n+{\n+ const uint64_t region_size = memory_region_size(&vmem->memdev->mr);\n+\n+ assert(!vmem->mr && vmem->dynamic_memslots);\n+ vmem->mr = g_new0(MemoryRegion, 1);\n+ memory_region_init(vmem->mr, OBJECT(vmem), \"virtio-mem\",\n+ region_size);\n+ vmem->mr->align = memory_region_get_alignment(&vmem->memdev->mr);\n+}\n+\n+static void virtio_mem_prepare_memslots(VirtIOMEM *vmem)\n+{\n+ const uint64_t region_size = memory_region_size(&vmem->memdev->mr);\n+ unsigned int idx;\n+\n+ g_assert(!vmem->memslots && vmem->nb_memslots && vmem->dynamic_memslots);\n+ vmem->memslots = g_new0(MemoryRegion, vmem->nb_memslots);\n+\n+ /* Initialize our memslots, but don't map them yet. */\n+ for (idx = 0; idx < vmem->nb_memslots; idx++) {\n+ const uint64_t memslot_offset = idx * vmem->memslot_size;\n+ uint64_t memslot_size = vmem->memslot_size;\n+ char name[20];\n+\n+ /* The size of the last memslot might be smaller. 
*/\n+ if (idx == vmem->nb_memslots - 1) {\n+ memslot_size = region_size - memslot_offset;\n+ }\n+\n+ snprintf(name, sizeof(name), \"memslot-%u\", idx);\n+ memory_region_init_alias(&vmem->memslots[idx], OBJECT(vmem), name,\n+ &vmem->memdev->mr, memslot_offset,\n+ memslot_size);\n+ }\n+}\n+\n static void virtio_mem_device_realize(DeviceState *dev, Error **errp)\n {\n MachineState *ms = MACHINE(qdev_get_machine());\n@@ -861,6 +1014,14 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)\n vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;\n #endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */\n \n+ if (vmem->dynamic_memslots &&\n+ vmem->unplugged_inaccessible != ON_OFF_AUTO_ON) {\n+ error_setg(errp, \"'%s' property set to 'on' requires '%s' to be 'on'\",\n+ VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP,\n+ VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);\n+ return;\n+ }\n+\n /*\n * If the block size wasn't configured by the user, use a sane default. This\n * allows using hugetlbfs backends of any page size without manual\n@@ -930,6 +1091,25 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)\n virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config));\n vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);\n \n+ /*\n+ * With \"dynamic-memslots=off\" (old behavior) we always map the whole\n+ * RAM memory region directly.\n+ */\n+ if (vmem->dynamic_memslots) {\n+ if (!vmem->mr) {\n+ virtio_mem_prepare_mr(vmem);\n+ }\n+ if (vmem->nb_memslots <= 1) {\n+ vmem->nb_memslots = 1;\n+ vmem->memslot_size = memory_region_size(&vmem->memdev->mr);\n+ }\n+ if (!vmem->memslots) {\n+ virtio_mem_prepare_memslots(vmem);\n+ }\n+ } else {\n+ assert(!vmem->mr && !vmem->nb_memslots && !vmem->memslots);\n+ }\n+\n host_memory_backend_set_mapped(vmem->memdev, true);\n vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));\n if (vmem->early_migration) {\n@@ -984,11 +1164,31 @@ static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)\n virtio_mem_discard_range_cb);\n 
}\n \n+static int virtio_mem_activate_memslot_range_cb(VirtIOMEM *vmem, void *arg,\n+ uint64_t offset, uint64_t size)\n+{\n+ virtio_mem_activate_memslots_to_plug(vmem, offset, size);\n+ return 0;\n+}\n+\n static int virtio_mem_post_load_bitmap(VirtIOMEM *vmem)\n {\n RamDiscardListener *rdl;\n int ret;\n \n+ /*\n+ * We restored the bitmap and updated the requested size; activate all\n+ * memslots (so listeners register) before notifying about plugged blocks.\n+ */\n+ if (vmem->dynamic_memslots) {\n+ /*\n+ * We don't expect any active memslots at this point to deactivate: no\n+ * memory was plugged on the migration destination.\n+ */\n+ virtio_mem_for_each_plugged_range(vmem, NULL,\n+ virtio_mem_activate_memslot_range_cb);\n+ }\n+\n /*\n * We started out with all memory discarded and our memory region is mapped\n * into an address space. Replay, now that we updated the bitmap.\n@@ -1251,11 +1451,79 @@ static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)\n if (!vmem->memdev) {\n error_setg(errp, \"'%s' property must be set\", VIRTIO_MEM_MEMDEV_PROP);\n return NULL;\n+ } else if (vmem->dynamic_memslots) {\n+ if (!vmem->mr) {\n+ virtio_mem_prepare_mr(vmem);\n+ }\n+ return vmem->mr;\n }\n \n return &vmem->memdev->mr;\n }\n \n+static void virtio_mem_decide_memslots(VirtIOMEM *vmem, unsigned int limit)\n+{\n+ uint64_t region_size, memslot_size, min_memslot_size;\n+ unsigned int memslots;\n+ RAMBlock *rb;\n+\n+ if (!vmem->dynamic_memslots) {\n+ return;\n+ }\n+\n+ /* We're called exactly once, before realizing the device. */\n+ assert(!vmem->nb_memslots);\n+\n+ /* If realizing the device will fail, just assume a single memslot. */\n+ if (limit <= 1 || !vmem->memdev || !vmem->memdev->mr.ram_block) {\n+ vmem->nb_memslots = 1;\n+ return;\n+ }\n+\n+ rb = vmem->memdev->mr.ram_block;\n+ region_size = memory_region_size(&vmem->memdev->mr);\n+\n+ /*\n+ * Determine the default block size now, to determine the minimum memslot\n+ * size. 
We want the minimum slot size to be at least the device block size.\n+ */\n+ if (!vmem->block_size) {\n+ vmem->block_size = virtio_mem_default_block_size(rb);\n+ }\n+ /* If realizing the device will fail, just assume a single memslot. */\n+ if (vmem->block_size < qemu_ram_pagesize(rb) ||\n+ !QEMU_IS_ALIGNED(region_size, vmem->block_size)) {\n+ vmem->nb_memslots = 1;\n+ return;\n+ }\n+\n+ /*\n+ * All memslots except the last one have a reasonable minimum size, and\n+ * and all memslot sizes are aligned to the device block size.\n+ */\n+ memslot_size = QEMU_ALIGN_UP(region_size / limit, vmem->block_size);\n+ min_memslot_size = MAX(vmem->block_size, VIRTIO_MEM_MIN_MEMSLOT_SIZE);\n+ memslot_size = MAX(memslot_size, min_memslot_size);\n+\n+ memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size;\n+ if (memslots != 1) {\n+ vmem->memslot_size = memslot_size;\n+ }\n+ vmem->nb_memslots = memslots;\n+}\n+\n+static unsigned int virtio_mem_get_memslots(VirtIOMEM *vmem)\n+{\n+ if (!vmem->dynamic_memslots) {\n+ /* Exactly one static RAM memory region. */\n+ return 1;\n+ }\n+\n+ /* We're called after instructed to make a decision. */\n+ g_assert(vmem->nb_memslots);\n+ return vmem->nb_memslots;\n+}\n+\n static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,\n Notifier *notifier)\n {\n@@ -1393,6 +1661,21 @@ static void virtio_mem_instance_init(Object *obj)\n NULL, NULL);\n }\n \n+static void virtio_mem_instance_finalize(Object *obj)\n+{\n+ VirtIOMEM *vmem = VIRTIO_MEM(obj);\n+\n+ /*\n+ * Note: the core already dropped the references on all memory regions\n+ * (it's passed as the owner to memory_region_init_*()) and finalized\n+ * these objects. 
We can simply free the memory.\n+ */\n+ g_free(vmem->memslots);\n+ vmem->memslots = NULL;\n+ g_free(vmem->mr);\n+ vmem->mr = NULL;\n+}\n+\n static Property virtio_mem_properties[] = {\n DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0),\n DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0),\n@@ -1405,6 +1688,8 @@ static Property virtio_mem_properties[] = {\n #endif\n DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,\n early_migration, true),\n+ DEFINE_PROP_BOOL(VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP, VirtIOMEM,\n+ dynamic_memslots, false),\n DEFINE_PROP_END_OF_LIST(),\n };\n \n@@ -1572,6 +1857,8 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data)\n \n vmc->fill_device_info = virtio_mem_fill_device_info;\n vmc->get_memory_region = virtio_mem_get_memory_region;\n+ vmc->decide_memslots = virtio_mem_decide_memslots;\n+ vmc->get_memslots = virtio_mem_get_memslots;\n vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;\n vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;\n vmc->unplug_request_check = virtio_mem_unplug_request_check;\n@@ -1589,6 +1876,7 @@ static const TypeInfo virtio_mem_info = {\n .parent = TYPE_VIRTIO_DEVICE,\n .instance_size = sizeof(VirtIOMEM),\n .instance_init = virtio_mem_instance_init,\n+ .instance_finalize = virtio_mem_instance_finalize,\n .class_init = virtio_mem_class_init,\n .class_size = sizeof(VirtIOMEMClass),\n .interfaces = (InterfaceInfo[]) {\ndiff --git a/include/hw/virtio/virtio-mem.h b/include/hw/virtio/virtio-mem.h\nindex ab0fe2b4f2..5f5b02b8f9 100644\n--- a/include/hw/virtio/virtio-mem.h\n+++ b/include/hw/virtio/virtio-mem.h\n@@ -33,6 +33,7 @@ OBJECT_DECLARE_TYPE(VirtIOMEM, VirtIOMEMClass,\n #define VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP \"unplugged-inaccessible\"\n #define VIRTIO_MEM_EARLY_MIGRATION_PROP \"x-early-migration\"\n #define VIRTIO_MEM_PREALLOC_PROP \"prealloc\"\n+#define VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP \"dynamic-memslots\"\n \n struct 
VirtIOMEM {\n VirtIODevice parent_obj;\n@@ -44,7 +45,28 @@ struct VirtIOMEM {\n int32_t bitmap_size;\n unsigned long *bitmap;\n \n- /* assigned memory backend and memory region */\n+ /*\n+ * With \"dynamic-memslots=on\": Device memory region in which we dynamically\n+ * map the memslots.\n+ */\n+ MemoryRegion *mr;\n+\n+ /*\n+ * With \"dynamic-memslots=on\": The individual memslots (aliases into the\n+ * memory backend).\n+ */\n+ MemoryRegion *memslots;\n+\n+ /* With \"dynamic-memslots=on\": The total number of memslots. */\n+ uint16_t nb_memslots;\n+\n+ /*\n+ * With \"dynamic-memslots=on\": Size of one memslot (the size of the\n+ * last one can differ).\n+ */\n+ uint64_t memslot_size;\n+\n+ /* Assigned memory backend with the RAM memory region. */\n HostMemoryBackend *memdev;\n \n /* NUMA node */\n@@ -82,6 +104,12 @@ struct VirtIOMEM {\n */\n bool early_migration;\n \n+ /*\n+ * Whether we dynamically map (multiple, if possible) memslots instead of\n+ * statically mapping the whole RAM memory region.\n+ */\n+ bool dynamic_memslots;\n+\n /* notifiers to notify when \"size\" changes */\n NotifierList size_change_notifiers;\n \n@@ -96,6 +124,8 @@ struct VirtIOMEMClass {\n /* public */\n void (*fill_device_info)(const VirtIOMEM *vmen, VirtioMEMDeviceInfo *vi);\n MemoryRegion *(*get_memory_region)(VirtIOMEM *vmem, Error **errp);\n+ void (*decide_memslots)(VirtIOMEM *vmem, unsigned int limit);\n+ unsigned int (*get_memslots)(VirtIOMEM *vmem);\n void (*add_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier);\n void (*remove_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier);\n void (*unplug_request_check)(VirtIOMEM *vmem, Error **errp);\n", "prefixes": [ "v4", "16/18" ] }