From patchwork Thu Oct 8 15:59:23 2009 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gleb Natapov X-Patchwork-Id: 35494 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [199.232.76.165]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (Client did not present a certificate) by ozlabs.org (Postfix) with ESMTPS id 5F5A3B70B3 for ; Fri, 9 Oct 2009 03:53:01 +1100 (EST) Received: from localhost ([127.0.0.1]:59937 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1MvwEc-0001uc-Lx for incoming@patchwork.ozlabs.org; Thu, 08 Oct 2009 12:52:58 -0400 Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43) id 1MvvP4-0004uh-4w for qemu-devel@nongnu.org; Thu, 08 Oct 2009 11:59:42 -0400 Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43) id 1MvvOx-0004jA-5E for qemu-devel@nongnu.org; Thu, 08 Oct 2009 11:59:40 -0400 Received: from [199.232.76.173] (port=39103 helo=monty-python.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1MvvOx-0004ik-07 for qemu-devel@nongnu.org; Thu, 08 Oct 2009 11:59:35 -0400 Received: from mx1.redhat.com ([209.132.183.28]:49671) by monty-python.gnu.org with esmtp (Exim 4.60) (envelope-from ) id 1MvvOw-0002LO-AB for qemu-devel@nongnu.org; Thu, 08 Oct 2009 11:59:34 -0400 Received: from int-mx03.intmail.prod.int.phx2.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.16]) by mx1.redhat.com (8.13.8/8.13.8) with ESMTP id n98FxWiA022521; Thu, 8 Oct 2009 11:59:32 -0400 Received: from dhcp-1-237.tlv.redhat.com (dhcp-1-237.tlv.redhat.com [10.35.1.237]) by int-mx03.intmail.prod.int.phx2.redhat.com (8.13.8/8.13.8) with ESMTP id n98FxVCi031398; Thu, 8 Oct 2009 11:59:31 -0400 Received: by dhcp-1-237.tlv.redhat.com (Postfix, from userid 13519) id 23E7C13380E; Thu, 8 Oct 2009 17:59:27 +0200 (IST) From: Gleb Natapov To: 
kevin@koconnor.net Date: Thu, 8 Oct 2009 17:59:23 +0200 Message-Id: <1255017566-26220-19-git-send-email-gleb@redhat.com> In-Reply-To: <1255017566-26220-1-git-send-email-gleb@redhat.com> References: <1255017566-26220-1-git-send-email-gleb@redhat.com> X-Scanned-By: MIMEDefang 2.67 on 10.5.11.16 X-detected-operating-system: by monty-python.gnu.org: Genre and OS details not recognized. Cc: qemu-devel@nongnu.org Subject: [Qemu-devel] [PATCH 18/21] Add SRAT ACPI table support. X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Take NUMA topology info from the QEMU firmware configuration interface (number of nodes, node for each (V)CPU and amount of memory) and build a SRAT table describing this topology for the guest OS. Handles more than 4 GB of RAM by including a hole for 32bit PCI memory mapping. 
Qemu pcbios commit 444f1226c11082d374b7e1361c6f5696e479642a Signed-off-by: Gleb Natapov --- src/acpi.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- src/paravirt.c | 16 ++++++ src/paravirt.h | 2 + 3 files changed, 170 insertions(+), 5 deletions(-) diff --git a/src/acpi.c b/src/acpi.c index 3e7efc8..41ad0cb 100644 --- a/src/acpi.c +++ b/src/acpi.c @@ -151,7 +151,7 @@ struct multiple_apic_table } PACKED; -/* Values for Type in APIC_HEADER_DEF */ +/* Values for Type in APIC sub-headers */ #define APIC_PROCESSOR 0 #define APIC_IO 1 @@ -167,7 +167,7 @@ struct multiple_apic_table /* * MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE) */ -#define APIC_HEADER_DEF /* Common APIC sub-structure header */\ +#define ACPI_SUB_HEADER_DEF /* Common ACPI sub-structure header */\ u8 type; \ u8 length; @@ -175,7 +175,7 @@ struct multiple_apic_table struct madt_processor_apic { - APIC_HEADER_DEF + ACPI_SUB_HEADER_DEF u8 processor_id; /* ACPI processor id */ u8 local_apic_id; /* Processor's local APIC id */ #if 0 @@ -188,7 +188,7 @@ struct madt_processor_apic struct madt_io_apic { - APIC_HEADER_DEF + ACPI_SUB_HEADER_DEF u8 io_apic_id; /* I/O APIC ID */ u8 reserved; /* Reserved - must be zero */ u32 address; /* APIC physical address */ @@ -199,7 +199,7 @@ struct madt_io_apic #define PCI_ISA_IRQ_MASK 0x0e20 struct madt_intsrcovr { - APIC_HEADER_DEF + ACPI_SUB_HEADER_DEF u8 bus; u8 source; u32 gsi; @@ -230,6 +230,43 @@ struct acpi_20_hpet { } PACKED; #define ACPI_HPET_ADDRESS 0xFED00000UL +/* + * SRAT (NUMA topology description) table + */ + +#define SRAT_PROCESSOR 0 +#define SRAT_MEMORY 1 + +struct system_resource_affinity_table +{ + ACPI_TABLE_HEADER_DEF + u32 reserved1; + u32 reserved2[2]; +} PACKED; + +struct srat_processor_affinity +{ + ACPI_SUB_HEADER_DEF + u8 proximity_lo; + u8 local_apic_id; + u32 flags; + u8 local_sapic_eid; + u8 proximity_hi[3]; + u32 reserved; +} PACKED; + +struct srat_memory_affinity +{ + ACPI_SUB_HEADER_DEF + u8 proximity[4]; + u16 
reserved1; + u32 base_addr_low,base_addr_high; + u32 length_low,length_high; + u32 reserved2; + u32 flags; + u32 reserved3[2]; +} PACKED; + #include "acpi-dsdt.hex" static inline u16 cpu_to_le16(u16 x) @@ -447,6 +484,115 @@ build_hpet(void) return hpet; } +static void +acpi_build_srat_memory(struct srat_memory_affinity *numamem, + u64 base, u64 len, int node, int enabled) +{ + numamem->type = SRAT_MEMORY; + numamem->length = sizeof(*numamem); + memset (numamem->proximity, 0 ,4); + numamem->proximity[0] = node; + numamem->flags = cpu_to_le32(!!enabled); + numamem->base_addr_low = base & 0xFFFFFFFF; + numamem->base_addr_high = base >> 32; + numamem->length_low = len & 0xFFFFFFFF; + numamem->length_high = len >> 32; +} + +#define SRAT_SIGNATURE 0x54415253 //SRAT +static void * +build_srat(void) +{ + int nb_numa_nodes = qemu_cfg_get_numa_nodes(); + + if (nb_numa_nodes == 0) + return NULL; + + u64 *numadata = malloc_tmphigh(sizeof(u64) * (CountCPUs + nb_numa_nodes)); + if (!numadata) { + dprintf(1, "Not enough memory for read numa data from VM!\n"); + return NULL; + } + + qemu_cfg_get_numa_data(numadata, CountCPUs + nb_numa_nodes); + + struct system_resource_affinity_table *srat; + int srat_size = sizeof(*srat) + + sizeof(struct srat_processor_affinity) * CountCPUs + + sizeof(struct srat_memory_affinity) * (nb_numa_nodes + 2); + + srat = malloc_high(srat_size); + if (!srat) { + dprintf(1, "Not enough memory for srat table!\n"); + return NULL; + } + + memset(srat, 0, srat_size); + srat->reserved1=1; + struct srat_processor_affinity *core = (void*)(srat + 1); + int i; + u64 curnode; + + for (i = 0; i < CountCPUs; ++i) { + core->type = SRAT_PROCESSOR; + core->length = sizeof(*core); + core->local_apic_id = i; + curnode = *numadata++; + core->proximity_lo = curnode; + memset(core->proximity_hi, 0, 3); + core->local_sapic_eid = 0; + if (i < CountCPUs) + core->flags = cpu_to_le32(1); + else + core->flags = 0; + core++; + } + + + /* the memory map is a bit tricky, it contains 
at least one hole + * from 640k-1M and possibly another one from 3.5G-4G. + */ + struct srat_memory_affinity *numamem = (void*)core; + int slots = 0; + u64 mem_len, mem_base, next_base = 0; + + acpi_build_srat_memory(numamem, 0, 640*1024, 0, 1); + next_base = 1024 * 1024; + numamem++; + slots++; + for (i = 1; i < nb_numa_nodes + 1; ++i) { + mem_base = next_base; + mem_len = *numadata++; + if (i == 1) + mem_len -= 1024 * 1024; + next_base = mem_base + mem_len; + + /* Cut out the PCI hole */ + if (mem_base <= RamSize && next_base > RamSize) { + mem_len -= next_base - RamSize; + if (mem_len > 0) { + acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1); + numamem++; + slots++; + } + mem_base = 1ULL << 32; + mem_len = next_base - RamSize; + next_base += (1ULL << 32) - RamSize; + } + acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1); + numamem++; + slots++; + } + for (; slots < nb_numa_nodes + 2; slots++) { + acpi_build_srat_memory(numamem, 0, 0, 0, 0); + numamem++; + } + + build_header((void*)srat, SRAT_SIGNATURE, srat_size, 1); + + return srat; +} + struct rsdp_descriptor *RsdpAddr; #define MAX_ACPI_TABLES 20 @@ -486,6 +632,7 @@ acpi_bios_init(void) ACPI_INIT_TABLE(build_ssdt()); ACPI_INIT_TABLE(build_madt()); ACPI_INIT_TABLE(build_hpet()); + ACPI_INIT_TABLE(build_srat()); u16 i, external_tables = qemu_cfg_acpi_additional_tables(); diff --git a/src/paravirt.c b/src/paravirt.c index 8c08ce7..8fbeb9c 100644 --- a/src/paravirt.c +++ b/src/paravirt.c @@ -265,3 +265,19 @@ int qemu_cfg_smbios_load_external(int type, char **p, unsigned *nr_structs, return 0; } +int qemu_cfg_get_numa_nodes(void) +{ + u64 cnt; + + qemu_cfg_read_entry(&cnt, QEMU_CFG_NUMA, sizeof(cnt)); + + return (int)cnt; +} + +void qemu_cfg_get_numa_data(u64 *data, int n) +{ + int i; + + for (i = 0; i < n; i++) + qemu_cfg_read((u8*)(data + i), sizeof(u64)); +} diff --git a/src/paravirt.h b/src/paravirt.h index 2b2f314..04a6907 100644 --- a/src/paravirt.h +++ b/src/paravirt.h @@ -49,5 +49,7 @@ 
u16 qemu_cfg_smbios_entries(void); size_t qemu_cfg_smbios_load_field(int type, size_t offset, void *addr); int qemu_cfg_smbios_load_external(int type, char **p, unsigned *nr_structs, unsigned *max_struct_size, char *end); +int qemu_cfg_get_numa_nodes(void); +void qemu_cfg_get_numa_data(u64 *data, int n); #endif