diff mbox

[v3,23/32] nvdimm: build ACPI NFIT table

Message ID 1444535584-18220-24-git-send-email-guangrong.xiao@linux.intel.com
State New
Headers show

Commit Message

Xiao Guangrong Oct. 11, 2015, 3:52 a.m. UTC
NFIT is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)

Currently, we only support PMEM mode. Each device has 3 structures:
- SPA structure, defines the PMEM region info

- MEM DEV structure, it has the @handle which is used to associate specified
  ACPI NVDIMM  device we will introduce in later patch.
  Also we can happily ignored the memory device's interleave, the real
  nvdimm hardware access is hidden behind host

- DCR structure, it defines vendor ID used to associate specified vendor
  nvdimm driver. Since we only implement PMEM mode this time, Command
  window and Data window are not needed

Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
---
 hw/i386/acpi-build.c     |   4 +
 hw/mem/nvdimm/acpi.c     | 209 ++++++++++++++++++++++++++++++++++++++++++++++-
 hw/mem/nvdimm/internal.h |  13 +++
 hw/mem/nvdimm/nvdimm.c   |  25 ++++++
 include/hw/mem/nvdimm.h  |   2 +
 5 files changed, 252 insertions(+), 1 deletion(-)

Comments

Michael S. Tsirkin Oct. 12, 2015, 11:27 a.m. UTC | #1
On Sun, Oct 11, 2015 at 11:52:55AM +0800, Xiao Guangrong wrote:
> NFIT is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)
> 
> Currently, we only support PMEM mode. Each device has 3 structures:
> - SPA structure, defines the PMEM region info
> 
> - MEM DEV structure, it has the @handle which is used to associate specified
>   ACPI NVDIMM  device we will introduce in later patch.
>   Also we can happily ignored the memory device's interleave, the real
>   nvdimm hardware access is hidden behind host
> 
> - DCR structure, it defines vendor ID used to associate specified vendor
>   nvdimm driver. Since we only implement PMEM mode this time, Command
>   window and Data window are not needed
> 
> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> ---
>  hw/i386/acpi-build.c     |   4 +
>  hw/mem/nvdimm/acpi.c     | 209 ++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/mem/nvdimm/internal.h |  13 +++
>  hw/mem/nvdimm/nvdimm.c   |  25 ++++++
>  include/hw/mem/nvdimm.h  |   2 +
>  5 files changed, 252 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 95e0c65..c637dc8 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -1661,6 +1661,7 @@ static bool acpi_has_iommu(void)
>  static
>  void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>  {
> +    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());

I don't like more code poking at machine directly.
I know srat does it, and I don't like it. Any chance you can add
acpi_get_nvdumm_info to get all you need from nvdimm state?

>      GArray *table_offsets;
>      unsigned facs, ssdt, dsdt, rsdt;
>      AcpiCpuInfo cpu;
> @@ -1742,6 +1743,9 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>          build_dmar_q35(tables_blob, tables->linker);
>      }
>  
> +    nvdimm_build_acpi_table(&pcms->nvdimm_memory, table_offsets, tables_blob,
> +                            tables->linker);
> +
>      /* Add tables supplied by user (if any) */
>      for (u = acpi_table_first(); u; u = acpi_table_next(u)) {
>          unsigned len = acpi_table_len(u);
> diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
> index b640874..62b1e02 100644
> --- a/hw/mem/nvdimm/acpi.c
> +++ b/hw/mem/nvdimm/acpi.c
> @@ -32,6 +32,46 @@
>  #include "hw/mem/nvdimm.h"
>  #include "internal.h"
>  
> +static void nfit_spa_uuid_pm(uuid_le *uuid)
> +{
> +    uuid_le uuid_pm = UUID_LE(0x66f0d379, 0xb4f3, 0x4074, 0xac, 0x43, 0x0d,
> +                              0x33, 0x18, 0xb7, 0x8c, 0xdb);
> +    memcpy(uuid, &uuid_pm, sizeof(uuid_pm));
> +}
> +

Just add a static constant:
    const uint8_t nfit_spa_uuid[] = {0x79, 0xd3, ..... }
then memcpy instead of a wrapper.

> +enum {
> +    NFIT_STRUCTURE_SPA = 0,
> +    NFIT_STRUCTURE_MEMDEV = 1,
> +    NFIT_STRUCTURE_IDT = 2,
> +    NFIT_STRUCTURE_SMBIOS = 3,
> +    NFIT_STRUCTURE_DCR = 4,
> +    NFIT_STRUCTURE_BDW = 5,
> +    NFIT_STRUCTURE_FLUSH = 6,
> +};
> +
> +enum {
> +    EFI_MEMORY_UC = 0x1ULL,
> +    EFI_MEMORY_WC = 0x2ULL,
> +    EFI_MEMORY_WT = 0x4ULL,
> +    EFI_MEMORY_WB = 0x8ULL,
> +    EFI_MEMORY_UCE = 0x10ULL,
> +    EFI_MEMORY_WP = 0x1000ULL,
> +    EFI_MEMORY_RP = 0x2000ULL,
> +    EFI_MEMORY_XP = 0x4000ULL,
> +    EFI_MEMORY_NV = 0x8000ULL,
> +    EFI_MEMORY_MORE_RELIABLE = 0x10000ULL,
> +};
> +
> +/*
> + * NVDIMM Firmware Interface Table
> + * @signature: "NFIT"
> + */
> +struct nfit {
> +    ACPI_TABLE_HEADER_DEF
> +    uint32_t reserved;
> +} QEMU_PACKED;
> +typedef struct nfit nfit;
> +
>  /* System Physical Address Range Structure */
>  struct nfit_spa {
>      uint16_t type;
> @@ -40,13 +80,21 @@ struct nfit_spa {
>      uint16_t flags;
>      uint32_t reserved;
>      uint32_t proximity_domain;
> -    uint8_t type_guid[16];
> +    uuid_le type_guid;
>      uint64_t spa_base;
>      uint64_t spa_length;
>      uint64_t mem_attr;
>  } QEMU_PACKED;
>  typedef struct nfit_spa nfit_spa;
>  
> +/*
> + * Control region is strictly for management during hot add/online
> + * operation.
> + */
> +#define SPA_FLAGS_ADD_ONLINE_ONLY     (1)

unused

> +/* Data in Proximity Domain field is valid. */
> +#define SPA_FLAGS_PROXIMITY_VALID     (1 << 1)
> +
>  /* Memory Device to System Physical Address Range Mapping Structure */
>  struct nfit_memdev {
>      uint16_t type;
> @@ -91,12 +139,20 @@ struct nfit_dcr {
>  } QEMU_PACKED;
>  typedef struct nfit_dcr nfit_dcr;
>  
> +#define REVSISON_ID    1
> +#define NFIT_FIC1      0x201
> +
>  static uint64_t nvdimm_device_structure_size(uint64_t slots)
>  {
>      /* each nvdimm has three structures. */
>      return slots * (sizeof(nfit_spa) + sizeof(nfit_memdev) + sizeof(nfit_dcr));
>  }
>  
> +static uint64_t get_nfit_total_size(uint64_t slots)
> +{
> +    return sizeof(struct nfit) + nvdimm_device_structure_size(slots);
> +}
> +
>  static uint64_t nvdimm_acpi_memory_size(uint64_t slots, uint64_t page_size)
>  {
>      uint64_t size = nvdimm_device_structure_size(slots);
> @@ -118,3 +174,154 @@ void nvdimm_init_memory_state(NVDIMMState *state, MemoryRegion*system_memory,
>                         NVDIMM_ACPI_MEM_SIZE);
>      memory_region_add_subregion(system_memory, state->base, &state->mr);
>  }
> +
> +static uint32_t nvdimm_slot_to_sn(int slot)
> +{
> +    return 0x123456 + slot;
> +}
> +
> +static uint32_t nvdimm_slot_to_handle(int slot)
> +{
> +    return slot + 1;
> +}
> +
> +static uint16_t nvdimm_slot_to_spa_index(int slot)
> +{
> +    return (slot + 1) << 1;
> +}
> +
> +static uint32_t nvdimm_slot_to_dcr_index(int slot)
> +{
> +    return nvdimm_slot_to_spa_index(slot) + 1;
> +}
> +

There are lots of magic numbers here with no comments.
Pls explain the logic in code comments.

> +static int build_structure_spa(void *buf, NVDIMMDevice *nvdimm)

Pls document the specific chapter that this implements.

same everywhere else.
> +{
> +    nfit_spa *nfit_spa;
> +    uint64_t addr = object_property_get_int(OBJECT(nvdimm), DIMM_ADDR_PROP,
> +                                            NULL);
> +    uint64_t size = object_property_get_int(OBJECT(nvdimm), DIMM_SIZE_PROP,
> +                                            NULL);
> +    uint32_t node = object_property_get_int(OBJECT(nvdimm), DIMM_NODE_PROP,
> +                                            NULL);
> +    int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
> +                                            NULL);
> +
> +    nfit_spa = buf;
> +
> +    nfit_spa->type = cpu_to_le16(NFIT_STRUCTURE_SPA);

Don't do these 1-time enums. They are hard to match against spec.

       nfit_spa->type = cpu_to_le16(0 /* System Physical Address Range Structure */);

same everywhere else.

> +    nfit_spa->length = cpu_to_le16(sizeof(*nfit_spa));
> +    nfit_spa->spa_index = cpu_to_le16(nvdimm_slot_to_spa_index(slot));
> +    nfit_spa->flags = cpu_to_le16(SPA_FLAGS_PROXIMITY_VALID);
> +    nfit_spa->proximity_domain = cpu_to_le32(node);
> +    nfit_spa_uuid_pm(&nfit_spa->type_guid);
> +    nfit_spa->spa_base = cpu_to_le64(addr);
> +    nfit_spa->spa_length = cpu_to_le64(size);
> +    nfit_spa->mem_attr = cpu_to_le64(EFI_MEMORY_WB | EFI_MEMORY_NV);
> +
> +    return sizeof(*nfit_spa);
> +}
> +
> +static int build_structure_memdev(void *buf, NVDIMMDevice *nvdimm)
> +{
> +    nfit_memdev *nfit_memdev;
> +    uint64_t addr = object_property_get_int(OBJECT(nvdimm), DIMM_ADDR_PROP,
> +                                            NULL);
> +    uint64_t size = object_property_get_int(OBJECT(nvdimm), DIMM_SIZE_PROP,
> +                                            NULL);
> +    int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
> +                                            NULL);
> +    uint32_t handle = nvdimm_slot_to_handle(slot);
> +
> +    nfit_memdev = buf;
> +    nfit_memdev->type = cpu_to_le16(NFIT_STRUCTURE_MEMDEV);
> +    nfit_memdev->length = cpu_to_le16(sizeof(*nfit_memdev));
> +    nfit_memdev->nfit_handle = cpu_to_le32(handle);
> +    /* point to nfit_spa. */
> +    nfit_memdev->spa_index = cpu_to_le16(nvdimm_slot_to_spa_index(slot));
> +    /* point to nfit_dcr. */
> +    nfit_memdev->dcr_index = cpu_to_le16(nvdimm_slot_to_dcr_index(slot));
> +    nfit_memdev->region_len = cpu_to_le64(size);
> +    nfit_memdev->region_dpa = cpu_to_le64(addr);
> +    /* Only one interleave for pmem. */
> +    nfit_memdev->interleave_ways = cpu_to_le16(1);
> +
> +    return sizeof(*nfit_memdev);
> +}
> +
> +static int build_structure_dcr(void *buf, NVDIMMDevice *nvdimm)
> +{
> +    nfit_dcr *nfit_dcr;
> +    int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
> +                                       NULL);
> +    uint32_t sn = nvdimm_slot_to_sn(slot);
> +
> +    nfit_dcr = buf;
> +    nfit_dcr->type = cpu_to_le16(NFIT_STRUCTURE_DCR);
> +    nfit_dcr->length = cpu_to_le16(sizeof(*nfit_dcr));
> +    nfit_dcr->dcr_index = cpu_to_le16(nvdimm_slot_to_dcr_index(slot));
> +    nfit_dcr->vendor_id = cpu_to_le16(0x8086);
> +    nfit_dcr->device_id = cpu_to_le16(1);
> +    nfit_dcr->revision_id = cpu_to_le16(REVSISON_ID);
> +    nfit_dcr->serial_number = cpu_to_le32(sn);
> +    nfit_dcr->fic = cpu_to_le16(NFIT_FIC1);
> +
> +    return sizeof(*nfit_dcr);
> +}
> +
> +static void build_device_structure(GSList *device_list, char *buf)
> +{
> +    buf += sizeof(nfit);
> +
> +    for (; device_list; device_list = device_list->next) {
> +        NVDIMMDevice *nvdimm = device_list->data;
> +
> +        /* build System Physical Address Range Description Table. */
> +        buf += build_structure_spa(buf, nvdimm);
> +
> +        /*
> +         * build Memory Device to System Physical Address Range Mapping
> +         * Table.
> +         */
> +        buf += build_structure_memdev(buf, nvdimm);
> +
> +        /* build Control Region Descriptor Table. */
> +        buf += build_structure_dcr(buf, nvdimm);
> +    }
> +}
> +
> +static void build_nfit(GSList *device_list, GArray *table_offsets,
> +                       GArray *table_data, GArray *linker)
> +{
> +    size_t total;
> +    char *buf;
> +    int nfit_start, nr;
> +
> +    nr = g_slist_length(device_list);
> +    total = get_nfit_total_size(nr);
> +
> +    nfit_start = table_data->len;
> +    acpi_add_table(table_offsets, table_data);
> +
> +    buf = acpi_data_push(table_data, total);
> +    build_device_structure(device_list, buf);

This seems fragile. Should build_device_structure overflow
a buffer we'll corrupt memory.
Current code does use acpi_data_push but only for trivial
things like fixed size headers.
Can't you use glib to dynamically append things to table
as they are generated?


> +
> +    build_header(linker, table_data, (void *)(table_data->data + nfit_start),
> +                 "NFIT", table_data->len - nfit_start, 1);
> +}
> +
> +void nvdimm_build_acpi_table(NVDIMMState *state, GArray *table_offsets,
> +                             GArray *table_data, GArray *linker)
> +{
> +    GSList *device_list = nvdimm_get_built_list();
> +
> +    if (!memory_region_size(&state->mr)) {
> +        assert(!device_list);
> +        return;
> +    }
> +
> +    if (device_list) {
> +        build_nfit(device_list, table_offsets, table_data, linker);
> +        g_slist_free(device_list);
> +    }
> +}
> diff --git a/hw/mem/nvdimm/internal.h b/hw/mem/nvdimm/internal.h
> index c4ba750..5551448 100644
> --- a/hw/mem/nvdimm/internal.h
> +++ b/hw/mem/nvdimm/internal.h
> @@ -14,4 +14,17 @@
>  #define NVDIMM_INTERNAL_H
>  
>  #define MIN_NAMESPACE_LABEL_SIZE    (128UL << 10)
> +
> +struct uuid_le {
> +    uint8_t b[16];
> +};
> +typedef struct uuid_le uuid_le;
> +
> +#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)                   \
> +((uuid_le)                                                                 \
> +{ { (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
> +    (b) & 0xff, ((b) >> 8) & 0xff, (c) & 0xff, ((c) >> 8) & 0xff,          \
> +    (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) } })
> +

Please avoid polluting the global namespace.
Prefix everything with NVDIMM.

> +GSList *nvdimm_get_built_list(void);

You are adding an extern function with no comment
about it's purpose anywhere. Pls fix this.
The name isn't pretty. What does "built" mean?
List of what? Is this a device list?

>  #endif

This header is too small to be worth it.
nvdimm_get_built_list seems to be the only interface -
just stick it in the header you have under include.


> diff --git a/hw/mem/nvdimm/nvdimm.c b/hw/mem/nvdimm/nvdimm.c
> index 0850e82..bc8c577 100644
> --- a/hw/mem/nvdimm/nvdimm.c
> +++ b/hw/mem/nvdimm/nvdimm.c
> @@ -26,6 +26,31 @@
>  #include "hw/mem/nvdimm.h"
>  #include "internal.h"
>  
> +static int nvdimm_built_list(Object *obj, void *opaque)
> +{
> +    GSList **list = opaque;
> +
> +    if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
> +        DeviceState *dev = DEVICE(obj);
> +
> +        /* only realized NVDIMMs matter */
> +        if (dev->realized) {
> +            *list = g_slist_append(*list, dev);
> +        }
> +    }
> +
> +    object_child_foreach(obj, nvdimm_built_list, opaque);
> +    return 0;
> +}
> +
> +GSList *nvdimm_get_built_list(void)
> +{
> +    GSList *list = NULL;
> +
> +    object_child_foreach(qdev_get_machine(), nvdimm_built_list, &list);
> +    return list;
> +}
> +
>  static MemoryRegion *nvdimm_get_memory_region(DIMMDevice *dimm)
>  {
>      NVDIMMDevice *nvdimm = NVDIMM(dimm);
> diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
> index aa95961..0a6bda4 100644
> --- a/include/hw/mem/nvdimm.h
> +++ b/include/hw/mem/nvdimm.h
> @@ -49,4 +49,6 @@ typedef struct NVDIMMState NVDIMMState;
>  
>  void nvdimm_init_memory_state(NVDIMMState *state, MemoryRegion*system_memory,
>                                MachineState *machine , uint64_t page_size);
> +void nvdimm_build_acpi_table(NVDIMMState *state, GArray *table_offsets,
> +                             GArray *table_data, GArray *linker);
>  #endif
> -- 
> 1.8.3.1
Dan Williams Oct. 12, 2015, 4:40 p.m. UTC | #2
On Sat, Oct 10, 2015 at 8:52 PM, Xiao Guangrong
<guangrong.xiao@linux.intel.com> wrote:
> NFIT is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)
>
> Currently, we only support PMEM mode. Each device has 3 structures:
> - SPA structure, defines the PMEM region info
>
> - MEM DEV structure, it has the @handle which is used to associate specified
>   ACPI NVDIMM  device we will introduce in later patch.
>   Also we can happily ignored the memory device's interleave, the real
>   nvdimm hardware access is hidden behind host
>
> - DCR structure, it defines vendor ID used to associate specified vendor
>   nvdimm driver. Since we only implement PMEM mode this time, Command
>   window and Data window are not needed
>
> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> ---
>  hw/i386/acpi-build.c     |   4 +
>  hw/mem/nvdimm/acpi.c     | 209 ++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/mem/nvdimm/internal.h |  13 +++
>  hw/mem/nvdimm/nvdimm.c   |  25 ++++++
>  include/hw/mem/nvdimm.h  |   2 +
>  5 files changed, 252 insertions(+), 1 deletion(-)
>
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 95e0c65..c637dc8 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -1661,6 +1661,7 @@ static bool acpi_has_iommu(void)
>  static
>  void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>  {
> +    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
>      GArray *table_offsets;
>      unsigned facs, ssdt, dsdt, rsdt;
>      AcpiCpuInfo cpu;
> @@ -1742,6 +1743,9 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>          build_dmar_q35(tables_blob, tables->linker);
>      }
>
> +    nvdimm_build_acpi_table(&pcms->nvdimm_memory, table_offsets, tables_blob,
> +                            tables->linker);
> +
>      /* Add tables supplied by user (if any) */
>      for (u = acpi_table_first(); u; u = acpi_table_next(u)) {
>          unsigned len = acpi_table_len(u);
> diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
> index b640874..62b1e02 100644
> --- a/hw/mem/nvdimm/acpi.c
> +++ b/hw/mem/nvdimm/acpi.c
> @@ -32,6 +32,46 @@
>  #include "hw/mem/nvdimm.h"
>  #include "internal.h"
>
> +static void nfit_spa_uuid_pm(uuid_le *uuid)
> +{
> +    uuid_le uuid_pm = UUID_LE(0x66f0d379, 0xb4f3, 0x4074, 0xac, 0x43, 0x0d,
> +                              0x33, 0x18, 0xb7, 0x8c, 0xdb);
> +    memcpy(uuid, &uuid_pm, sizeof(uuid_pm));
> +}
> +
> +enum {
> +    NFIT_STRUCTURE_SPA = 0,
> +    NFIT_STRUCTURE_MEMDEV = 1,
> +    NFIT_STRUCTURE_IDT = 2,
> +    NFIT_STRUCTURE_SMBIOS = 3,
> +    NFIT_STRUCTURE_DCR = 4,
> +    NFIT_STRUCTURE_BDW = 5,
> +    NFIT_STRUCTURE_FLUSH = 6,
> +};
> +
> +enum {
> +    EFI_MEMORY_UC = 0x1ULL,
> +    EFI_MEMORY_WC = 0x2ULL,
> +    EFI_MEMORY_WT = 0x4ULL,
> +    EFI_MEMORY_WB = 0x8ULL,
> +    EFI_MEMORY_UCE = 0x10ULL,
> +    EFI_MEMORY_WP = 0x1000ULL,
> +    EFI_MEMORY_RP = 0x2000ULL,
> +    EFI_MEMORY_XP = 0x4000ULL,
> +    EFI_MEMORY_NV = 0x8000ULL,
> +    EFI_MEMORY_MORE_RELIABLE = 0x10000ULL,
> +};

Would it worth including / copying the ACPICA header files directly?

https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/acpi/actbl1.h
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/acpi/acuuid.h
Xiao Guangrong Oct. 13, 2015, 5:13 a.m. UTC | #3
On 10/12/2015 07:27 PM, Michael S. Tsirkin wrote:
> On Sun, Oct 11, 2015 at 11:52:55AM +0800, Xiao Guangrong wrote:
>> NFIT is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)
>>
>> Currently, we only support PMEM mode. Each device has 3 structures:
>> - SPA structure, defines the PMEM region info
>>
>> - MEM DEV structure, it has the @handle which is used to associate specified
>>    ACPI NVDIMM  device we will introduce in later patch.
>>    Also we can happily ignored the memory device's interleave, the real
>>    nvdimm hardware access is hidden behind host
>>
>> - DCR structure, it defines vendor ID used to associate specified vendor
>>    nvdimm driver. Since we only implement PMEM mode this time, Command
>>    window and Data window are not needed
>>
>> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
>> ---
>>   hw/i386/acpi-build.c     |   4 +
>>   hw/mem/nvdimm/acpi.c     | 209 ++++++++++++++++++++++++++++++++++++++++++++++-
>>   hw/mem/nvdimm/internal.h |  13 +++
>>   hw/mem/nvdimm/nvdimm.c   |  25 ++++++
>>   include/hw/mem/nvdimm.h  |   2 +
>>   5 files changed, 252 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
>> index 95e0c65..c637dc8 100644
>> --- a/hw/i386/acpi-build.c
>> +++ b/hw/i386/acpi-build.c
>> @@ -1661,6 +1661,7 @@ static bool acpi_has_iommu(void)
>>   static
>>   void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>>   {
>> +    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
>
> I don't like more code poking at machine directly.
> I know srat does it, and I don't like it. Any chance you can add
> acpi_get_nvdumm_info to get all you need from nvdimm state?

Do you mean introduce a wrapper to do this,like:
struct nvdimm_state *acpi_get_nvdumm_info(void)
{
	return &PC_MACHINE(qdev_get_machine())->nvdimm_memory;
}

Or should we maintain nvdimm state in other place (for example, a global
value in nvdimm.c)?

>
>>       GArray *table_offsets;
>>       unsigned facs, ssdt, dsdt, rsdt;
>>       AcpiCpuInfo cpu;
>> @@ -1742,6 +1743,9 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>>           build_dmar_q35(tables_blob, tables->linker);
>>       }
>>
>> +    nvdimm_build_acpi_table(&pcms->nvdimm_memory, table_offsets, tables_blob,
>> +                            tables->linker);
>> +
>>       /* Add tables supplied by user (if any) */
>>       for (u = acpi_table_first(); u; u = acpi_table_next(u)) {
>>           unsigned len = acpi_table_len(u);
>> diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
>> index b640874..62b1e02 100644
>> --- a/hw/mem/nvdimm/acpi.c
>> +++ b/hw/mem/nvdimm/acpi.c
>> @@ -32,6 +32,46 @@
>>   #include "hw/mem/nvdimm.h"
>>   #include "internal.h"
>>
>> +static void nfit_spa_uuid_pm(uuid_le *uuid)
>> +{
>> +    uuid_le uuid_pm = UUID_LE(0x66f0d379, 0xb4f3, 0x4074, 0xac, 0x43, 0x0d,
>> +                              0x33, 0x18, 0xb7, 0x8c, 0xdb);
>> +    memcpy(uuid, &uuid_pm, sizeof(uuid_pm));
>> +}
>> +
>
> Just add a static constant:
>      const uint8_t nfit_spa_uuid[] = {0x79, 0xd3, ..... }
> then memcpy instead of a wrapper.

Okay, good to me.

>
>> +enum {
>> +    NFIT_STRUCTURE_SPA = 0,
>> +    NFIT_STRUCTURE_MEMDEV = 1,
>> +    NFIT_STRUCTURE_IDT = 2,
>> +    NFIT_STRUCTURE_SMBIOS = 3,
>> +    NFIT_STRUCTURE_DCR = 4,
>> +    NFIT_STRUCTURE_BDW = 5,
>> +    NFIT_STRUCTURE_FLUSH = 6,
>> +};
>> +
>> +enum {
>> +    EFI_MEMORY_UC = 0x1ULL,
>> +    EFI_MEMORY_WC = 0x2ULL,
>> +    EFI_MEMORY_WT = 0x4ULL,
>> +    EFI_MEMORY_WB = 0x8ULL,
>> +    EFI_MEMORY_UCE = 0x10ULL,
>> +    EFI_MEMORY_WP = 0x1000ULL,
>> +    EFI_MEMORY_RP = 0x2000ULL,
>> +    EFI_MEMORY_XP = 0x4000ULL,
>> +    EFI_MEMORY_NV = 0x8000ULL,
>> +    EFI_MEMORY_MORE_RELIABLE = 0x10000ULL,
>> +};
>> +
>> +/*
>> + * NVDIMM Firmware Interface Table
>> + * @signature: "NFIT"
>> + */
>> +struct nfit {
>> +    ACPI_TABLE_HEADER_DEF
>> +    uint32_t reserved;
>> +} QEMU_PACKED;
>> +typedef struct nfit nfit;
>> +
>>   /* System Physical Address Range Structure */
>>   struct nfit_spa {
>>       uint16_t type;
>> @@ -40,13 +80,21 @@ struct nfit_spa {
>>       uint16_t flags;
>>       uint32_t reserved;
>>       uint32_t proximity_domain;
>> -    uint8_t type_guid[16];
>> +    uuid_le type_guid;
>>       uint64_t spa_base;
>>       uint64_t spa_length;
>>       uint64_t mem_attr;
>>   } QEMU_PACKED;
>>   typedef struct nfit_spa nfit_spa;
>>
>> +/*
>> + * Control region is strictly for management during hot add/online
>> + * operation.
>> + */
>> +#define SPA_FLAGS_ADD_ONLINE_ONLY     (1)
>
> unused

Indeed, currently vNVDIMM did not use this flag, it just introduces
the definition following the spec.

I do not see the hurt of these macros, it is really unacceptable?
Or just the programming style in QEMU?

>
>> +/* Data in Proximity Domain field is valid. */
>> +#define SPA_FLAGS_PROXIMITY_VALID     (1 << 1)
>> +
>>   /* Memory Device to System Physical Address Range Mapping Structure */
>>   struct nfit_memdev {
>>       uint16_t type;
>> @@ -91,12 +139,20 @@ struct nfit_dcr {
>>   } QEMU_PACKED;
>>   typedef struct nfit_dcr nfit_dcr;
>>
>> +#define REVSISON_ID    1
>> +#define NFIT_FIC1      0x201
>> +
>>   static uint64_t nvdimm_device_structure_size(uint64_t slots)
>>   {
>>       /* each nvdimm has three structures. */
>>       return slots * (sizeof(nfit_spa) + sizeof(nfit_memdev) + sizeof(nfit_dcr));
>>   }
>>
>> +static uint64_t get_nfit_total_size(uint64_t slots)
>> +{
>> +    return sizeof(struct nfit) + nvdimm_device_structure_size(slots);
>> +}
>> +
>>   static uint64_t nvdimm_acpi_memory_size(uint64_t slots, uint64_t page_size)
>>   {
>>       uint64_t size = nvdimm_device_structure_size(slots);
>> @@ -118,3 +174,154 @@ void nvdimm_init_memory_state(NVDIMMState *state, MemoryRegion*system_memory,
>>                          NVDIMM_ACPI_MEM_SIZE);
>>       memory_region_add_subregion(system_memory, state->base, &state->mr);
>>   }
>> +
>> +static uint32_t nvdimm_slot_to_sn(int slot)
>> +{
>> +    return 0x123456 + slot;
>> +}
>> +
>> +static uint32_t nvdimm_slot_to_handle(int slot)
>> +{
>> +    return slot + 1;
>> +}
>> +
>> +static uint16_t nvdimm_slot_to_spa_index(int slot)
>> +{
>> +    return (slot + 1) << 1;
>> +}
>> +
>> +static uint32_t nvdimm_slot_to_dcr_index(int slot)
>> +{
>> +    return nvdimm_slot_to_spa_index(slot) + 1;
>> +}
>> +
>
> There are lots of magic numbers here with no comments.
> Pls explain the logic in code comments.

Okay, will comment these carefully in the next version.

>
>> +static int build_structure_spa(void *buf, NVDIMMDevice *nvdimm)
>
> Pls document the specific chapter that this implements.
>
> same everywhere else.

Good style indeed, will do.

>> +{
>> +    nfit_spa *nfit_spa;
>> +    uint64_t addr = object_property_get_int(OBJECT(nvdimm), DIMM_ADDR_PROP,
>> +                                            NULL);
>> +    uint64_t size = object_property_get_int(OBJECT(nvdimm), DIMM_SIZE_PROP,
>> +                                            NULL);
>> +    uint32_t node = object_property_get_int(OBJECT(nvdimm), DIMM_NODE_PROP,
>> +                                            NULL);
>> +    int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
>> +                                            NULL);
>> +
>> +    nfit_spa = buf;
>> +
>> +    nfit_spa->type = cpu_to_le16(NFIT_STRUCTURE_SPA);
>
> Don't do these 1-time enums. They are hard to match against spec.
>
>         nfit_spa->type = cpu_to_le16(0 /* System Physical Address Range Structure */);
>
> same everywhere else.

Yeah, learn it.

>
>> +    nfit_spa->length = cpu_to_le16(sizeof(*nfit_spa));
>> +    nfit_spa->spa_index = cpu_to_le16(nvdimm_slot_to_spa_index(slot));
>> +    nfit_spa->flags = cpu_to_le16(SPA_FLAGS_PROXIMITY_VALID);
>> +    nfit_spa->proximity_domain = cpu_to_le32(node);
>> +    nfit_spa_uuid_pm(&nfit_spa->type_guid);
>> +    nfit_spa->spa_base = cpu_to_le64(addr);
>> +    nfit_spa->spa_length = cpu_to_le64(size);
>> +    nfit_spa->mem_attr = cpu_to_le64(EFI_MEMORY_WB | EFI_MEMORY_NV);
>> +
>> +    return sizeof(*nfit_spa);
>> +}
>> +
>> +static int build_structure_memdev(void *buf, NVDIMMDevice *nvdimm)
>> +{
>> +    nfit_memdev *nfit_memdev;
>> +    uint64_t addr = object_property_get_int(OBJECT(nvdimm), DIMM_ADDR_PROP,
>> +                                            NULL);
>> +    uint64_t size = object_property_get_int(OBJECT(nvdimm), DIMM_SIZE_PROP,
>> +                                            NULL);
>> +    int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
>> +                                            NULL);
>> +    uint32_t handle = nvdimm_slot_to_handle(slot);
>> +
>> +    nfit_memdev = buf;
>> +    nfit_memdev->type = cpu_to_le16(NFIT_STRUCTURE_MEMDEV);
>> +    nfit_memdev->length = cpu_to_le16(sizeof(*nfit_memdev));
>> +    nfit_memdev->nfit_handle = cpu_to_le32(handle);
>> +    /* point to nfit_spa. */
>> +    nfit_memdev->spa_index = cpu_to_le16(nvdimm_slot_to_spa_index(slot));
>> +    /* point to nfit_dcr. */
>> +    nfit_memdev->dcr_index = cpu_to_le16(nvdimm_slot_to_dcr_index(slot));
>> +    nfit_memdev->region_len = cpu_to_le64(size);
>> +    nfit_memdev->region_dpa = cpu_to_le64(addr);
>> +    /* Only one interleave for pmem. */
>> +    nfit_memdev->interleave_ways = cpu_to_le16(1);
>> +
>> +    return sizeof(*nfit_memdev);
>> +}
>> +
>> +static int build_structure_dcr(void *buf, NVDIMMDevice *nvdimm)
>> +{
>> +    nfit_dcr *nfit_dcr;
>> +    int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
>> +                                       NULL);
>> +    uint32_t sn = nvdimm_slot_to_sn(slot);
>> +
>> +    nfit_dcr = buf;
>> +    nfit_dcr->type = cpu_to_le16(NFIT_STRUCTURE_DCR);
>> +    nfit_dcr->length = cpu_to_le16(sizeof(*nfit_dcr));
>> +    nfit_dcr->dcr_index = cpu_to_le16(nvdimm_slot_to_dcr_index(slot));
>> +    nfit_dcr->vendor_id = cpu_to_le16(0x8086);
>> +    nfit_dcr->device_id = cpu_to_le16(1);
>> +    nfit_dcr->revision_id = cpu_to_le16(REVSISON_ID);
>> +    nfit_dcr->serial_number = cpu_to_le32(sn);
>> +    nfit_dcr->fic = cpu_to_le16(NFIT_FIC1);
>> +
>> +    return sizeof(*nfit_dcr);
>> +}
>> +
>> +static void build_device_structure(GSList *device_list, char *buf)
>> +{
>> +    buf += sizeof(nfit);
>> +
>> +    for (; device_list; device_list = device_list->next) {
>> +        NVDIMMDevice *nvdimm = device_list->data;
>> +
>> +        /* build System Physical Address Range Description Table. */
>> +        buf += build_structure_spa(buf, nvdimm);
>> +
>> +        /*
>> +         * build Memory Device to System Physical Address Range Mapping
>> +         * Table.
>> +         */
>> +        buf += build_structure_memdev(buf, nvdimm);
>> +
>> +        /* build Control Region Descriptor Table. */
>> +        buf += build_structure_dcr(buf, nvdimm);
>> +    }
>> +}
>> +
>> +static void build_nfit(GSList *device_list, GArray *table_offsets,
>> +                       GArray *table_data, GArray *linker)
>> +{
>> +    size_t total;
>> +    char *buf;
>> +    int nfit_start, nr;
>> +
>> +    nr = g_slist_length(device_list);
>> +    total = get_nfit_total_size(nr);
>> +
>> +    nfit_start = table_data->len;
>> +    acpi_add_table(table_offsets, table_data);
>> +
>> +    buf = acpi_data_push(table_data, total);
>> +    build_device_structure(device_list, buf);
>
> This seems fragile. Should build_device_structure overflow
> a buffer we'll corrupt memory.
> Current code does use acpi_data_push but only for trivial
> things like fixed size headers.
> Can't you use glib to dynamically append things to table
> as they are generated?
>

Okay, good point, will adjust it in the next version.

>
>> +
>> +    build_header(linker, table_data, (void *)(table_data->data + nfit_start),
>> +                 "NFIT", table_data->len - nfit_start, 1);
>> +}
>> +
>> +void nvdimm_build_acpi_table(NVDIMMState *state, GArray *table_offsets,
>> +                             GArray *table_data, GArray *linker)
>> +{
>> +    GSList *device_list = nvdimm_get_built_list();
>> +
>> +    if (!memory_region_size(&state->mr)) {
>> +        assert(!device_list);
>> +        return;
>> +    }
>> +
>> +    if (device_list) {
>> +        build_nfit(device_list, table_offsets, table_data, linker);
>> +        g_slist_free(device_list);
>> +    }
>> +}
>> diff --git a/hw/mem/nvdimm/internal.h b/hw/mem/nvdimm/internal.h
>> index c4ba750..5551448 100644
>> --- a/hw/mem/nvdimm/internal.h
>> +++ b/hw/mem/nvdimm/internal.h
>> @@ -14,4 +14,17 @@
>>   #define NVDIMM_INTERNAL_H
>>
>>   #define MIN_NAMESPACE_LABEL_SIZE    (128UL << 10)
>> +
>> +struct uuid_le {
>> +    uint8_t b[16];
>> +};
>> +typedef struct uuid_le uuid_le;
>> +
>> +#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)                   \
>> +((uuid_le)                                                                 \
>> +{ { (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
>> +    (b) & 0xff, ((b) >> 8) & 0xff, (c) & 0xff, ((c) >> 8) & 0xff,          \
>> +    (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) } })
>> +
>
> Please avoid polluting the global namespace.
> Prefix everything with NVDIMM.

Hmm... this include-file, "internal.h", locates at hw/mem/nvdimm/ which
is only used in NVDIMM internal. But your point is good to me, i will carefully
name the stuff defined in a include-file.

>
>> +GSList *nvdimm_get_built_list(void);
>
> You are adding an extern function with no comment
> about it's purpose anywhere. Pls fix this.
> The name isn't pretty. What does "built" mean?
> List of what? Is this a device list?

I used the sytle in pc-dimm.c, pc_dimm_built_list(), i will
rename it to nvdimm_device_list() for better match its doing.

>
>>   #endif
>
> This header is too small to be worth it.
> nvdimm_get_built_list seems to be the only interface -
> just stick it in the header you have under include.
>

Other functions are introudced and included into it in later patches,
it includes the internal things shared between nvdimm device, nvdimm ACPI,
nvdimm namespace.

Furthermore, this is a internal include file, it is not bad i think.
Xiao Guangrong Oct. 13, 2015, 5:17 a.m. UTC | #4
On 10/13/2015 12:40 AM, Dan Williams wrote:
> On Sat, Oct 10, 2015 at 8:52 PM, Xiao Guangrong
> <guangrong.xiao@linux.intel.com> wrote:
>> NFIT is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)
>>
>> Currently, we only support PMEM mode. Each device has 3 structures:
>> - SPA structure, defines the PMEM region info
>>
>> - MEM DEV structure, it has the @handle which is used to associate specified
>>    ACPI NVDIMM  device we will introduce in later patch.
>>    Also we can happily ignored the memory device's interleave, the real
>>    nvdimm hardware access is hidden behind host
>>
>> - DCR structure, it defines vendor ID used to associate specified vendor
>>    nvdimm driver. Since we only implement PMEM mode this time, Command
>>    window and Data window are not needed
>>
>> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
>> ---
>>   hw/i386/acpi-build.c     |   4 +
>>   hw/mem/nvdimm/acpi.c     | 209 ++++++++++++++++++++++++++++++++++++++++++++++-
>>   hw/mem/nvdimm/internal.h |  13 +++
>>   hw/mem/nvdimm/nvdimm.c   |  25 ++++++
>>   include/hw/mem/nvdimm.h  |   2 +
>>   5 files changed, 252 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
>> index 95e0c65..c637dc8 100644
>> --- a/hw/i386/acpi-build.c
>> +++ b/hw/i386/acpi-build.c
>> @@ -1661,6 +1661,7 @@ static bool acpi_has_iommu(void)
>>   static
>>   void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>>   {
>> +    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
>>       GArray *table_offsets;
>>       unsigned facs, ssdt, dsdt, rsdt;
>>       AcpiCpuInfo cpu;
>> @@ -1742,6 +1743,9 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>>           build_dmar_q35(tables_blob, tables->linker);
>>       }
>>
>> +    nvdimm_build_acpi_table(&pcms->nvdimm_memory, table_offsets, tables_blob,
>> +                            tables->linker);
>> +
>>       /* Add tables supplied by user (if any) */
>>       for (u = acpi_table_first(); u; u = acpi_table_next(u)) {
>>           unsigned len = acpi_table_len(u);
>> diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
>> index b640874..62b1e02 100644
>> --- a/hw/mem/nvdimm/acpi.c
>> +++ b/hw/mem/nvdimm/acpi.c
>> @@ -32,6 +32,46 @@
>>   #include "hw/mem/nvdimm.h"
>>   #include "internal.h"
>>
>> +static void nfit_spa_uuid_pm(uuid_le *uuid)
>> +{
>> +    uuid_le uuid_pm = UUID_LE(0x66f0d379, 0xb4f3, 0x4074, 0xac, 0x43, 0x0d,
>> +                              0x33, 0x18, 0xb7, 0x8c, 0xdb);
>> +    memcpy(uuid, &uuid_pm, sizeof(uuid_pm));
>> +}
>> +
>> +enum {
>> +    NFIT_STRUCTURE_SPA = 0,
>> +    NFIT_STRUCTURE_MEMDEV = 1,
>> +    NFIT_STRUCTURE_IDT = 2,
>> +    NFIT_STRUCTURE_SMBIOS = 3,
>> +    NFIT_STRUCTURE_DCR = 4,
>> +    NFIT_STRUCTURE_BDW = 5,
>> +    NFIT_STRUCTURE_FLUSH = 6,
>> +};
>> +
>> +enum {
>> +    EFI_MEMORY_UC = 0x1ULL,
>> +    EFI_MEMORY_WC = 0x2ULL,
>> +    EFI_MEMORY_WT = 0x4ULL,
>> +    EFI_MEMORY_WB = 0x8ULL,
>> +    EFI_MEMORY_UCE = 0x10ULL,
>> +    EFI_MEMORY_WP = 0x1000ULL,
>> +    EFI_MEMORY_RP = 0x2000ULL,
>> +    EFI_MEMORY_XP = 0x4000ULL,
>> +    EFI_MEMORY_NV = 0x8000ULL,
>> +    EFI_MEMORY_MORE_RELIABLE = 0x10000ULL,
>> +};
>
> Would it worth including / copying the ACPICA header files directly?
>
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/acpi/actbl1.h
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/acpi/acuuid.h

Good point, Dan.

These files are not exported under uapi/ so that it is not good to directly
include it, i will learn the definition and adjust it to QEMU's code style
in the next version.

Thanks!
Michael S. Tsirkin Oct. 13, 2015, 5:42 a.m. UTC | #5
On Tue, Oct 13, 2015 at 01:13:18PM +0800, Xiao Guangrong wrote:
> >
> >>  #endif
> >
> >This header is too small to be worth it.
> >nvdimm_get_built_list seems to be the only interface -
> >just stick it in the header you have under include.
> >
> 
> Other functions are introudced and included into it in later patches,
> it includes the internal things shared between nvdimm device, nvdimm ACPI,
> nvdimm namespace.
> 
> Furthermore, this is a internal include file, it is not bad i think.

Each time we do this, this seems to invite abuse where
people add APIs without documenting them.

I guess I could buy this if you add nvdimm_defs.h
with just internal things such as layout of the buffer
used for communication between ACPI and hardware.
Xiao Guangrong Oct. 13, 2015, 6:06 a.m. UTC | #6
On 10/13/2015 01:42 PM, Michael S. Tsirkin wrote:
> On Tue, Oct 13, 2015 at 01:13:18PM +0800, Xiao Guangrong wrote:
>>>
>>>>   #endif
>>>
>>> This header is too small to be worth it.
>>> nvdimm_get_built_list seems to be the only interface -
>>> just stick it in the header you have under include.
>>>
>>
>> Other functions are introudced and included into it in later patches,
>> it includes the internal things shared between nvdimm device, nvdimm ACPI,
>> nvdimm namespace.
>>
>> Furthermore, this is a internal include file, it is not bad i think.
>
> Each time we do this, this seems to invite abuse where
> people add APIs without documenting them.
>

Understood.

> I guess I could buy this if you add nvdimm_defs.h
> with just internal things such as layout of the buffer
> used for communication between ACPI and hardware.
>

Okay, i will rename internel.h to nvdimm_defs.h and carefully document
everything (definitions, function prototypes, etc.) in this file. :)
Michael S. Tsirkin Oct. 13, 2015, 6:07 a.m. UTC | #7
On Tue, Oct 13, 2015 at 01:17:20PM +0800, Xiao Guangrong wrote:
> >Would it worth including / copying the ACPICA header files directly?
> >
> >https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/acpi/actbl1.h
> >https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/acpi/acuuid.h
> 
> Good point, Dan.
> 
> These files are not exported under uapi/ so that it is not good to directly
> include it, i will learn the definition and adjust it to QEMU's code style
> in the next version.
> 
> Thanks!
> 

You can talk to acpica guys to try to move acuuid.h to uapi
if you like. But there's not a lot there that we need,
I'm not sure it's worth it.
diff mbox

Patch

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 95e0c65..c637dc8 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1661,6 +1661,7 @@  static bool acpi_has_iommu(void)
 static
 void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
 {
+    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
     GArray *table_offsets;
     unsigned facs, ssdt, dsdt, rsdt;
     AcpiCpuInfo cpu;
@@ -1742,6 +1743,9 @@  void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
         build_dmar_q35(tables_blob, tables->linker);
     }
 
+    nvdimm_build_acpi_table(&pcms->nvdimm_memory, table_offsets, tables_blob,
+                            tables->linker);
+
     /* Add tables supplied by user (if any) */
     for (u = acpi_table_first(); u; u = acpi_table_next(u)) {
         unsigned len = acpi_table_len(u);
diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index b640874..62b1e02 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -32,6 +32,46 @@ 
 #include "hw/mem/nvdimm.h"
 #include "internal.h"
 
+static void nfit_spa_uuid_pm(uuid_le *uuid)
+{
+    uuid_le uuid_pm = UUID_LE(0x66f0d379, 0xb4f3, 0x4074, 0xac, 0x43, 0x0d,
+                              0x33, 0x18, 0xb7, 0x8c, 0xdb);
+    memcpy(uuid, &uuid_pm, sizeof(uuid_pm));
+}
+
+enum {
+    NFIT_STRUCTURE_SPA = 0,
+    NFIT_STRUCTURE_MEMDEV = 1,
+    NFIT_STRUCTURE_IDT = 2,
+    NFIT_STRUCTURE_SMBIOS = 3,
+    NFIT_STRUCTURE_DCR = 4,
+    NFIT_STRUCTURE_BDW = 5,
+    NFIT_STRUCTURE_FLUSH = 6,
+};
+
+enum {
+    EFI_MEMORY_UC = 0x1ULL,
+    EFI_MEMORY_WC = 0x2ULL,
+    EFI_MEMORY_WT = 0x4ULL,
+    EFI_MEMORY_WB = 0x8ULL,
+    EFI_MEMORY_UCE = 0x10ULL,
+    EFI_MEMORY_WP = 0x1000ULL,
+    EFI_MEMORY_RP = 0x2000ULL,
+    EFI_MEMORY_XP = 0x4000ULL,
+    EFI_MEMORY_NV = 0x8000ULL,
+    EFI_MEMORY_MORE_RELIABLE = 0x10000ULL,
+};
+
+/*
+ * NVDIMM Firmware Interface Table
+ * @signature: "NFIT"
+ */
+struct nfit {
+    ACPI_TABLE_HEADER_DEF
+    uint32_t reserved;
+} QEMU_PACKED;
+typedef struct nfit nfit;
+
 /* System Physical Address Range Structure */
 struct nfit_spa {
     uint16_t type;
@@ -40,13 +80,21 @@  struct nfit_spa {
     uint16_t flags;
     uint32_t reserved;
     uint32_t proximity_domain;
-    uint8_t type_guid[16];
+    uuid_le type_guid;
     uint64_t spa_base;
     uint64_t spa_length;
     uint64_t mem_attr;
 } QEMU_PACKED;
 typedef struct nfit_spa nfit_spa;
 
+/*
+ * Control region is strictly for management during hot add/online
+ * operation.
+ */
+#define SPA_FLAGS_ADD_ONLINE_ONLY     (1)
+/* Data in Proximity Domain field is valid. */
+#define SPA_FLAGS_PROXIMITY_VALID     (1 << 1)
+
 /* Memory Device to System Physical Address Range Mapping Structure */
 struct nfit_memdev {
     uint16_t type;
@@ -91,12 +139,20 @@  struct nfit_dcr {
 } QEMU_PACKED;
 typedef struct nfit_dcr nfit_dcr;
 
+#define REVSISON_ID    1
+#define NFIT_FIC1      0x201
+
 static uint64_t nvdimm_device_structure_size(uint64_t slots)
 {
     /* each nvdimm has three structures. */
     return slots * (sizeof(nfit_spa) + sizeof(nfit_memdev) + sizeof(nfit_dcr));
 }
 
+static uint64_t get_nfit_total_size(uint64_t slots)
+{
+    return sizeof(struct nfit) + nvdimm_device_structure_size(slots);
+}
+
 static uint64_t nvdimm_acpi_memory_size(uint64_t slots, uint64_t page_size)
 {
     uint64_t size = nvdimm_device_structure_size(slots);
@@ -118,3 +174,154 @@  void nvdimm_init_memory_state(NVDIMMState *state, MemoryRegion*system_memory,
                        NVDIMM_ACPI_MEM_SIZE);
     memory_region_add_subregion(system_memory, state->base, &state->mr);
 }
+
+static uint32_t nvdimm_slot_to_sn(int slot)
+{
+    return 0x123456 + slot;
+}
+
+static uint32_t nvdimm_slot_to_handle(int slot)
+{
+    return slot + 1;
+}
+
+static uint16_t nvdimm_slot_to_spa_index(int slot)
+{
+    return (slot + 1) << 1;
+}
+
+static uint32_t nvdimm_slot_to_dcr_index(int slot)
+{
+    return nvdimm_slot_to_spa_index(slot) + 1;
+}
+
+static int build_structure_spa(void *buf, NVDIMMDevice *nvdimm)
+{
+    nfit_spa *nfit_spa;
+    uint64_t addr = object_property_get_int(OBJECT(nvdimm), DIMM_ADDR_PROP,
+                                            NULL);
+    uint64_t size = object_property_get_int(OBJECT(nvdimm), DIMM_SIZE_PROP,
+                                            NULL);
+    uint32_t node = object_property_get_int(OBJECT(nvdimm), DIMM_NODE_PROP,
+                                            NULL);
+    int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
+                                            NULL);
+
+    nfit_spa = buf;
+
+    nfit_spa->type = cpu_to_le16(NFIT_STRUCTURE_SPA);
+    nfit_spa->length = cpu_to_le16(sizeof(*nfit_spa));
+    nfit_spa->spa_index = cpu_to_le16(nvdimm_slot_to_spa_index(slot));
+    nfit_spa->flags = cpu_to_le16(SPA_FLAGS_PROXIMITY_VALID);
+    nfit_spa->proximity_domain = cpu_to_le32(node);
+    nfit_spa_uuid_pm(&nfit_spa->type_guid);
+    nfit_spa->spa_base = cpu_to_le64(addr);
+    nfit_spa->spa_length = cpu_to_le64(size);
+    nfit_spa->mem_attr = cpu_to_le64(EFI_MEMORY_WB | EFI_MEMORY_NV);
+
+    return sizeof(*nfit_spa);
+}
+
+static int build_structure_memdev(void *buf, NVDIMMDevice *nvdimm)
+{
+    nfit_memdev *nfit_memdev;
+    uint64_t addr = object_property_get_int(OBJECT(nvdimm), DIMM_ADDR_PROP,
+                                            NULL);
+    uint64_t size = object_property_get_int(OBJECT(nvdimm), DIMM_SIZE_PROP,
+                                            NULL);
+    int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
+                                            NULL);
+    uint32_t handle = nvdimm_slot_to_handle(slot);
+
+    nfit_memdev = buf;
+    nfit_memdev->type = cpu_to_le16(NFIT_STRUCTURE_MEMDEV);
+    nfit_memdev->length = cpu_to_le16(sizeof(*nfit_memdev));
+    nfit_memdev->nfit_handle = cpu_to_le32(handle);
+    /* point to nfit_spa. */
+    nfit_memdev->spa_index = cpu_to_le16(nvdimm_slot_to_spa_index(slot));
+    /* point to nfit_dcr. */
+    nfit_memdev->dcr_index = cpu_to_le16(nvdimm_slot_to_dcr_index(slot));
+    nfit_memdev->region_len = cpu_to_le64(size);
+    nfit_memdev->region_dpa = cpu_to_le64(addr);
+    /* Only one interleave for pmem. */
+    nfit_memdev->interleave_ways = cpu_to_le16(1);
+
+    return sizeof(*nfit_memdev);
+}
+
+static int build_structure_dcr(void *buf, NVDIMMDevice *nvdimm)
+{
+    nfit_dcr *nfit_dcr;
+    int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
+                                       NULL);
+    uint32_t sn = nvdimm_slot_to_sn(slot);
+
+    nfit_dcr = buf;
+    nfit_dcr->type = cpu_to_le16(NFIT_STRUCTURE_DCR);
+    nfit_dcr->length = cpu_to_le16(sizeof(*nfit_dcr));
+    nfit_dcr->dcr_index = cpu_to_le16(nvdimm_slot_to_dcr_index(slot));
+    nfit_dcr->vendor_id = cpu_to_le16(0x8086);
+    nfit_dcr->device_id = cpu_to_le16(1);
+    nfit_dcr->revision_id = cpu_to_le16(REVSISON_ID);
+    nfit_dcr->serial_number = cpu_to_le32(sn);
+    nfit_dcr->fic = cpu_to_le16(NFIT_FIC1);
+
+    return sizeof(*nfit_dcr);
+}
+
+static void build_device_structure(GSList *device_list, char *buf)
+{
+    buf += sizeof(nfit);
+
+    for (; device_list; device_list = device_list->next) {
+        NVDIMMDevice *nvdimm = device_list->data;
+
+        /* build System Physical Address Range Description Table. */
+        buf += build_structure_spa(buf, nvdimm);
+
+        /*
+         * build Memory Device to System Physical Address Range Mapping
+         * Table.
+         */
+        buf += build_structure_memdev(buf, nvdimm);
+
+        /* build Control Region Descriptor Table. */
+        buf += build_structure_dcr(buf, nvdimm);
+    }
+}
+
+static void build_nfit(GSList *device_list, GArray *table_offsets,
+                       GArray *table_data, GArray *linker)
+{
+    size_t total;
+    char *buf;
+    int nfit_start, nr;
+
+    nr = g_slist_length(device_list);
+    total = get_nfit_total_size(nr);
+
+    nfit_start = table_data->len;
+    acpi_add_table(table_offsets, table_data);
+
+    buf = acpi_data_push(table_data, total);
+    build_device_structure(device_list, buf);
+
+    build_header(linker, table_data, (void *)(table_data->data + nfit_start),
+                 "NFIT", table_data->len - nfit_start, 1);
+}
+
+void nvdimm_build_acpi_table(NVDIMMState *state, GArray *table_offsets,
+                             GArray *table_data, GArray *linker)
+{
+    GSList *device_list = nvdimm_get_built_list();
+
+    if (!memory_region_size(&state->mr)) {
+        assert(!device_list);
+        return;
+    }
+
+    if (device_list) {
+        build_nfit(device_list, table_offsets, table_data, linker);
+        g_slist_free(device_list);
+    }
+}
diff --git a/hw/mem/nvdimm/internal.h b/hw/mem/nvdimm/internal.h
index c4ba750..5551448 100644
--- a/hw/mem/nvdimm/internal.h
+++ b/hw/mem/nvdimm/internal.h
@@ -14,4 +14,17 @@ 
 #define NVDIMM_INTERNAL_H
 
 #define MIN_NAMESPACE_LABEL_SIZE    (128UL << 10)
+
+struct uuid_le {
+    uint8_t b[16];
+};
+typedef struct uuid_le uuid_le;
+
+#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)                   \
+((uuid_le)                                                                 \
+{ { (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+    (b) & 0xff, ((b) >> 8) & 0xff, (c) & 0xff, ((c) >> 8) & 0xff,          \
+    (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) } })
+
+GSList *nvdimm_get_built_list(void);
 #endif
diff --git a/hw/mem/nvdimm/nvdimm.c b/hw/mem/nvdimm/nvdimm.c
index 0850e82..bc8c577 100644
--- a/hw/mem/nvdimm/nvdimm.c
+++ b/hw/mem/nvdimm/nvdimm.c
@@ -26,6 +26,31 @@ 
 #include "hw/mem/nvdimm.h"
 #include "internal.h"
 
+static int nvdimm_built_list(Object *obj, void *opaque)
+{
+    GSList **list = opaque;
+
+    if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+        DeviceState *dev = DEVICE(obj);
+
+        /* only realized NVDIMMs matter */
+        if (dev->realized) {
+            *list = g_slist_append(*list, dev);
+        }
+    }
+
+    object_child_foreach(obj, nvdimm_built_list, opaque);
+    return 0;
+}
+
+GSList *nvdimm_get_built_list(void)
+{
+    GSList *list = NULL;
+
+    object_child_foreach(qdev_get_machine(), nvdimm_built_list, &list);
+    return list;
+}
+
 static MemoryRegion *nvdimm_get_memory_region(DIMMDevice *dimm)
 {
     NVDIMMDevice *nvdimm = NVDIMM(dimm);
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index aa95961..0a6bda4 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -49,4 +49,6 @@  typedef struct NVDIMMState NVDIMMState;
 
 void nvdimm_init_memory_state(NVDIMMState *state, MemoryRegion*system_memory,
                               MachineState *machine , uint64_t page_size);
+void nvdimm_build_acpi_table(NVDIMMState *state, GArray *table_offsets,
+                             GArray *table_data, GArray *linker);
 #endif