[v13,06/12] numa: Extend CLI to provide memory latency and bandwidth information
diff mbox series

Message ID 20191020111125.27659-7-tao3.xu@intel.com
State New
Headers show
Series
  • Build ACPI Heterogeneous Memory Attribute Table (HMAT)
Related show

Commit Message

Tao Xu Oct. 20, 2019, 11:11 a.m. UTC
From: Liu Jingqi <jingqi.liu@intel.com>

Add -numa hmat-lb option to provide System Locality Latency and
Bandwidth Information. These memory attributes help to build
System Locality Latency and Bandwidth Information Structure(s)
in ACPI Heterogeneous Memory Attribute Table (HMAT).

Signed-off-by: Liu Jingqi <jingqi.liu@intel.com>
Signed-off-by: Tao Xu <tao3.xu@intel.com>
---

Changes in v13:
    - Reuse Garray to store the raw bandwidth and bandwidth data
    - Calculate common base unit using range bitmap (Igor)
---
 hw/core/numa.c        | 127 ++++++++++++++++++++++++++++++++++++++++++
 include/sysemu/numa.h |  68 ++++++++++++++++++++++
 qapi/machine.json     |  95 ++++++++++++++++++++++++++++++-
 qemu-options.hx       |  49 +++++++++++++++-
 4 files changed, 336 insertions(+), 3 deletions(-)

Comments

Igor Mammedov Oct. 22, 2019, 7:08 a.m. UTC | #1
On Sun, 20 Oct 2019 19:11:19 +0800
Tao Xu <tao3.xu@intel.com> wrote:

> From: Liu Jingqi <jingqi.liu@intel.com>
> 
> Add -numa hmat-lb option to provide System Locality Latency and
> Bandwidth Information. These memory attributes help to build
> System Locality Latency and Bandwidth Information Structure(s)
> in ACPI Heterogeneous Memory Attribute Table (HMAT).
> 
> Signed-off-by: Liu Jingqi <jingqi.liu@intel.com>
> Signed-off-by: Tao Xu <tao3.xu@intel.com>
> ---
> 
> Changes in v13:
>     - Reuse Garray to store the raw bandwidth and bandwidth data
>     - Calculate common base unit using range bitmap (Igor)
> ---
>  hw/core/numa.c        | 127 ++++++++++++++++++++++++++++++++++++++++++
>  include/sysemu/numa.h |  68 ++++++++++++++++++++++
>  qapi/machine.json     |  95 ++++++++++++++++++++++++++++++-
>  qemu-options.hx       |  49 +++++++++++++++-
>  4 files changed, 336 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/core/numa.c b/hw/core/numa.c
> index eba66ab768..3cf77f6ac9 100644
> --- a/hw/core/numa.c
> +++ b/hw/core/numa.c
> @@ -23,6 +23,7 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "qemu/units.h"
>  #include "sysemu/hostmem.h"
>  #include "sysemu/numa.h"
>  #include "sysemu/sysemu.h"
> @@ -198,6 +199,119 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp)
>      ms->numa_state->have_numa_distance = true;
>  }
>  
> +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
> +                        Error **errp)
> +{
> +    int first_bit, last_bit;
> +    uint64_t temp_latency;
> +    NodeInfo *numa_info = numa_state->nodes;
> +    HMAT_LB_Info *hmat_lb =
> +        numa_state->hmat_lb[node->hierarchy][node->data_type];
> +    HMAT_LB_Data lb_data;
> +
> +    /* Error checking */
> +    if (node->initiator >= numa_state->num_nodes) {
> +        error_setg(errp, "Invalid initiator=%d, it should be less than %d.",
> +                   node->initiator, numa_state->num_nodes);
> +        return;
> +    }
> +    if (node->target >= numa_state->num_nodes) {
> +        error_setg(errp, "Invalid target=%d, it should be less than %d.",
> +                   node->target, numa_state->num_nodes);
> +        return;
> +    }
> +    if (!numa_info[node->initiator].has_cpu) {
> +        error_setg(errp, "Invalid initiator=%d, it isn't an "
> +                   "initiator proximity domain.", node->initiator);
> +        return;
> +    }
> +    if (!numa_info[node->target].present) {
> +        error_setg(errp, "Invalid target=%d, it hasn't a valid NUMA node.",
> +                   node->target);
> +        return;
> +    }
> +
> +    if (!hmat_lb) {
> +        hmat_lb = g_malloc0(sizeof(*hmat_lb));
> +        numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb;
> +        hmat_lb->latency = g_array_new(false, true, sizeof(HMAT_LB_Data));
> +        hmat_lb->bandwidth = g_array_new(false, true, sizeof(HMAT_LB_Data));
> +    }
> +    hmat_lb->hierarchy = node->hierarchy;
> +    hmat_lb->data_type = node->data_type;
> +    lb_data.initiator = node->initiator;
> +    lb_data.target = node->target;
> +
> +    /* Input latency data */
> +    if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) {
> +        if (!node->has_latency) {
> +            error_setg(errp, "Missing 'latency' option.");
> +            return;
> +        }
> +        if (node->has_bandwidth) {
> +            error_setg(errp, "Invalid option 'bandwidth' since "
> +                       "the data type is latency.");
> +            return;
> +        }
> +
> +        temp_latency = node->latency;
> +        hmat_lb->base_latency = 1;
> +        while (QEMU_IS_ALIGNED(temp_latency, 10)) {
> +            temp_latency /= 10;
> +            hmat_lb->base_latency *= 10;
> +        }
> +
> +        if (temp_latency >= UINT64_MAX) {
                            ^  ^^^^ doesn't make sense

can't you use range bitmap here as well?

> +            error_setg(errp, "Latency %" PRIu64 " between initiator=%d and "
> +                       "target=%d should not differ from previously entered "
> +                       "values on more than %d.", node->latency,
> +                       node->initiator, node->target, UINT16_MAX - 1);
> +            return;
> +        }
> +        if (temp_latency > hmat_lb->range_left_la) {
> +            hmat_lb->range_left_la = temp_latency;
> +        }
> +
> +        lb_data.rawdata = node->latency;
> +        g_array_append_val(hmat_lb->latency, lb_data);
> +    }
> +
> +    /* Input bandwidth data */
> +    if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) {
> +        if (!node->has_bandwidth) {
> +            error_setg(errp, "Missing 'bandwidth' option.");
> +            return;
> +        }
> +        if (node->has_latency) {
> +            error_setg(errp, "Invalid option 'latency' since "
> +                       "the data type is bandwidth.");
> +            return;
> +        }
> +        if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) {
> +            error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and "
> +                       "target=%d should be 1MB aligned.", node->bandwidth,
> +                       node->initiator, node->target);
> +            return;
> +        }
> +
> +        hmat_lb->range_bitmap_bw |= node->bandwidth;
> +
> +        first_bit = __builtin_ffs(hmat_lb->range_bitmap_bw);
> +        last_bit = __builtin_ctz(hmat_lb->range_bitmap_bw);
> +        if ((last_bit - first_bit) > UINT16_BITS) {
> +            error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and "
> +                       "target=%d should not differ from previously entered "
> +                       "values on more than %d.", node->bandwidth,
> +                       node->initiator, node->target, UINT16_MAX - 1);
> +            return;
> +        }
> +
> +        hmat_lb->base_bandwidth = UINT64_C(1) << (first_bit - 1);
> +        lb_data.rawdata = node->bandwidth;
> +        g_array_append_val(hmat_lb->bandwidth, lb_data);
> +    }
> +}
> +
>  void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
>  {
>      Error *err = NULL;
> @@ -236,6 +350,19 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
>          machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu),
>                                    &err);
>          break;
> +    case NUMA_OPTIONS_TYPE_HMAT_LB:
> +        if (!ms->numa_state->hmat_enabled) {
> +            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
> +                       "(HMAT) is disabled, enable it with -machine hmat=on "
> +                       "before using any of hmat specific options.");
> +            return;
> +        }
> +
> +        parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err);
> +        if (err) {
> +            goto end;
> +        }
> +        break;
>      default:
>          abort();
>      }
> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
> index 788cbec7a2..b45afcb29e 100644
> --- a/include/sysemu/numa.h
> +++ b/include/sysemu/numa.h
> @@ -14,6 +14,29 @@ struct CPUArchId;
>  #define NUMA_DISTANCE_MAX         254
>  #define NUMA_DISTANCE_UNREACHABLE 255
>  
> +/* the value of AcpiHmatLBInfo flags */
> +enum {
> +    HMAT_LB_MEM_MEMORY           = 0,
> +    HMAT_LB_MEM_CACHE_1ST_LEVEL  = 1,
> +    HMAT_LB_MEM_CACHE_2ND_LEVEL  = 2,
> +    HMAT_LB_MEM_CACHE_3RD_LEVEL  = 3,
> +};
> +
> +/* the value of AcpiHmatLBInfo data type */
> +enum {
> +    HMAT_LB_DATA_ACCESS_LATENCY   = 0,
> +    HMAT_LB_DATA_READ_LATENCY     = 1,
> +    HMAT_LB_DATA_WRITE_LATENCY    = 2,
> +    HMAT_LB_DATA_ACCESS_BANDWIDTH = 3,
> +    HMAT_LB_DATA_READ_BANDWIDTH   = 4,
> +    HMAT_LB_DATA_WRITE_BANDWIDTH  = 5,
> +};
> +
> +#define UINT16_BITS       16
> +
> +#define HMAT_LB_LEVELS    (HMAT_LB_MEM_CACHE_3RD_LEVEL + 1)
> +#define HMAT_LB_TYPES     (HMAT_LB_DATA_WRITE_BANDWIDTH + 1)
> +
>  struct NodeInfo {
>      uint64_t node_mem;
>      struct HostMemoryBackend *node_memdev;
> @@ -28,6 +51,46 @@ struct NumaNodeMem {
>      uint64_t node_plugged_mem;
>  };
>  
> +struct HMAT_LB_Data {
> +    uint8_t     initiator;
> +    uint8_t     target;
> +    uint64_t    rawdata;
> +};
> +typedef struct HMAT_LB_Data HMAT_LB_Data;
> +
> +struct HMAT_LB_Info {
> +    /* Indicates it's memory or the specified level memory side cache. */
> +    uint8_t     hierarchy;
> +
> +    /* Present the type of data, access/read/write latency or bandwidth. */
> +    uint8_t     data_type;
> +
> +    /* The left range of latency for calculating common latency base */
> +    uint64_t    range_left_la;
> +
> +    /* The range bitmap of bandwidth for calculating common bandwidth base */
> +    uint64_t    range_bitmap_bw;
> +
> +    /* The common base unit for latencies */
> +    uint64_t    base_latency;
> +
> +    /* The common base unit for bandwidths */
> +    uint64_t    base_bandwidth;
> +
> +    /* Array to store the compressed latencies */
> +    uint16_t    *entry_latency;
> +
> +    /* Array to store the compressed latencies */
> +    uint16_t    *entry_bandwidth;
> +
> +    /* Array to store the latencies */
> +    GArray      *latency;
> +
> +    /* Array to store the bandwidthes */
> +    GArray      *bandwidth;
> +};
> +typedef struct HMAT_LB_Info HMAT_LB_Info;
> +
>  struct NumaState {
>      /* Number of NUMA nodes */
>      int num_nodes;
> @@ -40,11 +103,16 @@ struct NumaState {
>  
>      /* NUMA nodes information */
>      NodeInfo nodes[MAX_NODES];
> +
> +    /* NUMA nodes HMAT Locality Latency and Bandwidth Information */
> +    HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES];
>  };
>  typedef struct NumaState NumaState;
>  
>  void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp);
>  void parse_numa_opts(MachineState *ms);
> +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
> +                        Error **errp);
>  void numa_complete_configuration(MachineState *ms);
>  void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms);
>  extern QemuOptsList qemu_numa_opts;
> diff --git a/qapi/machine.json b/qapi/machine.json
> index f1b07b3486..9ca008810b 100644
> --- a/qapi/machine.json
> +++ b/qapi/machine.json
> @@ -426,10 +426,12 @@
>  #
>  # @cpu: property based CPU(s) to node mapping (Since: 2.10)
>  #
> +# @hmat-lb: memory latency and bandwidth information (Since: 4.2)
> +#
>  # Since: 2.1
>  ##
>  { 'enum': 'NumaOptionsType',
> -  'data': [ 'node', 'dist', 'cpu' ] }
> +  'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] }
>  
>  ##
>  # @NumaOptions:
> @@ -444,7 +446,8 @@
>    'data': {
>      'node': 'NumaNodeOptions',
>      'dist': 'NumaDistOptions',
> -    'cpu': 'NumaCpuOptions' }}
> +    'cpu': 'NumaCpuOptions',
> +    'hmat-lb': 'NumaHmatLBOptions' }}
>  
>  ##
>  # @NumaNodeOptions:
> @@ -557,6 +560,94 @@
>     'base': 'CpuInstanceProperties',
>     'data' : {} }
>  
> +##
> +# @HmatLBMemoryHierarchy:
> +#
> +# The memory hierarchy in the System Locality Latency
> +# and Bandwidth Information Structure of HMAT (Heterogeneous
> +# Memory Attribute Table)
> +#
> +# For more information of @HmatLBMemoryHierarchy see
> +# the chapter 5.2.27.4: Table 5-142: Field "Flags" of ACPI 6.3 spec.
> +#
> +# @memory: the structure represents the memory performance
> +#
> +# @first-level: first level memory of memory side cached memory
> +#
> +# @second-level: second level memory of memory side cached memory
> +#
> +# @third-level: third level memory of memory side cached memory
> +#
> +# Since: 4.2
> +##
> +{ 'enum': 'HmatLBMemoryHierarchy',
> +  'data': [ 'memory', 'first-level', 'second-level', 'third-level' ] }
> +
> +##
> +# @HmatLBDataType:
> +#
> +# Data type in the System Locality Latency
> +# and Bandwidth Information Structure of HMAT (Heterogeneous
> +# Memory Attribute Table)
> +#
> +# For more information of @HmatLBDataType see
> +# the chapter 5.2.27.4: Table 5-142:  Field "Data Type" of ACPI 6.3 spec.
> +#
> +# @access-latency: access latency (nanoseconds)
> +#
> +# @read-latency: read latency (nanoseconds)
> +#
> +# @write-latency: write latency (nanoseconds)
> +#
> +# @access-bandwidth: access bandwidth (MB/s)
> +#
> +# @read-bandwidth: read bandwidth (MB/s)
> +#
> +# @write-bandwidth: write bandwidth (MB/s)
> +#
> +# Since: 4.2
> +##
> +{ 'enum': 'HmatLBDataType',
> +  'data': [ 'access-latency', 'read-latency', 'write-latency',
> +            'access-bandwidth', 'read-bandwidth', 'write-bandwidth' ] }
> +
> +##
> +# @NumaHmatLBOptions:
> +#
> +# Set the system locality latency and bandwidth information
> +# between Initiator and Target proximity Domains.
> +#
> +# For more information of @NumaHmatLBOptions see
> +# the chapter 5.2.27.4: Table 5-142 of ACPI 6.3 spec.
> +#
> +# @initiator: the Initiator Proximity Domain.
> +#
> +# @target: the Target Proximity Domain.
> +#
> +# @hierarchy: the Memory Hierarchy. Indicates the performance
> +#             of memory or side cache.
> +#
> +# @data-type: presents the type of data, access/read/write
> +#             latency or hit latency.
> +#
> +# @latency: the value of latency from @initiator to @target proximity domain,
> +#           the latency units are "ps(picosecond)", "ns(nanosecond)" or
> +#           "us(microsecond)".
> +#
> +# @bandwidth: the value of bandwidth between @initiator and @target proximity
> +#             domain, the bandwidth units are "MB(/s)","GB(/s)" or "TB(/s)".
> +#
> +# Since: 4.2
> +##
> +{ 'struct': 'NumaHmatLBOptions',
> +    'data': {
> +    'initiator': 'uint16',
> +    'target': 'uint16',
> +    'hierarchy': 'HmatLBMemoryHierarchy',
> +    'data-type': 'HmatLBDataType',
> +    '*latency': 'time',
> +    '*bandwidth': 'size' }}
> +
>  ##
>  # @HostMemPolicy:
>  #
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 1f96399521..de97939f9a 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -168,16 +168,19 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa,
>      "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
>      "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
>      "-numa dist,src=source,dst=destination,val=distance\n"
> -    "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n",
> +    "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n"
> +    "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n",
>      QEMU_ARCH_ALL)
>  STEXI
>  @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
>  @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
>  @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
>  @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
> +@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{str},data-type=@var{str}[,latency=@var{lat}][,bandwidth=@var{bw}]
>  @findex -numa
>  Define a NUMA node and assign RAM and VCPUs to it.
>  Set the NUMA distance from a source node to a destination node.
> +Set the ACPI Heterogeneous Memory Attributes for the given nodes.
>  
>  Legacy VCPU assignment uses @samp{cpus} option where
>  @var{firstcpu} and @var{lastcpu} are CPU indexes. Each
> @@ -256,6 +259,50 @@ specified resources, it just assigns existing resources to NUMA
>  nodes. This means that one still has to use the @option{-m},
>  @option{-smp} options to allocate RAM and VCPUs respectively.
>  
> +Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information
> +between initiator and target NUMA nodes in ACPI Heterogeneous Attribute Memory Table (HMAT).
> +Initiator NUMA node can create memory requests, usually including one or more processors.
> +Target NUMA node contains addressable memory.
> +
> +In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{str} of 'hierarchy'
> +is the memory hierarchy of the target NUMA node: if @var{str} is 'memory', the structure
> +represents the memory performance; if @var{str} is 'first-level|second-level|third-level',
> +this structure represents aggregated performance of memory side caches for each domain.
> +@var{str} of 'data-type' is type of data represented by this structure instance:
> +if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency(nanoseconds)
> +or 'access|read|write' bandwidth(MB/s) of the target memory; if 'hierarchy' is
> +'first-level|second-level|third-level', 'data-type' is 'access|read|write' hit latency
> +or 'access|read|write' hit bandwidth of the target memory side cache.
> +
> +@var{lat} of 'latency' is latency value, the possible value and units are
> +NUM[ps|ns|us] (picosecond|nanosecond|microsecond), the recommended unit is 'ns'. @var{bw}
> +is bandwidth value, the possible value and units are NUM[M|G|T], mean that
> +the bandwidth value are NUM MB/s, GB/s or TB/s. Note that max NUM is 65534,
> +if NUM is 0, means the corresponding latency or bandwidth information is not provided.
> +And if input numbers without any unit, the latency unit will be 'ps' and the bandwidth
> +will be MB/s.
> +
> +For example, the following option assigns NUMA node 0 and 1. Node 0 has 2 cpus and
> +a ram, node 1 has only a ram. The processors in node 0 access memory in node
> +0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s;
> +The processors in NUMA node 0 access memory in NUMA node 1 with access-latency 10
> +nanoseconds, access-bandwidth is 100 MB/s.
> +@example
> +-machine hmat=on \
> +-m 2G \
> +-object memory-backend-ram,size=1G,id=m0 \
> +-object memory-backend-ram,size=1G,id=m1 \
> +-smp 2 \
> +-numa node,nodeid=0,memdev=m0 \
> +-numa node,nodeid=1,memdev=m1,initiator=0 \
> +-numa cpu,node-id=0,socket-id=0 \
> +-numa cpu,node-id=0,socket-id=1 \
> +-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5ns \
> +-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \
> +-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10ns \
> +-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M
> +@end example
> +
>  ETEXI
>  
>  DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd,
Tao Xu Oct. 22, 2019, 8:22 a.m. UTC | #2
On 10/22/2019 3:08 PM, Igor Mammedov wrote:
> On Sun, 20 Oct 2019 19:11:19 +0800
> Tao Xu <tao3.xu@intel.com> wrote:
> 
>> From: Liu Jingqi <jingqi.liu@intel.com>
>>
>> Add -numa hmat-lb option to provide System Locality Latency and
>> Bandwidth Information. These memory attributes help to build
>> System Locality Latency and Bandwidth Information Structure(s)
>> in ACPI Heterogeneous Memory Attribute Table (HMAT).
>>
>> Signed-off-by: Liu Jingqi <jingqi.liu@intel.com>
>> Signed-off-by: Tao Xu <tao3.xu@intel.com>
>> ---
[...]
>> +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
>> +                        Error **errp)
>> +{
>> +    int first_bit, last_bit;
>> +    uint64_t temp_latency;
>> +    NodeInfo *numa_info = numa_state->nodes;
>> +    HMAT_LB_Info *hmat_lb =
>> +        numa_state->hmat_lb[node->hierarchy][node->data_type];
>> +    HMAT_LB_Data lb_data;
>> +
>> +    /* Error checking */
>> +    if (node->initiator >= numa_state->num_nodes) {
>> +        error_setg(errp, "Invalid initiator=%d, it should be less than %d.",
>> +                   node->initiator, numa_state->num_nodes);
>> +        return;
>> +    }
>> +    if (node->target >= numa_state->num_nodes) {
>> +        error_setg(errp, "Invalid target=%d, it should be less than %d.",
>> +                   node->target, numa_state->num_nodes);
>> +        return;
>> +    }
>> +    if (!numa_info[node->initiator].has_cpu) {
>> +        error_setg(errp, "Invalid initiator=%d, it isn't an "
>> +                   "initiator proximity domain.", node->initiator);
>> +        return;
>> +    }
>> +    if (!numa_info[node->target].present) {
>> +        error_setg(errp, "Invalid target=%d, it hasn't a valid NUMA node.",
>> +                   node->target);
>> +        return;
>> +    }
>> +
>> +    if (!hmat_lb) {
>> +        hmat_lb = g_malloc0(sizeof(*hmat_lb));
>> +        numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb;
>> +        hmat_lb->latency = g_array_new(false, true, sizeof(HMAT_LB_Data));
>> +        hmat_lb->bandwidth = g_array_new(false, true, sizeof(HMAT_LB_Data));
>> +    }
>> +    hmat_lb->hierarchy = node->hierarchy;
>> +    hmat_lb->data_type = node->data_type;
>> +    lb_data.initiator = node->initiator;
>> +    lb_data.target = node->target;
>> +
>> +    /* Input latency data */
>> +    if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) {
>> +        if (!node->has_latency) {
>> +            error_setg(errp, "Missing 'latency' option.");
>> +            return;
>> +        }
>> +        if (node->has_bandwidth) {
>> +            error_setg(errp, "Invalid option 'bandwidth' since "
>> +                       "the data type is latency.");
>> +            return;
>> +        }
>> +
>> +        temp_latency = node->latency;
>> +        hmat_lb->base_latency = 1;
>> +        while (QEMU_IS_ALIGNED(temp_latency, 10)) {
>> +            temp_latency /= 10;
>> +            hmat_lb->base_latency *= 10;
>> +        }
>> +
>> +        if (temp_latency >= UINT64_MAX) {
>                              ^  ^^^^ doesn't make sense
> 
> can't you use range bitmap here as well?
> 
Because the time unit is decimal. For example, if latency are 
1ns(1000ps, bit 0011 1110 1000) and 65534ns(0011 1110 0111 1111 1000 
0011 0000)

0000 0000 0000 0000 0011 1110 1000
0011 1110 0111 1111 1000 0011 0000

Then we can get the first_bit is 3 and last_bit is 25, 25 - 3 > 16.

But if we use 10 to find base, we can find the base is 1000ps, 
compressed latency are 1 and 65534(< UINT16_MAX).
Igor Mammedov Oct. 23, 2019, 3:28 p.m. UTC | #3
On Sun, 20 Oct 2019 19:11:19 +0800
Tao Xu <tao3.xu@intel.com> wrote:

> From: Liu Jingqi <jingqi.liu@intel.com>
> 
> Add -numa hmat-lb option to provide System Locality Latency and
> Bandwidth Information. These memory attributes help to build
> System Locality Latency and Bandwidth Information Structure(s)
> in ACPI Heterogeneous Memory Attribute Table (HMAT).
> 
> Signed-off-by: Liu Jingqi <jingqi.liu@intel.com>
> Signed-off-by: Tao Xu <tao3.xu@intel.com>
> ---
> 
> Changes in v13:
>     - Reuse Garray to store the raw bandwidth and bandwidth data
>     - Calculate common base unit using range bitmap (Igor)
> ---
>  hw/core/numa.c        | 127 ++++++++++++++++++++++++++++++++++++++++++
>  include/sysemu/numa.h |  68 ++++++++++++++++++++++
>  qapi/machine.json     |  95 ++++++++++++++++++++++++++++++-
>  qemu-options.hx       |  49 +++++++++++++++-
>  4 files changed, 336 insertions(+), 3 deletions(-)
Below some comments on doc parts of the patch
(since I'm too familiar with the topic y now I can't properly review doc parts)

perhaps Eric and Markus can suggest a better way to describe new options.

[...]

> diff --git a/qapi/machine.json b/qapi/machine.json
> index f1b07b3486..9ca008810b 100644
> --- a/qapi/machine.json
> +++ b/qapi/machine.json
> @@ -426,10 +426,12 @@
>  #
>  # @cpu: property based CPU(s) to node mapping (Since: 2.10)
>  #
> +# @hmat-lb: memory latency and bandwidth information (Since: 4.2)
> +#
>  # Since: 2.1
>  ##
>  { 'enum': 'NumaOptionsType',
> -  'data': [ 'node', 'dist', 'cpu' ] }
> +  'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] }
>  
>  ##
>  # @NumaOptions:
> @@ -444,7 +446,8 @@
>    'data': {
>      'node': 'NumaNodeOptions',
>      'dist': 'NumaDistOptions',
> -    'cpu': 'NumaCpuOptions' }}
> +    'cpu': 'NumaCpuOptions',
> +    'hmat-lb': 'NumaHmatLBOptions' }}
>  
>  ##
>  # @NumaNodeOptions:
> @@ -557,6 +560,94 @@
>     'base': 'CpuInstanceProperties',
>     'data' : {} }
>  
> +##
> +# @HmatLBMemoryHierarchy:
> +#
> +# The memory hierarchy in the System Locality Latency
> +# and Bandwidth Information Structure of HMAT (Heterogeneous
> +# Memory Attribute Table)
> +#
> +# For more information of @HmatLBMemoryHierarchy see
> +# the chapter 5.2.27.4: Table 5-142: Field "Flags" of ACPI 6.3 spec.
> +#
> +# @memory: the structure represents the memory performance
> +#
> +# @first-level: first level memory of memory side cached memory
> +#
> +# @second-level: second level memory of memory side cached memory
> +#
> +# @third-level: third level memory of memory side cached memory
> +#
> +# Since: 4.2
> +##
> +{ 'enum': 'HmatLBMemoryHierarchy',
> +  'data': [ 'memory', 'first-level', 'second-level', 'third-level' ] }
> +
> +##
> +# @HmatLBDataType:
> +#
> +# Data type in the System Locality Latency
> +# and Bandwidth Information Structure of HMAT (Heterogeneous
> +# Memory Attribute Table)
> +#
> +# For more information of @HmatLBDataType see
> +# the chapter 5.2.27.4: Table 5-142:  Field "Data Type" of ACPI 6.3 spec.
> +#
> +# @access-latency: access latency (nanoseconds)
> +#
> +# @read-latency: read latency (nanoseconds)
> +#
> +# @write-latency: write latency (nanoseconds)
> +#
> +# @access-bandwidth: access bandwidth (MB/s)
> +#
> +# @read-bandwidth: read bandwidth (MB/s)
> +#
> +# @write-bandwidth: write bandwidth (MB/s)
I think units here are not appropriate, values stored in fields are
minimal base units only and nothing else (i.e. ps and B/s)

> +#
> +# Since: 4.2
> +##
> +{ 'enum': 'HmatLBDataType',
> +  'data': [ 'access-latency', 'read-latency', 'write-latency',
> +            'access-bandwidth', 'read-bandwidth', 'write-bandwidth' ] }
> +
> +##
> +# @NumaHmatLBOptions:
> +#
> +# Set the system locality latency and bandwidth information
> +# between Initiator and Target proximity Domains.
> +#
> +# For more information of @NumaHmatLBOptions see
> +# the chapter 5.2.27.4: Table 5-142 of ACPI 6.3 spec.
> +#
> +# @initiator: the Initiator Proximity Domain.
> +#
> +# @target: the Target Proximity Domain.
> +#
> +# @hierarchy: the Memory Hierarchy. Indicates the performance
> +#             of memory or side cache.
> +#
> +# @data-type: presents the type of data, access/read/write
> +#             latency or hit latency.

> +# @latency: the value of latency from @initiator to @target proximity domain,
> +#           the latency units are "ps(picosecond)", "ns(nanosecond)" or
> +#           "us(microsecond)".
> +#
> +# @bandwidth: the value of bandwidth between @initiator and @target proximity
> +#             domain, the bandwidth units are "MB(/s)","GB(/s)" or "TB(/s)".
ditto

> +# Since: 4.2
> +##
> +{ 'struct': 'NumaHmatLBOptions',
> +    'data': {
> +    'initiator': 'uint16',
> +    'target': 'uint16',
> +    'hierarchy': 'HmatLBMemoryHierarchy',
> +    'data-type': 'HmatLBDataType',
> +    '*latency': 'time',
> +    '*bandwidth': 'size' }}
> +
>  ##
>  # @HostMemPolicy:
>  #
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 1f96399521..de97939f9a 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -168,16 +168,19 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa,
>      "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
>      "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
>      "-numa dist,src=source,dst=destination,val=distance\n"
> -    "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n",
> +    "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n"
> +    "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n",
>      QEMU_ARCH_ALL)
>  STEXI
>  @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
>  @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
>  @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
>  @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
> +@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{str},data-type=@var{str}[,latency=@var{lat}][,bandwidth=@var{bw}]
                                                                              ^^^                 ^^^
Using the same 'str' for 2 different enums is confusing.
Suggest for 1st use 'level' and for the second just 'type'

>  @findex -numa
>  Define a NUMA node and assign RAM and VCPUs to it.
>  Set the NUMA distance from a source node to a destination node.
> +Set the ACPI Heterogeneous Memory Attributes for the given nodes.
>  
>  Legacy VCPU assignment uses @samp{cpus} option where
>  @var{firstcpu} and @var{lastcpu} are CPU indexes. Each
> @@ -256,6 +259,50 @@ specified resources, it just assigns existing resources to NUMA
>  nodes. This means that one still has to use the @option{-m},
>  @option{-smp} options to allocate RAM and VCPUs respectively.
>  
> +Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information
> +between initiator and target NUMA nodes in ACPI Heterogeneous Attribute Memory Table (HMAT).
> +Initiator NUMA node can create memory requests, usually including one or more processors.
s/including/it has/

> +Target NUMA node contains addressable memory.
> +
> +In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{str} of 'hierarchy'
> +is the memory hierarchy of the target NUMA node: if @var{str} is 'memory', the structure
> +represents the memory performance; if @var{str} is 'first-level|second-level|third-level',
> +this structure represents aggregated performance of memory side caches for each domain.
> +@var{str} of 'data-type' is type of data represented by this structure instance:
> +if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency(nanoseconds)
is nanoseconds is right here? Looking at previous patches default value of suffix-less
should be picoseconds. I'd just drop '(nanoseconds)'. User will use appropriate suffix.

> +or 'access|read|write' bandwidth(MB/s) of the target memory; if 'hierarchy' is
ditto (MB/s), probably should be Bytes/s for default suffix-less value
(well, I'm not sure how to express it better)

> +'first-level|second-level|third-level', 'data-type' is 'access|read|write' hit latency
> +or 'access|read|write' hit bandwidth of the target memory side cache.
> +
> +@var{lat} of 'latency' is latency value, the possible value and units are
> +NUM[ps|ns|us] (picosecond|nanosecond|microsecond), the recommended unit is 'ns'. @var{bw}
> +is bandwidth value, the possible value and units are NUM[M|G|T], mean that

> +the bandwidth value are NUM MB/s, GB/s or TB/s. Note that max NUM is 65534,
> +if NUM is 0, means the corresponding latency or bandwidth information is not provided.
> +And if input numbers without any unit, the latency unit will be 'ps' and the bandwidth
> +will be MB/s.
 1st: above is applicable to both bw and lat values and should be documented as such
 2nd: 'max NUM is 65534' when different suffixes is fleeting target,
      spec says that entry with 0xFFFF is unreachable, so how about documenting
      unreachable value as 0xFFFFFFFFFFFFFFFF (then CLI parsing code will
      exclude it from range detection and acpi table building code translate it
      to internal 0xFFFF it could fit into the tables)

> +For example, the following option assigns NUMA node 0 and 1. Node 0 has 2 cpus and
> +a ram, node 1 has only a ram. The processors in node 0 access memory in node
> +0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s;
> +The processors in NUMA node 0 access memory in NUMA node 1 with access-latency 10
> +nanoseconds, access-bandwidth is 100 MB/s.
> +@example
> +-machine hmat=on \
> +-m 2G \
> +-object memory-backend-ram,size=1G,id=m0 \
> +-object memory-backend-ram,size=1G,id=m1 \
> +-smp 2 \
> +-numa node,nodeid=0,memdev=m0 \
> +-numa node,nodeid=1,memdev=m1,initiator=0 \
> +-numa cpu,node-id=0,socket-id=0 \
> +-numa cpu,node-id=0,socket-id=1 \
> +-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5ns \
> +-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \
> +-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10ns \
> +-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M
> +@end example
> +
>  ETEXI
>  
>  DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd,
Tao Xu Oct. 25, 2019, 6:33 a.m. UTC | #4
On 10/23/2019 11:28 PM, Igor Mammedov wrote:
> On Sun, 20 Oct 2019 19:11:19 +0800
> Tao Xu <tao3.xu@intel.com> wrote:
[...]
>> +#
>> +# @access-bandwidth: access bandwidth (MB/s)
>> +#
>> +# @read-bandwidth: read bandwidth (MB/s)
>> +#
>> +# @write-bandwidth: write bandwidth (MB/s)
> I think units here are not appropriate, values stored in fields are
> minimal base units only and nothing else (i.e. ps and B/s)
> 
Eric suggest me to drop picoseconds. So here I can use ns. For 
bandwidth, if we use B/s here, does it let user or developer to 
misunderstand that the smallest unit is B/s ?

>>   @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
>>   @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
>>   @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
>>   @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
>> +@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{str},data-type=@var{str}[,latency=@var{lat}][,bandwidth=@var{bw}]
>                                                                                ^^^                 ^^^
> Using the same 'str' for 2 different enums is confusing.
> Suggest for 1st use 'level' and for the second just 'type'
> 
Ok

>>   @findex -numa
>>   Define a NUMA node and assign RAM and VCPUs to it.
>>   Set the NUMA distance from a source node to a destination node.
>> +Set the ACPI Heterogeneous Memory Attributes for the given nodes.
>>   
>>   Legacy VCPU assignment uses @samp{cpus} option where
>>   @var{firstcpu} and @var{lastcpu} are CPU indexes. Each
>> @@ -256,6 +259,50 @@ specified resources, it just assigns existing resources to NUMA
>>   nodes. This means that one still has to use the @option{-m},
>>   @option{-smp} options to allocate RAM and VCPUs respectively.
>>   
>> +Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information
>> +between initiator and target NUMA nodes in ACPI Heterogeneous Attribute Memory Table (HMAT).
>> +Initiator NUMA node can create memory requests, usually including one or more processors.
> s/including/it has/
> 
>> +Target NUMA node contains addressable memory.
>> +
>> +In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{str} of 'hierarchy'
>> +is the memory hierarchy of the target NUMA node: if @var{str} is 'memory', the structure
>> +represents the memory performance; if @var{str} is 'first-level|second-level|third-level',
>> +this structure represents aggregated performance of memory side caches for each domain.
>> +@var{str} of 'data-type' is type of data represented by this structure instance:
>> +if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency(nanoseconds)
> is nanoseconds is right here? Looking at previous patches default value of suffix-less
> should be picoseconds. I'd just drop '(nanoseconds)'. User will use appropriate suffix.
> 
OK, I will drop it.
>> +or 'access|read|write' bandwidth(MB/s) of the target memory; if 'hierarchy' is
> ditto (MB/s), probably should be Bytes/s for default suffix-less value
> (well, I'm not sure how to express it better)
> 

But last version, we let !QEMU_IS_ALIGNED(node->bandwidth, MiB) as error.
>> +'first-level|second-level|third-level', 'data-type' is 'access|read|write' hit latency
>> +or 'access|read|write' hit bandwidth of the target memory side cache.
>> +
>> +@var{lat} of 'latency' is latency value, the possible value and units are
>> +NUM[ps|ns|us] (picosecond|nanosecond|microsecond), the recommended unit is 'ns'. @var{bw}
>> +is bandwidth value, the possible value and units are NUM[M|G|T], mean that
> 
>> +the bandwidth value are NUM MB/s, GB/s or TB/s. Note that max NUM is 65534,
>> +if NUM is 0, means the corresponding latency or bandwidth information is not provided.
>> +And if input numbers without any unit, the latency unit will be 'ps' and the bandwidth
>> +will be MB/s.
>   1st: above is applicable to both bw and lat values and should be documented as such
>   2nd: 'max NUM is 65534' when different suffixes is fleeting target,
>        spec says that entry with 0xFFFF is unreachable, so how about documenting
>        unreachable value as 0xFFFFFFFFFFFFFFFF (then CLI parsing code will
>        exclude it from range detection and acpi table building code translate it
>        to internal 0xFFFF it could fit into the tables)
> 

If we input 0xFFFFFFFFFFFFFFFF, qemu will raise error that parameter 
expect a size value.
Igor Mammedov Oct. 25, 2019, 1:27 p.m. UTC | #5
On Fri, 25 Oct 2019 14:33:53 +0800
Tao Xu <tao3.xu@intel.com> wrote:

> On 10/23/2019 11:28 PM, Igor Mammedov wrote:
> > On Sun, 20 Oct 2019 19:11:19 +0800
> > Tao Xu <tao3.xu@intel.com> wrote:  
> [...]
> >> +#
> >> +# @access-bandwidth: access bandwidth (MB/s)
> >> +#
> >> +# @read-bandwidth: read bandwidth (MB/s)
> >> +#
> >> +# @write-bandwidth: write bandwidth (MB/s)  
> > I think units here are not appropriate, values stored in fields are
> > minimal base units only and nothing else (i.e. ps and B/s)
> >   
> Eric suggest me to drop picoseconds. So here I can use ns. For 
> bandwidth, if we use B/s here, does it let user or developer to 
> misunderstand that the smallest unit is B/s ?

It's not nanoseconds or MB/s stored in theses fields, isn't it?
I'd specify units in which value is stored or drop units altogether.

Maybe Eric and Markus can suggest a better way to describe fields.

> >>   @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
> >>   @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
> >>   @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
> >>   @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
> >> +@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{str},data-type=@var{str}[,latency=@var{lat}][,bandwidth=@var{bw}]  
> >                                                                                ^^^                 ^^^
> > Using the same 'str' for 2 different enums is confusing.
> > Suggest for 1st use 'level' and for the second just 'type'
> >   
> Ok
> 
> >>   @findex -numa
> >>   Define a NUMA node and assign RAM and VCPUs to it.
> >>   Set the NUMA distance from a source node to a destination node.
> >> +Set the ACPI Heterogeneous Memory Attributes for the given nodes.
> >>   
> >>   Legacy VCPU assignment uses @samp{cpus} option where
> >>   @var{firstcpu} and @var{lastcpu} are CPU indexes. Each
> >> @@ -256,6 +259,50 @@ specified resources, it just assigns existing resources to NUMA
> >>   nodes. This means that one still has to use the @option{-m},
> >>   @option{-smp} options to allocate RAM and VCPUs respectively.
> >>   
> >> +Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information
> >> +between initiator and target NUMA nodes in ACPI Heterogeneous Attribute Memory Table (HMAT).
> >> +Initiator NUMA node can create memory requests, usually including one or more processors.  
> > s/including/it has/
> >   
> >> +Target NUMA node contains addressable memory.
> >> +
> >> +In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{str} of 'hierarchy'
> >> +is the memory hierarchy of the target NUMA node: if @var{str} is 'memory', the structure
> >> +represents the memory performance; if @var{str} is 'first-level|second-level|third-level',
> >> +this structure represents aggregated performance of memory side caches for each domain.
> >> +@var{str} of 'data-type' is type of data represented by this structure instance:
> >> +if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency(nanoseconds)  
> > is nanoseconds is right here? Looking at previous patches default value of suffix-less
> > should be picoseconds. I'd just drop '(nanoseconds)'. User will use appropriate suffix.
> >   
> OK, I will drop it.
> >> +or 'access|read|write' bandwidth(MB/s) of the target memory; if 'hierarchy' is  
> > ditto (MB/s), probably should be Bytes/s for default suffix-less value
> > (well, I'm not sure how to express it better)
> >   
> 
> But last version, we let !QEMU_IS_ALIGNED(node->bandwidth, MiB) as error.
it's alignment requirement and it doesn't mean that value could not be
represented in bytes/s.
And it is bytes/s if suffix isn't used.

As for alignment and precision of values you probably need to document
expectations here as well.

> >> +'first-level|second-level|third-level', 'data-type' is 'access|read|write' hit latency
> >> +or 'access|read|write' hit bandwidth of the target memory side cache.
> >> +
> >> +@var{lat} of 'latency' is latency value, the possible value and units are
> >> +NUM[ps|ns|us] (picosecond|nanosecond|microsecond), the recommended unit is 'ns'. @var{bw}
> >> +is bandwidth value, the possible value and units are NUM[M|G|T], mean that  
> >   
> >> +the bandwidth value are NUM MB/s, GB/s or TB/s. Note that max NUM is 65534,
> >> +if NUM is 0, means the corresponding latency or bandwidth information is not provided.
> >> +And if input numbers without any unit, the latency unit will be 'ps' and the bandwidth
> >> +will be MB/s.  
> >   1st: above is applicable to both bw and lat values and should be documented as such
> >   2nd: 'max NUM is 65534' when different suffixes is fleeting target,
> >        spec says that entry with 0xFFFF is unreachable, so how about documenting
> >        unreachable value as 0xFFFFFFFFFFFFFFFF (then CLI parsing code will
> >        exclude it from range detection and acpi table building code translate it
> >        to internal 0xFFFF it could fit into the tables)
> >   
> 
> If we input 0xFFFFFFFFFFFFFFFF, qemu will raise error that parameter 
> expect a size value.

opts_type_size() can't handle values >= 0xfffffffffffffc00

commit f46bfdbfc8f (util/cutils: Change qemu_strtosz*() from int64_t to uint64_t)

so behavior would change depending on if the value is parsed by CLI (size) or QMP (unit64) parsers.

we can cannibalize 0x0 as the unreachable value and an absent bandwidth/lat option
for not specified case.
It would be conflicting with matrix [1] values in spec, but CLI/QMP deals with
absolute values which are later processed into HMAT substructure.

Markus,
Can we make opts_type_size() handle full range of uint64_t?

1)
ACPI 6.3 spec:
5.2.27.4 System Locality Latency and Bandwidth Information Structure
Markus Armbruster Oct. 25, 2019, 7:44 p.m. UTC | #6
Igor Mammedov <imammedo@redhat.com> writes:

> On Fri, 25 Oct 2019 14:33:53 +0800
> Tao Xu <tao3.xu@intel.com> wrote:
>
>> On 10/23/2019 11:28 PM, Igor Mammedov wrote:
>> > On Sun, 20 Oct 2019 19:11:19 +0800
>> > Tao Xu <tao3.xu@intel.com> wrote:  
>> [...]
>> >> +#
>> >> +# @access-bandwidth: access bandwidth (MB/s)
>> >> +#
>> >> +# @read-bandwidth: read bandwidth (MB/s)
>> >> +#
>> >> +# @write-bandwidth: write bandwidth (MB/s)  
>> > I think units here are not appropriate, values stored in fields are
>> > minimal base units only and nothing else (i.e. ps and B/s)
>> >   
>> Eric suggest me to drop picoseconds. So here I can use ns. For 
>> bandwidth, if we use B/s here, does it let user or developer to 
>> misunderstand that the smallest unit is B/s ?
>
> It's not nanoseconds or MB/s stored in theses fields, isn't it?
> I'd specify units in which value is stored or drop units altogether.
>
> Maybe Eric and Markus can suggest a better way to describe fields.

This isn't review (yet), just an attempt to advise more quickly on
general QAPI/QMP conventions.

Unit prefixes like Mebi- are nice for humans, because 1MiB is clearer
than 1048576B.

QMP is for machines.  We eschew unit prefixes and unit symbols there.
The unit is implied.  Unit prefixes only complicate things.  Machines
can deal with 1048576 easily.  Also dealing 1024Ki and 1Mi is additional
work.  We therefore use JSON numbers for byte counts, not strings with
units.

The general rule is "always use the plainest implied unit that would
do."  There are exceptions, mostly due to review failure.

Byte rates should be in bytes per second.

For time, we've made a godawful mess.  The plainest unit is clearly the
second.  We commonly need sub-second granularity, though.
Floating-point seconds are unpopular for some reason :)  Instead we use
milli-, micro-, and nanoseconds, and even (seconds, microseconds) pairs.

QAPI schema documentation describes both the generated C and the QMP
wire protocol.  It must be written with the implied unit.  If you send a
byte rate in bytes per second via QMP, that's what you document.  Even
if a human interface lets you specify the byte rate in MiB/s.

Does this make sense?

>> >>   @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
>> >>   @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
>> >>   @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
>> >>   @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
>> >> +@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{str},data-type=@var{str}[,latency=@var{lat}][,bandwidth=@var{bw}]  
>> >                                                                                ^^^                 ^^^
>> > Using the same 'str' for 2 different enums is confusing.
>> > Suggest for 1st use 'level' and for the second just 'type'
>> >   
>> Ok
>> 
>> >>   @findex -numa
>> >>   Define a NUMA node and assign RAM and VCPUs to it.
>> >>   Set the NUMA distance from a source node to a destination node.
>> >> +Set the ACPI Heterogeneous Memory Attributes for the given nodes.
>> >>   
>> >>   Legacy VCPU assignment uses @samp{cpus} option where
>> >>   @var{firstcpu} and @var{lastcpu} are CPU indexes. Each
>> >> @@ -256,6 +259,50 @@ specified resources, it just assigns existing resources to NUMA
>> >>   nodes. This means that one still has to use the @option{-m},
>> >>   @option{-smp} options to allocate RAM and VCPUs respectively.
>> >>   
>> >> +Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information
>> >> +between initiator and target NUMA nodes in ACPI Heterogeneous Attribute Memory Table (HMAT).
>> >> +Initiator NUMA node can create memory requests, usually including one or more processors.  
>> > s/including/it has/
>> >   
>> >> +Target NUMA node contains addressable memory.
>> >> +
>> >> +In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{str} of 'hierarchy'
>> >> +is the memory hierarchy of the target NUMA node: if @var{str} is 'memory', the structure
>> >> +represents the memory performance; if @var{str} is 'first-level|second-level|third-level',
>> >> +this structure represents aggregated performance of memory side caches for each domain.
>> >> +@var{str} of 'data-type' is type of data represented by this structure instance:
>> >> +if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency(nanoseconds)  
>> > is nanoseconds is right here? Looking at previous patches default value of suffix-less
>> > should be picoseconds. I'd just drop '(nanoseconds)'. User will use appropriate suffix.
>> >   
>> OK, I will drop it.
>> >> +or 'access|read|write' bandwidth(MB/s) of the target memory; if 'hierarchy' is  
>> > ditto (MB/s), probably should be Bytes/s for default suffix-less value
>> > (well, I'm not sure how to express it better)
>> >   
>> 
>> But last version, we let !QEMU_IS_ALIGNED(node->bandwidth, MiB) as error.
> it's alignment requirement and it doesn't mean that value could not be
> represented in bytes/s.
> And it is bytes/s if suffix isn't used.
>
> As for alignment and precision of values you probably need to document
> expectations here as well.
>
>> >> +'first-level|second-level|third-level', 'data-type' is 'access|read|write' hit latency
>> >> +or 'access|read|write' hit bandwidth of the target memory side cache.
>> >> +
>> >> +@var{lat} of 'latency' is latency value, the possible value and units are
>> >> +NUM[ps|ns|us] (picosecond|nanosecond|microsecond), the recommended unit is 'ns'. @var{bw}
>> >> +is bandwidth value, the possible value and units are NUM[M|G|T], mean that  
>> >   
>> >> +the bandwidth value are NUM MB/s, GB/s or TB/s. Note that max NUM is 65534,
>> >> +if NUM is 0, means the corresponding latency or bandwidth information is not provided.
>> >> +And if input numbers without any unit, the latency unit will be 'ps' and the bandwidth
>> >> +will be MB/s.  
>> >   1st: above is applicable to both bw and lat values and should be documented as such
>> >   2nd: 'max NUM is 65534' when different suffixes is fleeting target,
>> >        spec says that entry with 0xFFFF is unreachable, so how about documenting
>> >        unreachable value as 0xFFFFFFFFFFFFFFFF (then CLI parsing code will
>> >        exclude it from range detection and acpi table building code translate it
>> >        to internal 0xFFFF it could fit into the tables)
>> >   
>> 
>> If we input 0xFFFFFFFFFFFFFFFF, qemu will raise error that parameter 
>> expect a size value.
>
> opts_type_size() can't handle values >= 0xfffffffffffffc00
>
> commit f46bfdbfc8f (util/cutils: Change qemu_strtosz*() from int64_t to uint64_t)
>
> so behavior would change depending on if the value is parsed by CLI (size) or QMP (unit64) parsers.

Do these values matter?  Is there a use case for passing
18446744073709550593 via CLI?

> we can cannibalize 0x0 as the unreachable value and an absent bandwidth/lat option
> for not specified case.
> It would be conflicting with matrix [1] values in spec, but CLI/QMP deals with
> absolute values which are later processed into HMAT substructure.
>
> Markus,
> Can we make opts_type_size() handle full range of uint64_t?

Maybe.

> 1)
> ACPI 6.3 spec:
> 5.2.27.4 System Locality Latency and Bandwidth Information Structure
Eduardo Habkost Oct. 25, 2019, 8:51 p.m. UTC | #7
On Fri, Oct 25, 2019 at 09:44:50PM +0200, Markus Armbruster wrote:
> Igor Mammedov <imammedo@redhat.com> writes:
> 
> > On Fri, 25 Oct 2019 14:33:53 +0800
> > Tao Xu <tao3.xu@intel.com> wrote:
> >
> >> On 10/23/2019 11:28 PM, Igor Mammedov wrote:
> >> > On Sun, 20 Oct 2019 19:11:19 +0800
> >> > Tao Xu <tao3.xu@intel.com> wrote:  
> >> [...]
> >> >> +#
> >> >> +# @access-bandwidth: access bandwidth (MB/s)
> >> >> +#
> >> >> +# @read-bandwidth: read bandwidth (MB/s)
> >> >> +#
> >> >> +# @write-bandwidth: write bandwidth (MB/s)  
> >> > I think units here are not appropriate, values stored in fields are
> >> > minimal base units only and nothing else (i.e. ps and B/s)
> >> >   
> >> Eric suggest me to drop picoseconds. So here I can use ns. For 
> >> bandwidth, if we use B/s here, does it let user or developer to 
> >> misunderstand that the smallest unit is B/s ?
> >
> > It's not nanoseconds or MB/s stored in theses fields, isn't it?
> > I'd specify units in which value is stored or drop units altogether.
> >
> > Maybe Eric and Markus can suggest a better way to describe fields.
> 
> This isn't review (yet), just an attempt to advise more quickly on
> general QAPI/QMP conventions.
> 
> Unit prefixes like Mebi- are nice for humans, because 1MiB is clearer
> than 1048576B.
> 
> QMP is for machines.  We eschew unit prefixes and unit symbols there.
> The unit is implied.  Unit prefixes only complicate things.  Machines
> can deal with 1048576 easily.  Also dealing 1024Ki and 1Mi is additional
> work.  We therefore use JSON numbers for byte counts, not strings with
> units.
> 
> The general rule is "always use the plainest implied unit that would
> do."  There are exceptions, mostly due to review failure.
> 
> Byte rates should be in bytes per second.
> 
> For time, we've made a godawful mess.  The plainest unit is clearly the
> second.  We commonly need sub-second granularity, though.
> Floating-point seconds are unpopular for some reason :)  Instead we use
> milli-, micro-, and nanoseconds, and even (seconds, microseconds) pairs.
> 
> QAPI schema documentation describes both the generated C and the QMP
> wire protocol.  It must be written with the implied unit.  If you send a
> byte rate in bytes per second via QMP, that's what you document.  Even
> if a human interface lets you specify the byte rate in MiB/s.
> 
> Does this make sense?

This makes sense for the bandwidth fields.  We still need to
decide how to represent the latency field, though.

Seconds would be the obvious choice, if only it didn't risk
silently losing precision when converting numbers to floats.
Tao Xu Oct. 28, 2019, 2:05 a.m. UTC | #8
On 10/26/2019 4:51 AM, Eduardo Habkost wrote:
> On Fri, Oct 25, 2019 at 09:44:50PM +0200, Markus Armbruster wrote:
>> Igor Mammedov <imammedo@redhat.com> writes:
>>
>>> On Fri, 25 Oct 2019 14:33:53 +0800
>>> Tao Xu <tao3.xu@intel.com> wrote:
>>>
>>>> On 10/23/2019 11:28 PM, Igor Mammedov wrote:
>>>>> On Sun, 20 Oct 2019 19:11:19 +0800
>>>>> Tao Xu <tao3.xu@intel.com> wrote:
>>>> [...]
>>>>>> +#
>>>>>> +# @access-bandwidth: access bandwidth (MB/s)
>>>>>> +#
>>>>>> +# @read-bandwidth: read bandwidth (MB/s)
>>>>>> +#
>>>>>> +# @write-bandwidth: write bandwidth (MB/s)
>>>>> I think units here are not appropriate, values stored in fields are
>>>>> minimal base units only and nothing else (i.e. ps and B/s)
>>>>>    
>>>> Eric suggest me to drop picoseconds. So here I can use ns. For
>>>> bandwidth, if we use B/s here, does it let user or developer to
>>>> misunderstand that the smallest unit is B/s ?
>>>
>>> It's not nanoseconds or MB/s stored in theses fields, isn't it?
>>> I'd specify units in which value is stored or drop units altogether.
>>>
>>> Maybe Eric and Markus can suggest a better way to describe fields.
>>
>> This isn't review (yet), just an attempt to advise more quickly on
>> general QAPI/QMP conventions.
>>
>> Unit prefixes like Mebi- are nice for humans, because 1MiB is clearer
>> than 1048576B.
>>
>> QMP is for machines.  We eschew unit prefixes and unit symbols there.
>> The unit is implied.  Unit prefixes only complicate things.  Machines
>> can deal with 1048576 easily.  Also dealing 1024Ki and 1Mi is additional
>> work.  We therefore use JSON numbers for byte counts, not strings with
>> units.
>>
>> The general rule is "always use the plainest implied unit that would
>> do."  There are exceptions, mostly due to review failure.
>>
>> Byte rates should be in bytes per second.
>>
>> For time, we've made a godawful mess.  The plainest unit is clearly the
>> second.  We commonly need sub-second granularity, though.
>> Floating-point seconds are unpopular for some reason :)  Instead we use
>> milli-, micro-, and nanoseconds, and even (seconds, microseconds) pairs.
>>
>> QAPI schema documentation describes both the generated C and the QMP
>> wire protocol.  It must be written with the implied unit.  If you send a
>> byte rate in bytes per second via QMP, that's what you document.  Even
>> if a human interface lets you specify the byte rate in MiB/s.
>>
>> Does this make sense?
> 
> This makes sense for the bandwidth fields.  We still need to
> decide how to represent the latency field, though.
> 
> Seconds would be the obvious choice, if only it didn't risk
> silently losing precision when converting numbers to floats.
> 
Got it. I will use bytes per second for bandwidth here. Usually we use 
nanosecond for memory latency, so if we use second for latency, it may 
lose precision. So can I use nanosecond here, because we now use 
nanosecond as smallest time unit.
Markus Armbruster Oct. 28, 2019, 5:46 a.m. UTC | #9
Tao Xu <tao3.xu@intel.com> writes:

> Got it. I will use bytes per second for bandwidth here. Usually we use
> nanosecond for memory latency, so if we use second for latency, it may
> lose precision. So can I use nanosecond here, because we now use
> nanosecond as smallest time unit.

Sounds fair, go ahead.
Tao Xu Oct. 28, 2019, 7:25 a.m. UTC | #10
On 10/26/2019 3:44 AM, Markus Armbruster wrote:
> Igor Mammedov <imammedo@redhat.com> writes:
> 
[...]
>>>>    1st: above is applicable to both bw and lat values and should be documented as such
>>>>    2nd: 'max NUM is 65534' when different suffixes is fleeting target,
>>>>         spec says that entry with 0xFFFF is unreachable, so how about documenting
>>>>         unreachable value as 0xFFFFFFFFFFFFFFFF (then CLI parsing code will
>>>>         exclude it from range detection and acpi table building code translate it
>>>>         to internal 0xFFFF it could fit into the tables)
>>>>    
>>>
>>> If we input 0xFFFFFFFFFFFFFFFF, qemu will raise error that parameter
>>> expect a size value.
>>
>> opts_type_size() can't handle values >= 0xfffffffffffffc00
>>
>> commit f46bfdbfc8f (util/cutils: Change qemu_strtosz*() from int64_t to uint64_t)
>>
>> so behavior would change depending on if the value is parsed by CLI (size) or QMP (unit64) parsers.
> 
> Do these values matter?  Is there a use case for passing
> 18446744073709550593 via CLI?
> 

There is a case that we need to input 0xFFFF as ACPI HMAT entry (an 
unreachable case). But I am thinking drop this case because Linux kernel 
HMAT as blow:

          /*
          * Check for invalid and overflow values
          */
         if (entry == 0xffff || !entry)
                 return 0;
         else if (base > (UINT_MAX / (entry)))
                 return 0;

So 0xFFFF and 0 are the same.

>> we can cannibalize 0x0 as the unreachable value and an absent bandwidth/lat option
>> for not specified case.
>> It would be conflicting with matrix [1] values in spec, but CLI/QMP deals with
>> absolute values which are later processed into HMAT substructure.
>>
>> Markus,
>> Can we make opts_type_size() handle full range of uint64_t?
> 
> Maybe.
>

Patch
diff mbox series

diff --git a/hw/core/numa.c b/hw/core/numa.c
index eba66ab768..3cf77f6ac9 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -23,6 +23,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu/units.h"
 #include "sysemu/hostmem.h"
 #include "sysemu/numa.h"
 #include "sysemu/sysemu.h"
@@ -198,6 +199,119 @@  void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp)
     ms->numa_state->have_numa_distance = true;
 }
 
+void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
+                        Error **errp)
+{
+    int first_bit, last_bit;
+    uint64_t temp_latency;
+    NodeInfo *numa_info = numa_state->nodes;
+    HMAT_LB_Info *hmat_lb =
+        numa_state->hmat_lb[node->hierarchy][node->data_type];
+    HMAT_LB_Data lb_data;
+
+    /* Error checking */
+    if (node->initiator >= numa_state->num_nodes) {
+        error_setg(errp, "Invalid initiator=%d, it should be less than %d.",
+                   node->initiator, numa_state->num_nodes);
+        return;
+    }
+    if (node->target >= numa_state->num_nodes) {
+        error_setg(errp, "Invalid target=%d, it should be less than %d.",
+                   node->target, numa_state->num_nodes);
+        return;
+    }
+    if (!numa_info[node->initiator].has_cpu) {
+        error_setg(errp, "Invalid initiator=%d, it isn't an "
+                   "initiator proximity domain.", node->initiator);
+        return;
+    }
+    if (!numa_info[node->target].present) {
+        error_setg(errp, "Invalid target=%d, it hasn't a valid NUMA node.",
+                   node->target);
+        return;
+    }
+
+    if (!hmat_lb) {
+        hmat_lb = g_malloc0(sizeof(*hmat_lb));
+        numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb;
+        hmat_lb->latency = g_array_new(false, true, sizeof(HMAT_LB_Data));
+        hmat_lb->bandwidth = g_array_new(false, true, sizeof(HMAT_LB_Data));
+    }
+    hmat_lb->hierarchy = node->hierarchy;
+    hmat_lb->data_type = node->data_type;
+    lb_data.initiator = node->initiator;
+    lb_data.target = node->target;
+
+    /* Input latency data */
+    if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) {
+        if (!node->has_latency) {
+            error_setg(errp, "Missing 'latency' option.");
+            return;
+        }
+        if (node->has_bandwidth) {
+            error_setg(errp, "Invalid option 'bandwidth' since "
+                       "the data type is latency.");
+            return;
+        }
+
+        temp_latency = node->latency;
+        hmat_lb->base_latency = 1;
+        while (QEMU_IS_ALIGNED(temp_latency, 10)) {
+            temp_latency /= 10;
+            hmat_lb->base_latency *= 10;
+        }
+
+        if (temp_latency >= UINT64_MAX) {
+            error_setg(errp, "Latency %" PRIu64 " between initiator=%d and "
+                       "target=%d should not differ from previously entered "
+                       "values on more than %d.", node->latency,
+                       node->initiator, node->target, UINT16_MAX - 1);
+            return;
+        }
+        if (temp_latency > hmat_lb->range_left_la) {
+            hmat_lb->range_left_la = temp_latency;
+        }
+
+        lb_data.rawdata = node->latency;
+        g_array_append_val(hmat_lb->latency, lb_data);
+    }
+
+    /* Input bandwidth data */
+    if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) {
+        if (!node->has_bandwidth) {
+            error_setg(errp, "Missing 'bandwidth' option.");
+            return;
+        }
+        if (node->has_latency) {
+            error_setg(errp, "Invalid option 'latency' since "
+                       "the data type is bandwidth.");
+            return;
+        }
+        if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) {
+            error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and "
+                       "target=%d should be 1MB aligned.", node->bandwidth,
+                       node->initiator, node->target);
+            return;
+        }
+
+        hmat_lb->range_bitmap_bw |= node->bandwidth;
+
+        first_bit = __builtin_ffs(hmat_lb->range_bitmap_bw);
+        last_bit = __builtin_ctz(hmat_lb->range_bitmap_bw);
+        if ((last_bit - first_bit) > UINT16_BITS) {
+            error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and "
+                       "target=%d should not differ from previously entered "
+                       "values on more than %d.", node->bandwidth,
+                       node->initiator, node->target, UINT16_MAX - 1);
+            return;
+        }
+
+        hmat_lb->base_bandwidth = UINT64_C(1) << (first_bit - 1);
+        lb_data.rawdata = node->bandwidth;
+        g_array_append_val(hmat_lb->bandwidth, lb_data);
+    }
+}
+
 void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
 {
     Error *err = NULL;
@@ -236,6 +350,19 @@  void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
         machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu),
                                   &err);
         break;
+    case NUMA_OPTIONS_TYPE_HMAT_LB:
+        if (!ms->numa_state->hmat_enabled) {
+            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
+                       "(HMAT) is disabled, enable it with -machine hmat=on "
+                       "before using any of hmat specific options.");
+            return;
+        }
+
+        parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err);
+        if (err) {
+            goto end;
+        }
+        break;
     default:
         abort();
     }
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 788cbec7a2..b45afcb29e 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -14,6 +14,29 @@  struct CPUArchId;
 #define NUMA_DISTANCE_MAX         254
 #define NUMA_DISTANCE_UNREACHABLE 255
 
+/* the value of AcpiHmatLBInfo flags */
+enum {
+    HMAT_LB_MEM_MEMORY           = 0,
+    HMAT_LB_MEM_CACHE_1ST_LEVEL  = 1,
+    HMAT_LB_MEM_CACHE_2ND_LEVEL  = 2,
+    HMAT_LB_MEM_CACHE_3RD_LEVEL  = 3,
+};
+
+/* the value of AcpiHmatLBInfo data type */
+enum {
+    HMAT_LB_DATA_ACCESS_LATENCY   = 0,
+    HMAT_LB_DATA_READ_LATENCY     = 1,
+    HMAT_LB_DATA_WRITE_LATENCY    = 2,
+    HMAT_LB_DATA_ACCESS_BANDWIDTH = 3,
+    HMAT_LB_DATA_READ_BANDWIDTH   = 4,
+    HMAT_LB_DATA_WRITE_BANDWIDTH  = 5,
+};
+
+#define UINT16_BITS       16
+
+#define HMAT_LB_LEVELS    (HMAT_LB_MEM_CACHE_3RD_LEVEL + 1)
+#define HMAT_LB_TYPES     (HMAT_LB_DATA_WRITE_BANDWIDTH + 1)
+
 struct NodeInfo {
     uint64_t node_mem;
     struct HostMemoryBackend *node_memdev;
@@ -28,6 +51,46 @@  struct NumaNodeMem {
     uint64_t node_plugged_mem;
 };
 
+struct HMAT_LB_Data {
+    uint8_t     initiator;
+    uint8_t     target;
+    uint64_t    rawdata;
+};
+typedef struct HMAT_LB_Data HMAT_LB_Data;
+
+struct HMAT_LB_Info {
+    /* Indicates it's memory or the specified level memory side cache. */
+    uint8_t     hierarchy;
+
+    /* Present the type of data, access/read/write latency or bandwidth. */
+    uint8_t     data_type;
+
+    /* The left range of latency for calculating common latency base */
+    uint64_t    range_left_la;
+
+    /* The range bitmap of bandwidth for calculating common bandwidth base */
+    uint64_t    range_bitmap_bw;
+
+    /* The common base unit for latencies */
+    uint64_t    base_latency;
+
+    /* The common base unit for bandwidths */
+    uint64_t    base_bandwidth;
+
+    /* Array to store the compressed latencies */
+    uint16_t    *entry_latency;
+
+    /* Array to store the compressed latencies */
+    uint16_t    *entry_bandwidth;
+
+    /* Array to store the latencies */
+    GArray      *latency;
+
+    /* Array to store the bandwidthes */
+    GArray      *bandwidth;
+};
+typedef struct HMAT_LB_Info HMAT_LB_Info;
+
 struct NumaState {
     /* Number of NUMA nodes */
     int num_nodes;
@@ -40,11 +103,16 @@  struct NumaState {
 
     /* NUMA nodes information */
     NodeInfo nodes[MAX_NODES];
+
+    /* NUMA nodes HMAT Locality Latency and Bandwidth Information */
+    HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES];
 };
 typedef struct NumaState NumaState;
 
 void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp);
 void parse_numa_opts(MachineState *ms);
+void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
+                        Error **errp);
 void numa_complete_configuration(MachineState *ms);
 void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms);
 extern QemuOptsList qemu_numa_opts;
diff --git a/qapi/machine.json b/qapi/machine.json
index f1b07b3486..9ca008810b 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -426,10 +426,12 @@ 
 #
 # @cpu: property based CPU(s) to node mapping (Since: 2.10)
 #
+# @hmat-lb: memory latency and bandwidth information (Since: 4.2)
+#
 # Since: 2.1
 ##
 { 'enum': 'NumaOptionsType',
-  'data': [ 'node', 'dist', 'cpu' ] }
+  'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] }
 
 ##
 # @NumaOptions:
@@ -444,7 +446,8 @@ 
   'data': {
     'node': 'NumaNodeOptions',
     'dist': 'NumaDistOptions',
-    'cpu': 'NumaCpuOptions' }}
+    'cpu': 'NumaCpuOptions',
+    'hmat-lb': 'NumaHmatLBOptions' }}
 
 ##
 # @NumaNodeOptions:
@@ -557,6 +560,94 @@ 
    'base': 'CpuInstanceProperties',
    'data' : {} }
 
+##
+# @HmatLBMemoryHierarchy:
+#
+# The memory hierarchy in the System Locality Latency
+# and Bandwidth Information Structure of HMAT (Heterogeneous
+# Memory Attribute Table)
+#
+# For more information of @HmatLBMemoryHierarchy see
+# the chapter 5.2.27.4: Table 5-142: Field "Flags" of ACPI 6.3 spec.
+#
+# @memory: the structure represents the memory performance
+#
+# @first-level: first level memory of memory side cached memory
+#
+# @second-level: second level memory of memory side cached memory
+#
+# @third-level: third level memory of memory side cached memory
+#
+# Since: 4.2
+##
+{ 'enum': 'HmatLBMemoryHierarchy',
+  'data': [ 'memory', 'first-level', 'second-level', 'third-level' ] }
+
+##
+# @HmatLBDataType:
+#
+# Data type in the System Locality Latency
+# and Bandwidth Information Structure of HMAT (Heterogeneous
+# Memory Attribute Table)
+#
+# For more information of @HmatLBDataType see
+# the chapter 5.2.27.4: Table 5-142:  Field "Data Type" of ACPI 6.3 spec.
+#
+# @access-latency: access latency (nanoseconds)
+#
+# @read-latency: read latency (nanoseconds)
+#
+# @write-latency: write latency (nanoseconds)
+#
+# @access-bandwidth: access bandwidth (MB/s)
+#
+# @read-bandwidth: read bandwidth (MB/s)
+#
+# @write-bandwidth: write bandwidth (MB/s)
+#
+# Since: 4.2
+##
+{ 'enum': 'HmatLBDataType',
+  'data': [ 'access-latency', 'read-latency', 'write-latency',
+            'access-bandwidth', 'read-bandwidth', 'write-bandwidth' ] }
+
+##
+# @NumaHmatLBOptions:
+#
+# Set the system locality latency and bandwidth information
+# between Initiator and Target proximity Domains.
+#
+# For more information of @NumaHmatLBOptions see
+# the chapter 5.2.27.4: Table 5-142 of ACPI 6.3 spec.
+#
+# @initiator: the Initiator Proximity Domain.
+#
+# @target: the Target Proximity Domain.
+#
+# @hierarchy: the Memory Hierarchy. Indicates the performance
+#             of memory or side cache.
+#
+# @data-type: presents the type of data, access/read/write
+#             latency or hit latency.
+#
+# @latency: the value of latency from @initiator to @target proximity domain,
+#           the latency units are "ps(picosecond)", "ns(nanosecond)" or
+#           "us(microsecond)".
+#
+# @bandwidth: the value of bandwidth between @initiator and @target proximity
+#             domain, the bandwidth units are "MB(/s)","GB(/s)" or "TB(/s)".
+#
+# Since: 4.2
+##
+{ 'struct': 'NumaHmatLBOptions',
+    'data': {
+    'initiator': 'uint16',
+    'target': 'uint16',
+    'hierarchy': 'HmatLBMemoryHierarchy',
+    'data-type': 'HmatLBDataType',
+    '*latency': 'time',
+    '*bandwidth': 'size' }}
+
 ##
 # @HostMemPolicy:
 #
diff --git a/qemu-options.hx b/qemu-options.hx
index 1f96399521..de97939f9a 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -168,16 +168,19 @@  DEF("numa", HAS_ARG, QEMU_OPTION_numa,
     "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
     "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
     "-numa dist,src=source,dst=destination,val=distance\n"
-    "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n",
+    "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n"
+    "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n",
     QEMU_ARCH_ALL)
 STEXI
 @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
 @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
 @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
 @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
+@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{str},data-type=@var{str}[,latency=@var{lat}][,bandwidth=@var{bw}]
 @findex -numa
 Define a NUMA node and assign RAM and VCPUs to it.
 Set the NUMA distance from a source node to a destination node.
+Set the ACPI Heterogeneous Memory Attributes for the given nodes.
 
 Legacy VCPU assignment uses @samp{cpus} option where
 @var{firstcpu} and @var{lastcpu} are CPU indexes. Each
@@ -256,6 +259,50 @@  specified resources, it just assigns existing resources to NUMA
 nodes. This means that one still has to use the @option{-m},
 @option{-smp} options to allocate RAM and VCPUs respectively.
 
+Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information
+between initiator and target NUMA nodes in ACPI Heterogeneous Attribute Memory Table (HMAT).
+Initiator NUMA node can create memory requests, usually including one or more processors.
+Target NUMA node contains addressable memory.
+
+In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{str} of 'hierarchy'
+is the memory hierarchy of the target NUMA node: if @var{str} is 'memory', the structure
+represents the memory performance; if @var{str} is 'first-level|second-level|third-level',
+this structure represents aggregated performance of memory side caches for each domain.
+@var{str} of 'data-type' is type of data represented by this structure instance:
+if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency(nanoseconds)
+or 'access|read|write' bandwidth(MB/s) of the target memory; if 'hierarchy' is
+'first-level|second-level|third-level', 'data-type' is 'access|read|write' hit latency
+or 'access|read|write' hit bandwidth of the target memory side cache.
+
+@var{lat} of 'latency' is latency value, the possible value and units are
+NUM[ps|ns|us] (picosecond|nanosecond|microsecond), the recommended unit is 'ns'. @var{bw}
+is bandwidth value, the possible value and units are NUM[M|G|T], mean that
+the bandwidth value are NUM MB/s, GB/s or TB/s. Note that max NUM is 65534,
+if NUM is 0, means the corresponding latency or bandwidth information is not provided.
+And if input numbers without any unit, the latency unit will be 'ps' and the bandwidth
+will be MB/s.
+
+For example, the following option assigns NUMA node 0 and 1. Node 0 has 2 cpus and
+a ram, node 1 has only a ram. The processors in node 0 access memory in node
+0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s;
+The processors in NUMA node 0 access memory in NUMA node 1 with access-latency 10
+nanoseconds, access-bandwidth is 100 MB/s.
+@example
+-machine hmat=on \
+-m 2G \
+-object memory-backend-ram,size=1G,id=m0 \
+-object memory-backend-ram,size=1G,id=m1 \
+-smp 2 \
+-numa node,nodeid=0,memdev=m0 \
+-numa node,nodeid=1,memdev=m1,initiator=0 \
+-numa cpu,node-id=0,socket-id=0 \
+-numa cpu,node-id=0,socket-id=1 \
+-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5ns \
+-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \
+-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10ns \
+-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M
+@end example
+
 ETEXI
 
 DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd,