Patchwork [3/7] NUMA: parse guest numa nodes memory policy

login
register
mail settings
Submitter Wanlong Gao
Date June 18, 2013, 8:09 a.m.
Message ID <1371542991-15911-4-git-send-email-gaowanlong@cn.fujitsu.com>
Download mbox | patch
Permalink /patch/252128/
State New
Headers show

Comments

Wanlong Gao - June 18, 2013, 8:09 a.m.
The memory policy setting format is like:
{membind|interleave|preferred}=[+|!]{all|N-N}
And we are adding this setting as a suboption of "-numa",
the memory policy then can be set like following:
 -numa node,nodeid=0,mem=1024,cpus=0,membind=0-1
 -numa node,nodeid=1,mem=1024,cpus=1,interleave=1

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 include/sysemu/sysemu.h |  8 ++++++
 vl.c                    | 76 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)
Paolo Bonzini - June 18, 2013, 9:20 a.m.
Il 18/06/2013 10:09, Wanlong Gao ha scritto:
> +static unsigned int numa_node_parse_mpol(const char *str, unsigned long *bm)
> +{
> +    unsigned long long value, endvalue;
> +    char *endptr;
> +    unsigned int flags = 0;
> +
> +    if (str[0] == '!') {
> +        flags |= 2;

clear = true;

> +        bitmap_fill(bm, MAX_CPUMASK_BITS);
> +        str++;
> +    }
> +    if (str[0] == '+') {
> +        flags |= 1;

flags = NODE_HOST_RELATIVE

> +        str++;
> +    }
> +
> +    if (!strcmp(str, "all")) {
> +        bitmap_fill(bm, MAX_CPUMASK_BITS);
> +        return flags;
> +    }
> +
> +    if (parse_uint(str, &value, &endptr, 10) < 0)
> +        goto error;
> +    if (*endptr == '-') {
> +        if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) {
> +            goto error;
> +        }
> +    } else if (*endptr == '\0') {
> +        endvalue = value;
> +    } else {
> +        goto error;
> +    }
> +
> +    if (endvalue >= MAX_CPUMASK_BITS) {
> +        endvalue = MAX_CPUMASK_BITS - 1;
> +        fprintf(stderr,
> +            "qemu: NUMA: A max of %d host nodes are supported\n",
> +             MAX_CPUMASK_BITS);
> +    }
> +
> +    if (endvalue < value) {
> +        goto error;
> +    }
> +
> +    if (flags & 2)

if (clear)

> +        bitmap_clear(bm, value, endvalue - value + 1);
> +    else
> +        bitmap_set(bm, value, endvalue - value + 1);
> +
> +    return flags;
> +
> +error:
> +    fprintf(stderr, "qemu: Invalid host NUMA nodes range: %s\n", str);

Please change the functions (numa_add and numa_node_parse_mpol) to
accept an Error *.  This will make it much easier to reuse them for e.g.
memory hotplug in the future.

> +    return 4;

return -EINVAL;

> +}

> +        if (get_param_value(option, 128, "interleave", optarg) != 0)
> +            numa_info[nodenr].flags |= NODE_HOST_INTERLEAVE;
> +        else if (get_param_value(option, 128, "preferred", optarg) != 0)
> +            numa_info[nodenr].flags |= NODE_HOST_PREFERRED;
> +        else if (get_param_value(option, 128, "membind", optarg) != 0)
> +            numa_info[nodenr].flags |= NODE_HOST_BIND;

You're not handling the case where someone specifies more than one option.

What about:

   policy={interleave,preferred,bind},mem-hostnode=0

?

Also, please use QemuOpts instead of yet another homegrown parser.
Eduardo, I think you had the most recent attempt to convert -numa to
QemuOpts?

> +        if (option[0] != 0) {
> +            ret = numa_node_parse_mpol(option, numa_info[nodenr].host_mem);
> +            if (ret == 4) {

if (ret < 0)

> +                exit(1);
> +            } else if (ret & 1) {
> +                numa_info[nodenr].flags |= NODE_HOST_RELATIVE;

else {
    numa_info[nodenr].flags |= ret;
}

> +            }
> +        }
> +

Paolo
Wanlong Gao - June 18, 2013, 9:54 a.m.
On 06/18/2013 05:20 PM, Paolo Bonzini wrote:
> Il 18/06/2013 10:09, Wanlong Gao ha scritto:
>> +static unsigned int numa_node_parse_mpol(const char *str, unsigned long *bm)
>> +{
>> +    unsigned long long value, endvalue;
>> +    char *endptr;
>> +    unsigned int flags = 0;
>> +
>> +    if (str[0] == '!') {
>> +        flags |= 2;
> 
> clear = true;
> 
>> +        bitmap_fill(bm, MAX_CPUMASK_BITS);
>> +        str++;
>> +    }
>> +    if (str[0] == '+') {
>> +        flags |= 1;
> 
> flags = NODE_HOST_RELATIVE
> 
>> +        str++;
>> +    }
>> +
>> +    if (!strcmp(str, "all")) {
>> +        bitmap_fill(bm, MAX_CPUMASK_BITS);
>> +        return flags;
>> +    }
>> +
>> +    if (parse_uint(str, &value, &endptr, 10) < 0)
>> +        goto error;
>> +    if (*endptr == '-') {
>> +        if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) {
>> +            goto error;
>> +        }
>> +    } else if (*endptr == '\0') {
>> +        endvalue = value;
>> +    } else {
>> +        goto error;
>> +    }
>> +
>> +    if (endvalue >= MAX_CPUMASK_BITS) {
>> +        endvalue = MAX_CPUMASK_BITS - 1;
>> +        fprintf(stderr,
>> +            "qemu: NUMA: A max of %d host nodes are supported\n",
>> +             MAX_CPUMASK_BITS);
>> +    }
>> +
>> +    if (endvalue < value) {
>> +        goto error;
>> +    }
>> +
>> +    if (flags & 2)
> 
> if (clear)
> 
>> +        bitmap_clear(bm, value, endvalue - value + 1);
>> +    else
>> +        bitmap_set(bm, value, endvalue - value + 1);
>> +
>> +    return flags;
>> +
>> +error:
>> +    fprintf(stderr, "qemu: Invalid host NUMA nodes range: %s\n", str);
> 
> Please change the functions (numa_add and numa_node_parse_mpol) to
> accept an Error *.  This will make it much easier to reuse them for e.g.
> memory hotplug in the future.

Got it, I'll try. Thank you.

> 
>> +    return 4;
> 
> return -EINVAL;
> 
>> +}
> 
>> +        if (get_param_value(option, 128, "interleave", optarg) != 0)
>> +            numa_info[nodenr].flags |= NODE_HOST_INTERLEAVE;
>> +        else if (get_param_value(option, 128, "preferred", optarg) != 0)
>> +            numa_info[nodenr].flags |= NODE_HOST_PREFERRED;
>> +        else if (get_param_value(option, 128, "membind", optarg) != 0)
>> +            numa_info[nodenr].flags |= NODE_HOST_BIND;
> 
> You're not handling the case where someone specifies more than one option.
> 
> What about:
> 
>    policy={interleave,preferred,bind},mem-hostnode=0
> 
> ?

OK, will follow this, thank you.

> 
> Also, please use QemuOpts instead of yet another homegrown parser.
> Eduardo, I think you had the most recent attempt to convert -numa to
> QemuOpts?

So, any patches I can based on or change it myself?  Eduardo?


Thanks,
Wanlong Gao

> 
>> +        if (option[0] != 0) {
>> +            ret = numa_node_parse_mpol(option, numa_info[nodenr].host_mem);
>> +            if (ret == 4) {
> 
> if (ret < 0)
> 
>> +                exit(1);
>> +            } else if (ret & 1) {
>> +                numa_info[nodenr].flags |= NODE_HOST_RELATIVE;
> 
> else {
>     numa_info[nodenr].flags |= ret;
> }
> 
>> +            }
>> +        }
>> +
> 
> Paolo
>
Eduardo Habkost - June 18, 2013, 7 p.m.
On Tue, Jun 18, 2013 at 11:20:37AM +0200, Paolo Bonzini wrote:
[...]
> Also, please use QemuOpts instead of yet another homegrown parser.
> Eduardo, I think you had the most recent attempt to convert -numa to
> QemuOpts?

I had one, but I believe it is more complex than it should have been. I
was creating a "numa-node" config section while keeping "-numa" just for
compatbility, but I don't think we really need to do that.

If you want to take a look, the old attemp is at:
  git://github.com/ehabkost/qemu-hacks.git work/old-numa-node-config-section-experiment
Bandan Das - June 18, 2013, 8:19 p.m.
Eduardo Habkost <ehabkost@redhat.com> writes:

> On Tue, Jun 18, 2013 at 11:20:37AM +0200, Paolo Bonzini wrote:
> [...]
>> Also, please use QemuOpts instead of yet another homegrown parser.
>> Eduardo, I think you had the most recent attempt to convert -numa to
>> QemuOpts?
>
> I had one, but I believe it is more complex than it should have been. I
> was creating a "numa-node" config section while keeping "-numa" just for
> compatbility, but I don't think we really need to do that.

Ah, I was working on an update to Eduardo's earlier proposals for multiple CPU ranges
and part of the change was to convert to QemuOpts. 

Probably needs more testing but posted it anyway since we are already discussing this :
[PATCH v3] vl.c: Support multiple CPU ranges on -numa option
(hasn't shown up in the archives yet)

Bandan

> If you want to take a look, the old attemp is at:
>   git://github.com/ehabkost/qemu-hacks.git work/old-numa-node-config-section-experiment
Wanlong Gao - June 19, 2013, 8:01 a.m.
On 06/19/2013 04:19 AM, Bandan Das wrote:
> Eduardo Habkost <ehabkost@redhat.com> writes:
> 
>> On Tue, Jun 18, 2013 at 11:20:37AM +0200, Paolo Bonzini wrote:
>> [...]
>>> Also, please use QemuOpts instead of yet another homegrown parser.
>>> Eduardo, I think you had the most recent attempt to convert -numa to
>>> QemuOpts?
>>
>> I had one, but I believe it is more complex than it should have been. I
>> was creating a "numa-node" config section while keeping "-numa" just for
>> compatbility, but I don't think we really need to do that.
> 
> Ah, I was working on an update to Eduardo's earlier proposals for multiple CPU ranges
> and part of the change was to convert to QemuOpts. 
> 
> Probably needs more testing but posted it anyway since we are already discussing this :
> [PATCH v3] vl.c: Support multiple CPU ranges on -numa option
> (hasn't shown up in the archives yet)

Here is the archive: http://thread.gmane.org/gmane.comp.emulators.qemu/217491

So, are you all ACK with this? And we are not considering compatible thing by using
"cpu" instead of "cpus" here?


Thanks,
Wanlong Gao

> 
> Bandan
> 
>> If you want to take a look, the old attemp is at:
>>   git://github.com/ehabkost/qemu-hacks.git work/old-numa-node-config-section-experiment
>
Paolo Bonzini - June 19, 2013, 5:39 p.m.
Il 19/06/2013 10:01, Wanlong Gao ha scritto:
> On 06/19/2013 04:19 AM, Bandan Das wrote:
>> Eduardo Habkost <ehabkost@redhat.com> writes:
>>
>>> On Tue, Jun 18, 2013 at 11:20:37AM +0200, Paolo Bonzini wrote:
>>> [...]
>>>> Also, please use QemuOpts instead of yet another homegrown parser.
>>>> Eduardo, I think you had the most recent attempt to convert -numa to
>>>> QemuOpts?
>>>
>>> I had one, but I believe it is more complex than it should have been. I
>>> was creating a "numa-node" config section while keeping "-numa" just for
>>> compatbility, but I don't think we really need to do that.
>>
>> Ah, I was working on an update to Eduardo's earlier proposals for multiple CPU ranges
>> and part of the change was to convert to QemuOpts. 
>>
>> Probably needs more testing but posted it anyway since we are already discussing this :
>> [PATCH v3] vl.c: Support multiple CPU ranges on -numa option
>> (hasn't shown up in the archives yet)
> 
> Here is the archive: http://thread.gmane.org/gmane.comp.emulators.qemu/217491
> 
> So, are you all ACK with this? And we are not considering compatible thing by using
> "cpu" instead of "cpus" here?

No; as Eduardo pointed out, the "cpus" must be kept.  But apart from
that, picking up Bandan's patch in v2 of this series should be fine.

Paolo
Wanlong Gao - June 20, 2013, 12:01 a.m.
On 06/20/2013 01:39 AM, Paolo Bonzini wrote:
> Il 19/06/2013 10:01, Wanlong Gao ha scritto:
>> On 06/19/2013 04:19 AM, Bandan Das wrote:
>>> Eduardo Habkost <ehabkost@redhat.com> writes:
>>>
>>>> On Tue, Jun 18, 2013 at 11:20:37AM +0200, Paolo Bonzini wrote:
>>>> [...]
>>>>> Also, please use QemuOpts instead of yet another homegrown parser.
>>>>> Eduardo, I think you had the most recent attempt to convert -numa to
>>>>> QemuOpts?
>>>>
>>>> I had one, but I believe it is more complex than it should have been. I
>>>> was creating a "numa-node" config section while keeping "-numa" just for
>>>> compatbility, but I don't think we really need to do that.
>>>
>>> Ah, I was working on an update to Eduardo's earlier proposals for multiple CPU ranges
>>> and part of the change was to convert to QemuOpts. 
>>>
>>> Probably needs more testing but posted it anyway since we are already discussing this :
>>> [PATCH v3] vl.c: Support multiple CPU ranges on -numa option
>>> (hasn't shown up in the archives yet)
>>
>> Here is the archive: http://thread.gmane.org/gmane.comp.emulators.qemu/217491
>>
>> So, are you all ACK with this? And we are not considering compatible thing by using
>> "cpu" instead of "cpus" here?
> 
> No; as Eduardo pointed out, the "cpus" must be kept.  But apart from
> that, picking up Bandan's patch in v2 of this series should be fine.

Got it, thank you.

Regards,
Wanlong Gao

> 
> Paolo
> 
>

Patch

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 70fd2ed..993b8e0 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -130,10 +130,18 @@  extern QEMUClock *rtc_clock;
 
 #define MAX_NODES 64
 #define MAX_CPUMASK_BITS 255
+#define NODE_HOST_NONE        0x00
+#define NODE_HOST_BIND        0x01
+#define NODE_HOST_INTERLEAVE  0x02
+#define NODE_HOST_PREFERRED   0x03
+#define NODE_HOST_POLICY_MASK 0x03
+#define NODE_HOST_RELATIVE    0x04
 extern int nb_numa_nodes;
 struct node_info {
     uint64_t node_mem;
     DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
+    DECLARE_BITMAP(host_mem, MAX_CPUMASK_BITS);
+    unsigned int flags;
 };
 extern struct node_info numa_info[MAX_NODES];
 
diff --git a/vl.c b/vl.c
index 42dec5e..ada9fb2 100644
--- a/vl.c
+++ b/vl.c
@@ -1348,11 +1348,68 @@  error:
     exit(1);
 }
 
+static unsigned int numa_node_parse_mpol(const char *str, unsigned long *bm)
+{
+    unsigned long long value, endvalue;
+    char *endptr;
+    unsigned int flags = 0;
+
+    if (str[0] == '!') {
+        flags |= 2;
+        bitmap_fill(bm, MAX_CPUMASK_BITS);
+        str++;
+    }
+    if (str[0] == '+') {
+        flags |= 1;
+        str++;
+    }
+
+    if (!strcmp(str, "all")) {
+        bitmap_fill(bm, MAX_CPUMASK_BITS);
+        return flags;
+    }
+
+    if (parse_uint(str, &value, &endptr, 10) < 0)
+        goto error;
+    if (*endptr == '-') {
+        if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) {
+            goto error;
+        }
+    } else if (*endptr == '\0') {
+        endvalue = value;
+    } else {
+        goto error;
+    }
+
+    if (endvalue >= MAX_CPUMASK_BITS) {
+        endvalue = MAX_CPUMASK_BITS - 1;
+        fprintf(stderr,
+            "qemu: NUMA: A max of %d host nodes are supported\n",
+             MAX_CPUMASK_BITS);
+    }
+
+    if (endvalue < value) {
+        goto error;
+    }
+
+    if (flags & 2)
+        bitmap_clear(bm, value, endvalue - value + 1);
+    else
+        bitmap_set(bm, value, endvalue - value + 1);
+
+    return flags;
+
+error:
+    fprintf(stderr, "qemu: Invalid host NUMA nodes range: %s\n", str);
+    return 4;
+}
+
 static void numa_add(const char *optarg)
 {
     char option[128];
     char *endptr;
     unsigned long long nodenr;
+    unsigned int ret;
 
     optarg = get_opt_name(option, 128, optarg, ',');
     if (*optarg == ',') {
@@ -1393,6 +1450,23 @@  static void numa_add(const char *optarg)
         if (get_param_value(option, 128, "cpus", optarg) != 0) {
             numa_node_parse_cpus(nodenr, option);
         }
+
+        option[0] = 0;
+        if (get_param_value(option, 128, "interleave", optarg) != 0)
+            numa_info[nodenr].flags |= NODE_HOST_INTERLEAVE;
+        else if (get_param_value(option, 128, "preferred", optarg) != 0)
+            numa_info[nodenr].flags |= NODE_HOST_PREFERRED;
+        else if (get_param_value(option, 128, "membind", optarg) != 0)
+            numa_info[nodenr].flags |= NODE_HOST_BIND;
+        if (option[0] != 0) {
+            ret = numa_node_parse_mpol(option, numa_info[nodenr].host_mem);
+            if (ret == 4) {
+                exit(1);
+            } else if (ret & 1) {
+                numa_info[nodenr].flags |= NODE_HOST_RELATIVE;
+            }
+        }
+
         nb_numa_nodes++;
     } else {
         fprintf(stderr, "Invalid -numa option: %s\n", option);
@@ -2922,6 +2996,8 @@  int main(int argc, char **argv, char **envp)
     for (i = 0; i < MAX_NODES; i++) {
         numa_info[i].node_mem = 0;
         bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
+        bitmap_zero(numa_info[i].host_mem, MAX_CPUMASK_BITS);
+        numa_info[i].flags = NODE_HOST_NONE;
     }
 
     nb_numa_nodes = 0;