diff mbox

[for-2.3] numa: pc: fix default VCPU to node mapping

Message ID 1426607318-22728-1-git-send-email-imammedo@redhat.com
State New
Headers show

Commit Message

Igor Mammedov March 17, 2015, 3:48 p.m. UTC
since commit
   dd0247e0 pc: acpi: mark all possible CPUs as enabled in SRAT
Linux kernel actually tries to use CPU to Node mapping from
QEMU provided SRAT table instead of discarding it, and that
in some cases breaks build_sched_domains() which expects
sane mapping where cores/threads belonging to the same socket
are on the same NUMA node.

With current default round-robin mapping of VCPUs to nodes
guest ends-up with cores/threads belonging to the same socket
being on different NUMA nodes.

For example with following CLI:
qemu-kvm -m 4G -smp 5,sockets=1,cores=4,threads=1,maxcpus=8 \
         -numa node,nodeid=0 -numa node,nodeid=1
2.6.32 based kernels will hang on boot due to incorrectly build
sched_group-s list in update_sd_lb_stats()
so comment in QEMU justifying dumb default mapping:
 "
  guest OSes must cope with this anyway, because there are BIOSes
  out there in real machines which also use this scheme.
 "
isn't really valid.

Replacing default mapping withi a manual, where VCPUs belonging to
the same socket are on the same NUMA node, fixes issue for
guests which can't handle nonsense topology i.e. cnaging CLI to:
  -numa node,nodeid=0,cpus=0-3 -numa node,nodeid=1,cpus=4-7

So instead of simply scattering VCPUs around nodes, map
the same socket VCPUs to the same NUMA node, which is what
guest would expect from a sane hardware/BIOS.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
 include/sysemu/cpus.h                 |  3 +++
 numa.c                                | 14 ++++++++++----
 stubs/Makefile.objs                   |  1 +
 stubs/qemu_cpu_socket_id_from_index.c |  6 ++++++
 target-i386/cpu.c                     | 11 +++++++++++
 5 files changed, 31 insertions(+), 4 deletions(-)
 create mode 100644 stubs/qemu_cpu_socket_id_from_index.c

Comments

Eduardo Habkost March 17, 2015, 4:42 p.m. UTC | #1
On Tue, Mar 17, 2015 at 03:48:38PM +0000, Igor Mammedov wrote:
> since commit
>    dd0247e0 pc: acpi: mark all possible CPUs as enabled in SRAT
> Linux kernel actually tries to use CPU to Node mapping from
> QEMU provided SRAT table instead of discarding it, and that
> in some cases breaks build_sched_domains() which expects
> sane mapping where cores/threads belonging to the same socket
> are on the same NUMA node.
> 
> With current default round-robin mapping of VCPUs to nodes
> guest ends-up with cores/threads belonging to the same socket
> being on different NUMA nodes.
> 
> For example with following CLI:
> qemu-kvm -m 4G -smp 5,sockets=1,cores=4,threads=1,maxcpus=8 \
>          -numa node,nodeid=0 -numa node,nodeid=1
> 2.6.32 based kernels will hang on boot due to incorrectly build
> sched_group-s list in update_sd_lb_stats()
> so comment in QEMU justifying dumb default mapping:
>  "
>   guest OSes must cope with this anyway, because there are BIOSes
>   out there in real machines which also use this scheme.
>  "
> isn't really valid.
> 
> Replacing default mapping withi a manual, where VCPUs belonging to
> the same socket are on the same NUMA node, fixes issue for
> guests which can't handle nonsense topology i.e. cnaging CLI to:
>   -numa node,nodeid=0,cpus=0-3 -numa node,nodeid=1,cpus=4-7
> 
> So instead of simply scattering VCPUs around nodes, map
> the same socket VCPUs to the same NUMA node, which is what
> guest would expect from a sane hardware/BIOS.
> 
> Signed-off-by: Igor Mammedov <imammedo@redhat.com>

I believe the proposed behavior is much better. But if we are going to
break compatibility, shouldn't we at least do that before the first -rc
so we get feedback in case it break existing configurations?

About qemu_cpu_socket_id_from_index(): all qemu-system-* binaries have
smp_cores and smp_threads available (even if machines ignore it), but
the default stub can return values that are larger than the number of
sockets if smp_cores*smp_threads > 1, which would be obviously
incorrect. Isn't it easier to simply make
"cpu_index/(smp_cores*smp_sockets)" be the default cpu_index->socket
mapping function, and allow machine-specific (not arch-specific)
overrides if necessary?


> ---
>  include/sysemu/cpus.h                 |  3 +++
>  numa.c                                | 14 ++++++++++----
>  stubs/Makefile.objs                   |  1 +
>  stubs/qemu_cpu_socket_id_from_index.c |  6 ++++++
>  target-i386/cpu.c                     | 11 +++++++++++
>  5 files changed, 31 insertions(+), 4 deletions(-)
>  create mode 100644 stubs/qemu_cpu_socket_id_from_index.c
> 
> diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h
> index 3f162a9..aacabcb 100644
> --- a/include/sysemu/cpus.h
> +++ b/include/sysemu/cpus.h
> @@ -1,5 +1,6 @@
>  #ifndef QEMU_CPUS_H
>  #define QEMU_CPUS_H
> +#include "qemu-common.h"
>  
>  /* cpus.c */
>  void qemu_init_cpu_loop(void);
> @@ -18,6 +19,8 @@ void qtest_clock_warp(int64_t dest);
>  /* vl.c */
>  extern int smp_cores;
>  extern int smp_threads;
> +
> +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index);
>  #else
>  /* *-user doesn't have configurable SMP topology */
>  #define smp_cores   1
> diff --git a/numa.c b/numa.c
> index ffbec68..5297749 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -26,6 +26,7 @@
>  #include "exec/cpu-common.h"
>  #include "qemu/bitmap.h"
>  #include "qom/cpu.h"
> +#include "sysemu/cpus.h"
>  #include "qemu/error-report.h"
>  #include "include/exec/cpu-common.h" /* for RAM_ADDR_FMT */
>  #include "qapi-visit.h"
> @@ -233,13 +234,18 @@ void parse_numa_opts(void)
>                  break;
>              }
>          }
> -        /* assigning the VCPUs round-robin is easier to implement, guest OSes
> -         * must cope with this anyway, because there are BIOSes out there in
> -         * real machines which also use this scheme.
> +        /* Assign VCPUs from the same socket to the same node.
> +         * Since mapping is arch dependent, target that care about
> +         * correct mapping of VCPUs to node should implement
> +         * qemu_cpu_socket_id_from_index() function that maps cpu_index to
> +         * a socket #, for all other cases legacy round-robin mode
> +         * will be used.
>           */
>          if (i == nb_numa_nodes) {
>              for (i = 0; i < max_cpus; i++) {
> -                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> +                unsigned socket_id = qemu_cpu_socket_id_from_index(i);
> +
> +                set_bit(i, numa_info[socket_id % nb_numa_nodes].node_cpu);
>              }
>          }
>      }
> diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
> index 8beff4c..86b8060 100644
> --- a/stubs/Makefile.objs
> +++ b/stubs/Makefile.objs
> @@ -39,3 +39,4 @@ stub-obj-$(CONFIG_WIN32) += fd-register.o
>  stub-obj-y += cpus.o
>  stub-obj-y += kvm.o
>  stub-obj-y += qmp_pc_dimm_device_list.o
> +stub-obj-y += qemu_cpu_socket_id_from_index.o
> diff --git a/stubs/qemu_cpu_socket_id_from_index.c b/stubs/qemu_cpu_socket_id_from_index.c
> new file mode 100644
> index 0000000..3d8ea8b
> --- /dev/null
> +++ b/stubs/qemu_cpu_socket_id_from_index.c
> @@ -0,0 +1,6 @@
> +#include "sysemu/cpus.h"
> +
> +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index)
> +{
> +    return cpu_index;
> +}
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index ed7e5d5..7a7e236 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -47,6 +47,7 @@
>  #include "hw/xen/xen.h"
>  #include "hw/i386/apic_internal.h"
>  #endif
> +#include "hw/i386/topology.h"
>  
>  
>  /* Cache topology CPUID constants: */
> @@ -2822,6 +2823,16 @@ out:
>      }
>  }
>  
> +#ifndef CONFIG_USER_ONLY
> +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index)
> +{
> +    unsigned pkg_id, core_id, smt_id;
> +    x86_topo_ids_from_idx(smp_cores, smp_threads, cpu_index,
> +                          &pkg_id, &core_id, &smt_id);
> +    return pkg_id;
> +}
> +#endif
> +
>  static void x86_cpu_initfn(Object *obj)
>  {
>      CPUState *cs = CPU(obj);
> -- 
> 1.8.3.1
>
Andreas Färber March 17, 2015, 4:59 p.m. UTC | #2
Am 17.03.2015 um 17:42 schrieb Eduardo Habkost:
> On Tue, Mar 17, 2015 at 03:48:38PM +0000, Igor Mammedov wrote:
>> since commit
>>    dd0247e0 pc: acpi: mark all possible CPUs as enabled in SRAT
>> Linux kernel actually tries to use CPU to Node mapping from
>> QEMU provided SRAT table instead of discarding it, and that
>> in some cases breaks build_sched_domains() which expects
>> sane mapping where cores/threads belonging to the same socket
>> are on the same NUMA node.
>>
>> With current default round-robin mapping of VCPUs to nodes
>> guest ends-up with cores/threads belonging to the same socket
>> being on different NUMA nodes.
>>
>> For example with following CLI:
>> qemu-kvm -m 4G -smp 5,sockets=1,cores=4,threads=1,maxcpus=8 \
>>          -numa node,nodeid=0 -numa node,nodeid=1
>> 2.6.32 based kernels will hang on boot due to incorrectly build
>> sched_group-s list in update_sd_lb_stats()
>> so comment in QEMU justifying dumb default mapping:
>>  "
>>   guest OSes must cope with this anyway, because there are BIOSes
>>   out there in real machines which also use this scheme.
>>  "
>> isn't really valid.
>>
>> Replacing default mapping withi a manual, where VCPUs belonging to
>> the same socket are on the same NUMA node, fixes issue for
>> guests which can't handle nonsense topology i.e. cnaging CLI to:
>>   -numa node,nodeid=0,cpus=0-3 -numa node,nodeid=1,cpus=4-7
>>
>> So instead of simply scattering VCPUs around nodes, map
>> the same socket VCPUs to the same NUMA node, which is what
>> guest would expect from a sane hardware/BIOS.
>>
>> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> 
> I believe the proposed behavior is much better. But if we are going to
> break compatibility, shouldn't we at least do that before the first -rc
> so we get feedback in case it break existing configurations?
> 
> About qemu_cpu_socket_id_from_index(): all qemu-system-* binaries have
> smp_cores and smp_threads available (even if machines ignore it), but
> the default stub can return values that are larger than the number of
> sockets if smp_cores*smp_threads > 1, which would be obviously
> incorrect. Isn't it easier to simply make
> "cpu_index/(smp_cores*smp_sockets)" be the default cpu_index->socket
> mapping function, and allow machine-specific (not arch-specific)
> overrides if necessary?

Agree that the proposed stub solution is not so nice. Can you propose a
MachineClass based solution instead?

The example I keep bringing up for x86 is that the Galileo boards or
even the Minnow boards don't really have sockets, being a SoC.

Thanks,
Andreas
Igor Mammedov March 18, 2015, 11:04 a.m. UTC | #3
On Tue, 17 Mar 2015 13:42:36 -0300
Eduardo Habkost <ehabkost@redhat.com> wrote:

> On Tue, Mar 17, 2015 at 03:48:38PM +0000, Igor Mammedov wrote:
> > since commit
> >    dd0247e0 pc: acpi: mark all possible CPUs as enabled in SRAT
> > Linux kernel actually tries to use CPU to Node mapping from
> > QEMU provided SRAT table instead of discarding it, and that
> > in some cases breaks build_sched_domains() which expects
> > sane mapping where cores/threads belonging to the same socket
> > are on the same NUMA node.
> > 
> > With current default round-robin mapping of VCPUs to nodes
> > guest ends-up with cores/threads belonging to the same socket
> > being on different NUMA nodes.
> > 
> > For example with following CLI:
> > qemu-kvm -m 4G -smp 5,sockets=1,cores=4,threads=1,maxcpus=8 \
> >          -numa node,nodeid=0 -numa node,nodeid=1
> > 2.6.32 based kernels will hang on boot due to incorrectly build
> > sched_group-s list in update_sd_lb_stats()
> > so comment in QEMU justifying dumb default mapping:
> >  "
> >   guest OSes must cope with this anyway, because there are BIOSes
> >   out there in real machines which also use this scheme.
> >  "
> > isn't really valid.
> > 
> > Replacing default mapping withi a manual, where VCPUs belonging to
> > the same socket are on the same NUMA node, fixes issue for
> > guests which can't handle nonsense topology i.e. cnaging CLI to:
> >   -numa node,nodeid=0,cpus=0-3 -numa node,nodeid=1,cpus=4-7
> > 
> > So instead of simply scattering VCPUs around nodes, map
> > the same socket VCPUs to the same NUMA node, which is what
> > guest would expect from a sane hardware/BIOS.
> > 
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> 
> I believe the proposed behavior is much better. But if we are going to
> break compatibility, shouldn't we at least do that before the first -rc
> so we get feedback in case it break existing configurations?
Aren't existing setups that actually care about node mapping typically do not use default mapping in favor of manually specifying which CPUs go to which node?
BTW what it would break on already broken configuration?

My thinking on this was that we prefer to not keep bugs in ACPI tables and
fix them instead, since it doesn't affect currently running (migrating)
guest and allows restarted guest use fixed ACPI tables.

Considering that CPU info from SRAT is actually used only since 2.2,
I'd prefer to fix it in 2.3 and 2.2 stable then allow default buggy SRAT
spread over several releases.


> 
> About qemu_cpu_socket_id_from_index(): all qemu-system-* binaries have
> smp_cores and smp_threads available (even if machines ignore it), but
> the default stub can return values that are larger than the number of
> sockets if smp_cores*smp_threads > 1, which would be obviously
> incorrect. Isn't it easier to simply make
> "cpu_index/(smp_cores*smp_sockets)" be the default cpu_index->socket
> mapping function, and allow machine-specific (not arch-specific)
> overrides if necessary?
Since it's hard-freeze now, patch focuses on fixing PC machine (which I can test)
and do not affecting other targets (i.e. keep legacy behavior for them)
that's why stub returns cpu_index. So I'd prefer hold off global
"cpu_index/(smp_cores*smp_sockets)" change for now.

So would something like below acceptable 2.3?

 node_id = cpu_index % nb_numa_nodes;
 if (machine_class->socket_from_cpu_index) {
     node_id = machine_class->socket_from_cpu_index(cpu_index) % nb_numa_nodes;
 }
  


> 
> 
> > ---
> >  include/sysemu/cpus.h                 |  3 +++
> >  numa.c                                | 14 ++++++++++----
> >  stubs/Makefile.objs                   |  1 +
> >  stubs/qemu_cpu_socket_id_from_index.c |  6 ++++++
> >  target-i386/cpu.c                     | 11 +++++++++++
> >  5 files changed, 31 insertions(+), 4 deletions(-)
> >  create mode 100644 stubs/qemu_cpu_socket_id_from_index.c
> > 
> > diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h
> > index 3f162a9..aacabcb 100644
> > --- a/include/sysemu/cpus.h
> > +++ b/include/sysemu/cpus.h
> > @@ -1,5 +1,6 @@
> >  #ifndef QEMU_CPUS_H
> >  #define QEMU_CPUS_H
> > +#include "qemu-common.h"
> >  
> >  /* cpus.c */
> >  void qemu_init_cpu_loop(void);
> > @@ -18,6 +19,8 @@ void qtest_clock_warp(int64_t dest);
> >  /* vl.c */
> >  extern int smp_cores;
> >  extern int smp_threads;
> > +
> > +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index);
> >  #else
> >  /* *-user doesn't have configurable SMP topology */
> >  #define smp_cores   1
> > diff --git a/numa.c b/numa.c
> > index ffbec68..5297749 100644
> > --- a/numa.c
> > +++ b/numa.c
> > @@ -26,6 +26,7 @@
> >  #include "exec/cpu-common.h"
> >  #include "qemu/bitmap.h"
> >  #include "qom/cpu.h"
> > +#include "sysemu/cpus.h"
> >  #include "qemu/error-report.h"
> >  #include "include/exec/cpu-common.h" /* for RAM_ADDR_FMT */
> >  #include "qapi-visit.h"
> > @@ -233,13 +234,18 @@ void parse_numa_opts(void)
> >                  break;
> >              }
> >          }
> > -        /* assigning the VCPUs round-robin is easier to implement, guest OSes
> > -         * must cope with this anyway, because there are BIOSes out there in
> > -         * real machines which also use this scheme.
> > +        /* Assign VCPUs from the same socket to the same node.
> > +         * Since mapping is arch dependent, target that care about
> > +         * correct mapping of VCPUs to node should implement
> > +         * qemu_cpu_socket_id_from_index() function that maps cpu_index to
> > +         * a socket #, for all other cases legacy round-robin mode
> > +         * will be used.
> >           */
> >          if (i == nb_numa_nodes) {
> >              for (i = 0; i < max_cpus; i++) {
> > -                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> > +                unsigned socket_id = qemu_cpu_socket_id_from_index(i);
> > +
> > +                set_bit(i, numa_info[socket_id % nb_numa_nodes].node_cpu);
> >              }
> >          }
> >      }
> > diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
> > index 8beff4c..86b8060 100644
> > --- a/stubs/Makefile.objs
> > +++ b/stubs/Makefile.objs
> > @@ -39,3 +39,4 @@ stub-obj-$(CONFIG_WIN32) += fd-register.o
> >  stub-obj-y += cpus.o
> >  stub-obj-y += kvm.o
> >  stub-obj-y += qmp_pc_dimm_device_list.o
> > +stub-obj-y += qemu_cpu_socket_id_from_index.o
> > diff --git a/stubs/qemu_cpu_socket_id_from_index.c b/stubs/qemu_cpu_socket_id_from_index.c
> > new file mode 100644
> > index 0000000..3d8ea8b
> > --- /dev/null
> > +++ b/stubs/qemu_cpu_socket_id_from_index.c
> > @@ -0,0 +1,6 @@
> > +#include "sysemu/cpus.h"
> > +
> > +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index)
> > +{
> > +    return cpu_index;
> > +}
> > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > index ed7e5d5..7a7e236 100644
> > --- a/target-i386/cpu.c
> > +++ b/target-i386/cpu.c
> > @@ -47,6 +47,7 @@
> >  #include "hw/xen/xen.h"
> >  #include "hw/i386/apic_internal.h"
> >  #endif
> > +#include "hw/i386/topology.h"
> >  
> >  
> >  /* Cache topology CPUID constants: */
> > @@ -2822,6 +2823,16 @@ out:
> >      }
> >  }
> >  
> > +#ifndef CONFIG_USER_ONLY
> > +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index)
> > +{
> > +    unsigned pkg_id, core_id, smt_id;
> > +    x86_topo_ids_from_idx(smp_cores, smp_threads, cpu_index,
> > +                          &pkg_id, &core_id, &smt_id);
> > +    return pkg_id;
> > +}
> > +#endif
> > +
> >  static void x86_cpu_initfn(Object *obj)
> >  {
> >      CPUState *cs = CPU(obj);
> > -- 
> > 1.8.3.1
> > 
>
Igor Mammedov March 18, 2015, 11:05 a.m. UTC | #4
On Tue, 17 Mar 2015 17:59:48 +0100
Andreas Färber <afaerber@suse.de> wrote:

> Am 17.03.2015 um 17:42 schrieb Eduardo Habkost:
> > On Tue, Mar 17, 2015 at 03:48:38PM +0000, Igor Mammedov wrote:
> >> since commit
> >>    dd0247e0 pc: acpi: mark all possible CPUs as enabled in SRAT
> >> Linux kernel actually tries to use CPU to Node mapping from
> >> QEMU provided SRAT table instead of discarding it, and that
> >> in some cases breaks build_sched_domains() which expects
> >> sane mapping where cores/threads belonging to the same socket
> >> are on the same NUMA node.
> >>
> >> With current default round-robin mapping of VCPUs to nodes
> >> guest ends-up with cores/threads belonging to the same socket
> >> being on different NUMA nodes.
> >>
> >> For example with following CLI:
> >> qemu-kvm -m 4G -smp 5,sockets=1,cores=4,threads=1,maxcpus=8 \
> >>          -numa node,nodeid=0 -numa node,nodeid=1
> >> 2.6.32 based kernels will hang on boot due to incorrectly build
> >> sched_group-s list in update_sd_lb_stats()
> >> so comment in QEMU justifying dumb default mapping:
> >>  "
> >>   guest OSes must cope with this anyway, because there are BIOSes
> >>   out there in real machines which also use this scheme.
> >>  "
> >> isn't really valid.
> >>
> >> Replacing default mapping withi a manual, where VCPUs belonging to
> >> the same socket are on the same NUMA node, fixes issue for
> >> guests which can't handle nonsense topology i.e. cnaging CLI to:
> >>   -numa node,nodeid=0,cpus=0-3 -numa node,nodeid=1,cpus=4-7
> >>
> >> So instead of simply scattering VCPUs around nodes, map
> >> the same socket VCPUs to the same NUMA node, which is what
> >> guest would expect from a sane hardware/BIOS.
> >>
> >> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > 
> > I believe the proposed behavior is much better. But if we are going to
> > break compatibility, shouldn't we at least do that before the first -rc
> > so we get feedback in case it break existing configurations?
> > 
> > About qemu_cpu_socket_id_from_index(): all qemu-system-* binaries have
> > smp_cores and smp_threads available (even if machines ignore it), but
> > the default stub can return values that are larger than the number of
> > sockets if smp_cores*smp_threads > 1, which would be obviously
> > incorrect. Isn't it easier to simply make
> > "cpu_index/(smp_cores*smp_sockets)" be the default cpu_index->socket
> > mapping function, and allow machine-specific (not arch-specific)
> > overrides if necessary?
> 
> Agree that the proposed stub solution is not so nice. Can you propose a
> MachineClass based solution instead?
sure

> 
> The example I keep bringing up for x86 is that the Galileo boards or
> even the Minnow boards don't really have sockets, being a SoC.
> 
> Thanks,
> Andreas
>
diff mbox

Patch

diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h
index 3f162a9..aacabcb 100644
--- a/include/sysemu/cpus.h
+++ b/include/sysemu/cpus.h
@@ -1,5 +1,6 @@ 
 #ifndef QEMU_CPUS_H
 #define QEMU_CPUS_H
+#include "qemu-common.h"
 
 /* cpus.c */
 void qemu_init_cpu_loop(void);
@@ -18,6 +19,8 @@  void qtest_clock_warp(int64_t dest);
 /* vl.c */
 extern int smp_cores;
 extern int smp_threads;
+
+unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index);
 #else
 /* *-user doesn't have configurable SMP topology */
 #define smp_cores   1
diff --git a/numa.c b/numa.c
index ffbec68..5297749 100644
--- a/numa.c
+++ b/numa.c
@@ -26,6 +26,7 @@ 
 #include "exec/cpu-common.h"
 #include "qemu/bitmap.h"
 #include "qom/cpu.h"
+#include "sysemu/cpus.h"
 #include "qemu/error-report.h"
 #include "include/exec/cpu-common.h" /* for RAM_ADDR_FMT */
 #include "qapi-visit.h"
@@ -233,13 +234,18 @@  void parse_numa_opts(void)
                 break;
             }
         }
-        /* assigning the VCPUs round-robin is easier to implement, guest OSes
-         * must cope with this anyway, because there are BIOSes out there in
-         * real machines which also use this scheme.
+        /* Assign VCPUs from the same socket to the same node.
+         * Since mapping is arch dependent, target that care about
+         * correct mapping of VCPUs to node should implement
+         * qemu_cpu_socket_id_from_index() function that maps cpu_index to
+         * a socket #, for all other cases legacy round-robin mode
+         * will be used.
          */
         if (i == nb_numa_nodes) {
             for (i = 0; i < max_cpus; i++) {
-                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
+                unsigned socket_id = qemu_cpu_socket_id_from_index(i);
+
+                set_bit(i, numa_info[socket_id % nb_numa_nodes].node_cpu);
             }
         }
     }
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index 8beff4c..86b8060 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -39,3 +39,4 @@  stub-obj-$(CONFIG_WIN32) += fd-register.o
 stub-obj-y += cpus.o
 stub-obj-y += kvm.o
 stub-obj-y += qmp_pc_dimm_device_list.o
+stub-obj-y += qemu_cpu_socket_id_from_index.o
diff --git a/stubs/qemu_cpu_socket_id_from_index.c b/stubs/qemu_cpu_socket_id_from_index.c
new file mode 100644
index 0000000..3d8ea8b
--- /dev/null
+++ b/stubs/qemu_cpu_socket_id_from_index.c
@@ -0,0 +1,6 @@ 
+#include "sysemu/cpus.h"
+
+unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index)
+{
+    return cpu_index;
+}
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index ed7e5d5..7a7e236 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -47,6 +47,7 @@ 
 #include "hw/xen/xen.h"
 #include "hw/i386/apic_internal.h"
 #endif
+#include "hw/i386/topology.h"
 
 
 /* Cache topology CPUID constants: */
@@ -2822,6 +2823,16 @@  out:
     }
 }
 
+#ifndef CONFIG_USER_ONLY
+unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index)
+{
+    unsigned pkg_id, core_id, smt_id;
+    x86_topo_ids_from_idx(smp_cores, smp_threads, cpu_index,
+                          &pkg_id, &core_id, &smt_id);
+    return pkg_id;
+}
+#endif
+
 static void x86_cpu_initfn(Object *obj)
 {
     CPUState *cs = CPU(obj);