diff mbox series

[ovs-dev,v4,1/2] ovs-numa: Support non-contiguous numa nodes and offline CPU cores

Message ID 20210519001008.23831-2-dwilder@us.ibm.com
State Changes Requested
Headers show
Series Support for non-contiguous numa nodes and core ids. | expand

Commit Message

David Wilder May 19, 2021, 12:10 a.m. UTC
This change removes the assumption that numa nodes and cores are numbered
contiguously in linux.  This change is required to support some Power
systems.

An additional check has been added to verify that cores are online,
offline cores result in non-contiguously numbered cores.

I manually verified that all the exported functions defined in
ovs-numa.h function correctly with these changes on Power and amd64
systems.

Tests for dpif-netdev and pmd that define numa systems using the
--dummy-numa option have been modified to use both contiguous and
non-contiguous numbered nodes.

Signed-off-by: David Wilder <dwilder@us.ibm.com>
Reviewed-by: David Christensen <drc@linux.vnet.ibm.com>
---
 lib/ovs-numa.c       | 47 +++++++++++++++++++++++++++-----------
 tests/dpif-netdev.at | 64 +++++++++++++++++++++++++++++-----------------------
 tests/pmd.at         | 20 ++++++++--------
 3 files changed, 80 insertions(+), 51 deletions(-)

Comments

Emma Finn May 20, 2021, 9:17 a.m. UTC | #1
-----Original Message-----
From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of David Wilder
Sent: Wednesday 19 May 2021 01:10
To: ovs-dev@openvswitch.org
Cc: daniele.di.proietto@gmail.com; wilder@us.ibm.com; i.maximets@ovn.org
Subject: [ovs-dev] [PATCH v4 1/2] ovs-numa: Support non-contiguous numa nodes and offline CPU cores

This change removes the assumption that numa nodes and cores are numbered contiguously in linux.  This change is required to support some Power systems.

An additional check has been added to verify that cores are online, offline cores result in non-contiguously numbered cores.

I manually verified that all the exported functions defined in ovs-numa.h function correctly with these changes on Power and amd64 systems.

Tests for dpif-netdev and pmd that define numa systems using the --dummy-numa option have been modified to use both contiguous and non-contiguous numbered nodes.

Signed-off-by: David Wilder <dwilder@us.ibm.com>
Reviewed-by: David Christensen <drc@linux.vnet.ibm.com>
---
 lib/ovs-numa.c       | 47 +++++++++++++++++++++++++++-----------
 tests/dpif-netdev.at | 64 +++++++++++++++++++++++++++++-----------------------
 tests/pmd.at         | 20 ++++++++--------
 3 files changed, 80 insertions(+), 51 deletions(-)

diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c index 6d0a685..d2e7cc6 100644
--- a/lib/ovs-numa.c
+++ b/lib/ovs-numa.c
@@ -42,16 +42,18 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
  * This module stores the affinity information of numa nodes and cpu cores.
  * It also provides functions to bookkeep the pin of threads on cpu cores.
  *
- * It is assumed that the numa node ids and cpu core ids all start from 0 and
- * range continuously.  So, for example, if 'ovs_numa_get_n_cores()' returns N,
- * user can assume core ids from 0 to N-1 are all valid and there is a
- * 'struct cpu_core' for each id.
+ * It is assumed that the numa node ids and cpu core ids all start from 0.
+ * There is no guarantee that node and cpu ids are numbered 
+ consecutively
+ * (this is a change from earlier version of the code). So, for 
+ example,
+ * if two nodes exist with ids 0 and 8, 'ovs_numa_get_n_nodes()' will
+ * return 2, no assumption of node numbering should be made.
  *
  * NOTE, this module should only be used by the main thread.
  *
- * NOTE, the assumption above will fail when cpu hotplug is used.  In that
- * case ovs-numa will not function correctly.  For now, add a TODO entry
- * for addressing it in the future.
+ * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' 
+ must be
+ * invalidated when ever the system topology changes. Support for 
+ detecting
+ * topology changes has not been included. For now, add a TODO entry 
+ for
+ * addressing it in the future.
  *
  * TODO: Fix ovs-numa when cpu hotplug is used.
  */
@@ -130,8 +132,8 @@ insert_new_cpu_core(struct numa_node *n, unsigned core_id)
  * - "0,0,0,0": four cores on numa socket 0.
  * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets.
  * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets.
- *
- * The different numa ids must be consecutives or the function will abort. */
+ * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous.
+ */
 static void
 discover_numa_and_core_dummy(void)
 {
@@ -169,10 +171,27 @@ discover_numa_and_core_dummy(void)
 
     free(conf);
 
-    if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) {
-        ovs_fatal(0, "dummy numa contains non consecutive numa ids");
+}
+
+#ifdef __linux__
+/* Check if a cpu is detected and online */ static int 
+cpu_detected(unsigned int core_id) {
+    char path[PATH_MAX];
+    int len = snprintf(path,
+              sizeof(path), "/sys/devices/system/cpu/cpu%d/topology/core_id" ,
+              core_id);
+    if (len <= 0 || (unsigned) len >= sizeof(path)) {
+        return 0;
+    }
+    if (access(path, F_OK) != 0) {
+        return 0;
     }
+
+    return 1;
 }
+#endif /* __linux__ */
 
 /* Discovers all numa nodes and the corresponding cpu cores.
  * Constructs the 'struct numa_node' and 'struct cpu_core'. */ @@ -219,7 +238,9 @@ discover_numa_and_core(void)
                     unsigned core_id;
 
                     core_id = strtoul(subdir->d_name + 3, NULL, 10);
-                    insert_new_cpu_core(n, core_id);
+                    if (cpu_detected(core_id)) {
+                        insert_new_cpu_core(n, core_id);
+                    }
                 }
             }
             closedir(dir);
@@ -229,7 +250,7 @@ discover_numa_and_core(void)
         }
 
         free(path);
-        if (!dir || !numa_supported) {
+        if (!numa_supported) {
             break;
         }
     }
diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 57cae38..f8782e6 100644
--- a/tests/dpif-netdev.at
+++ b/tests/dpif-netdev.at
@@ -85,7 +85,7 @@ AT_CLEANUP
 
 
 m4_define([DPIF_NETDEV_DUMMY_IFACE],
-  [AT_SETUP([dpif-netdev - $1 interface])
+  [AT_SETUP([dpif-netdev - $1 interface $2])
    # Create br0 with interfaces p1 and p7
    #    and br1 with interfaces p2 and p8
    # with p1 and p2 connected via unix domain socket @@ -98,7 +98,7 @@ m4_define([DPIF_NETDEV_DUMMY_IFACE],
                      fail-mode=secure -- \
       add-port br1 p2 -- set interface p2 type=$1 options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \
       add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
    AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -120,17 +120,18 @@ recirc_id(0),in_port(8),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_DUMMY_IFACE([dummy])
-DPIF_NETDEV_DUMMY_IFACE([dummy-pmd])
+DPIF_NETDEV_DUMMY_IFACE([dummy], [])
+DPIF_NETDEV_DUMMY_IFACE([dummy-pmd], [(contiguous NUMA nodes)]) 
+DPIF_NETDEV_DUMMY_IFACE([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 m4_define([DPIF_NETDEV_MISS_FLOW_INSTALL],
-  [AT_SETUP([dpif-netdev - miss upcall key matches flow_install - $1])
+  [AT_SETUP([dpif-netdev - miss upcall key matches flow_install - $1 
+ $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 \
       -- set interface p1 type=$1 options:pstream=punix:$OVS_RUNDIR/p0.sock \
       -- set bridge br0 datapath-type=dummy \
                         other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
    AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -162,17 +163,18 @@ skb_priority(0),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_MISS_FLOW_INSTALL([dummy])
-DPIF_NETDEV_MISS_FLOW_INSTALL([dummy-pmd])
+DPIF_NETDEV_MISS_FLOW_INSTALL([dummy], []) 
+DPIF_NETDEV_MISS_FLOW_INSTALL([dummy-pmd], [(contiguous NUMA nodes)]) 
+DPIF_NETDEV_MISS_FLOW_INSTALL([dummy-pmd], [(non-contiguous NUMA 
+nodes)])
 
 m4_define([DPIF_NETDEV_FLOW_PUT_MODIFY],
-  [AT_SETUP([dpif-netdev - datapath flow modification - $1])
+  [AT_SETUP([dpif-netdev - datapath flow modification - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 -- set interface p1 type=$1 ofport_request=1 options:pstream=punix:$OVS_RUNDIR/p1.sock -- \
       add-port br0 p2 -- set interface p2 type=$1 ofport_request=2 options:pstream=punix:$OVS_RUNDIR/p2.sock -- \
       set bridge br0 datapath-type=dummy \
                      other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg])
 
    # Add a flow that directs some packets received on p1 to p2 and the @@ -215,18 +217,20 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_FLOW_PUT_MODIFY([dummy])
-DPIF_NETDEV_FLOW_PUT_MODIFY([dummy-pmd])
+DPIF_NETDEV_FLOW_PUT_MODIFY([dummy], []) 
+DPIF_NETDEV_FLOW_PUT_MODIFY([dummy-pmd], [(contiguous NUMA nodes)]) 
+DPIF_NETDEV_FLOW_PUT_MODIFY([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 
 m4_define([DPIF_NETDEV_MISS_FLOW_DUMP],
-  [AT_SETUP([dpif-netdev - miss upcall key matches flow_dump - $1])
+  [AT_SETUP([dpif-netdev - miss upcall key matches flow_dump - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 \
       -- set interface p1 type=$1 options:pstream=punix:$OVS_RUNDIR/p0.sock \
       -- set bridge br0 datapath-type=dummy \
                         other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
+
    AT_CHECK([ovs-appctl upcall/disable-ufid], [0], [Datapath dumping tersely using UFID disabled  ], [])
    AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) @@ -263,8 +267,9 @@ skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_MISS_FLOW_DUMP([dummy])
-DPIF_NETDEV_MISS_FLOW_DUMP([dummy-pmd])
+DPIF_NETDEV_MISS_FLOW_DUMP([dummy], []) 
+DPIF_NETDEV_MISS_FLOW_DUMP([dummy-pmd], [(contiguous NUMA nodes)]) 
+DPIF_NETDEV_MISS_FLOW_DUMP([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 AT_SETUP([dpif-netdev - meters])
 # Create br0 with interfaces p1 and p7
@@ -378,13 +383,13 @@ OVS_VSWITCHD_STOP
 AT_CLEANUP
 
 m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD],
-  [AT_SETUP([dpif-netdev - partial hw offload - $1])
+  [AT_SETUP([dpif-netdev - partial hw offload - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 -- \
       set interface p1 type=$1 ofport_request=1 options:pstream=punix:$OVS_RUNDIR/p1.sock options:ifindex=1100 -- \
       set bridge br0 datapath-type=dummy \
                      other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg netdev_dummy:file:dbg])
 
    AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=true]) @@ -436,18 +441,19 @@ p1: flow del: mark: 1
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy])
-DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy-pmd])
+DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy], []) 
+DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy-pmd], [(contiguous NUMA nodes)]) 
+DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 
 m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS],
-  [AT_SETUP([dpif-netdev - partial hw offload with packet modifications - $1])
+  [AT_SETUP([dpif-netdev - partial hw offload with packet modifications 
+ - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 -- \
       set interface p1 type=$1 ofport_request=1 options:pcap=p1.pcap options:ifindex=1101 -- \
       set bridge br0 datapath-type=dummy \
                      other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg netdev_dummy:file:dbg])
 
    AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=true]) @@ -514,17 +520,18 @@ udp,in_port=ANY,dl_vlan=99,dl_vlan_pcp=7,vlan_tci1=0x0000,dl_src=00:06:07:08:09:
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy])
-DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy-pmd])
+DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy], []) 
+DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy-pmd], [(contiguous NUMA 
+nodes)]) DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy-pmd], 
+[(non-contiguous NUMA nodes)])
 
 m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP],
-  [AT_SETUP([dpif-netdev - partial hw offload with arp vlan id packet modifications - $1])
+  [AT_SETUP([dpif-netdev - partial hw offload with arp vlan id packet 
+ modifications - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 -- \
       set interface p1 type=$1 ofport_request=1 options:pcap=p1.pcap options:ifindex=1102 -- \
       set bridge br0 datapath-type=dummy \
                      other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], 
+ [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg netdev_dummy:file:dbg])
 
    AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=true]) @@ -591,8 +598,9 @@ arp,in_port=ANY,dl_vlan=11,dl_vlan_pcp=7,vlan_tci1=0x0000,dl_src=00:06:07:08:09:
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy])
-DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy-pmd])
+DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy], []) 
+DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy-pmd], [(contiguous 
+NUMA nodes)]) DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy-pmd], 
+[(non-contiguous NUMA nodes)])
 
 AT_SETUP([dpif-netdev - check dpctl/add-flow in_port exact match])  OVS_VSWITCHD_START( diff --git a/tests/pmd.at b/tests/pmd.at index cc5371d..fa58ef2 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -362,7 +362,7 @@ OVS_VSWITCHD_START(
   [add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \
    add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \
    set Open_vSwitch . other_config:pmd-cpu-mask=3 -], [], [], [--dummy-numa 0,1])
+], [], [], [--dummy-numa 0,8])
 AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
 AT_CHECK([ovs-ofctl add-flow br0 action=controller]) @@ -399,13 +399,13 @@ NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_l
 icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
 ])
 
-AT_CHECK([ovs-vsctl set Interface p2 options:numa_id=1])
+AT_CHECK([ovs-vsctl set Interface p2 options:numa_id=8])
 
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
 p1 0 0 0
 p1 1 0 0
-p2 0 1 1
-p2 1 1 1
+p2 0 8 1
+p2 1 8 1
 ])
 
 AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log]) @@ -584,7 +584,7 @@ AT_CLEANUP
 
 AT_SETUP([PMD - rxq affinity - NUMA])
 OVS_VSWITCHD_START(
-  [], [], [], [--dummy-numa 0,0,0,1,1])
+  [], [], [], [--dummy-numa 0,0,0,8,8])
 AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
 AT_CHECK([ovs-ofctl add-flow br0 actions=controller]) @@ -604,15 +604,15 @@ AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"])
 dnl We moved the queues to different numa node. Expecting threads on  dnl NUMA node 1 to be created.
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
-p1 0 1 3
-p1 1 1 4
+p1 0 8 3
+p1 1 8 4
 ])
 
 AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"])
 
 dnl Queues splitted between NUMA nodes.
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
-p1 0 1 3
+p1 0 8 3
 p1 1 0 1
 ])
 
@@ -628,10 +628,10 @@ p1 1
 
 AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity='0:3'])
 
-dnl We explicitly requesting NUMA node 1 for queue 0.
+dnl We explicitly requesting NUMA node 8 for queue 0.
 dnl Queue 1 should be polled by thread from NUMA node 0.
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show | cut -f 1,2,3 -d ' '], [0], [dnl
-p1 0 1
+p1 0 8
 p1 1 0
 ])
 
--
1.8.3.1


Tested-by: Emma Finn <emma.finn@intel.com>

Thanks, 
Emma
Kevin Traynor May 21, 2021, 5:08 p.m. UTC | #2
On 19/05/2021 01:10, David Wilder wrote:
> This change removes the assumption that numa nodes and cores are numbered
> contiguously in linux.  This change is required to support some Power
> systems.
> 
> An additional check has been added to verify that cores are online,
> offline cores result in non-contiguously numbered cores.
> 
> I manually verified that all the exported functions defined in
> ovs-numa.h function correctly with these changes on Power and amd64
> systems.
> 
> Tests for dpif-netdev and pmd that define numa systems using the
> --dummy-numa option have been modified to use both contiguous and
> non-contiguous numbered nodes.
> 

Hi David, a few comments below,

thanks,
Kevin.

> Signed-off-by: David Wilder <dwilder@us.ibm.com>
> Reviewed-by: David Christensen <drc@linux.vnet.ibm.com>
> ---
>  lib/ovs-numa.c       | 47 +++++++++++++++++++++++++++-----------
>  tests/dpif-netdev.at | 64 +++++++++++++++++++++++++++++-----------------------
>  tests/pmd.at         | 20 ++++++++--------
>  3 files changed, 80 insertions(+), 51 deletions(-)
> 
> diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
> index 6d0a685..d2e7cc6 100644
> --- a/lib/ovs-numa.c
> +++ b/lib/ovs-numa.c
> @@ -42,16 +42,18 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
>   * This module stores the affinity information of numa nodes and cpu cores.
>   * It also provides functions to bookkeep the pin of threads on cpu cores.
>   *
> - * It is assumed that the numa node ids and cpu core ids all start from 0 and
> - * range continuously.  So, for example, if 'ovs_numa_get_n_cores()' returns N,
> - * user can assume core ids from 0 to N-1 are all valid and there is a
> - * 'struct cpu_core' for each id.
> + * It is assumed that the numa node ids and cpu core ids all start from 0.
> + * There is no guarantee that node and cpu ids are numbered consecutively
> + * (this is a change from earlier version of the code). So, for example,
> + * if two nodes exist with ids 0 and 8, 'ovs_numa_get_n_nodes()' will
> + * return 2, no assumption of node numbering should be made.
>   *
>   * NOTE, this module should only be used by the main thread.
>   *
> - * NOTE, the assumption above will fail when cpu hotplug is used.  In that
> - * case ovs-numa will not function correctly.  For now, add a TODO entry
> - * for addressing it in the future.
> + * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be
> + * invalidated when ever the system topology changes. Support for detecting
> + * topology changes has not been included. For now, add a TODO entry for
> + * addressing it in the future.
>   *
>   * TODO: Fix ovs-numa when cpu hotplug is used.
>   */
> @@ -130,8 +132,8 @@ insert_new_cpu_core(struct numa_node *n, unsigned core_id)
>   * - "0,0,0,0": four cores on numa socket 0.
>   * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets.
>   * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets.
> - *
> - * The different numa ids must be consecutives or the function will abort. */
> + * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous.
> + */
>  static void
>  discover_numa_and_core_dummy(void)
>  {
> @@ -169,10 +171,27 @@ discover_numa_and_core_dummy(void)
>  
>      free(conf);
>  
> -    if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) {
> -        ovs_fatal(0, "dummy numa contains non consecutive numa ids");

There is no need for max_numa_id now you are removing this check.

> +}
> +
> +#ifdef __linux__
> +/* Check if a cpu is detected and online */
> +static int
> +cpu_detected(unsigned int core_id)
> +{
> +    char path[PATH_MAX];
> +    int len = snprintf(path,
> +              sizeof(path), "/sys/devices/system/cpu/cpu%d/topology/core_id" ,
> +              core_id);

minor: you can put 'snprintf(path, sizeof(path)' on one line

> +    if (len <= 0 || (unsigned) len >= sizeof(path)) {
> +        return 0;
> +    }
> +    if (access(path, F_OK) != 0) {

Won't this path exist if the cpu is offline too? Should you check the
contents of 'online' instead?

> +        return 0;
>      }
> +
> +    return 1;
>  }
> +#endif /* __linux__ */
>  
>  /* Discovers all numa nodes and the corresponding cpu cores.
>   * Constructs the 'struct numa_node' and 'struct cpu_core'. */
> @@ -219,7 +238,9 @@ discover_numa_and_core(void)
>                      unsigned core_id;
>  
>                      core_id = strtoul(subdir->d_name + 3, NULL, 10);
> -                    insert_new_cpu_core(n, core_id);
> +                    if (cpu_detected(core_id)) {
> +                        insert_new_cpu_core(n, core_id);
> +                    }
>                  }
>              }
>              closedir(dir);
> @@ -229,7 +250,7 @@ discover_numa_and_core(void)
>          }
>  

I guess that PPC does't have /sys/devices/system/node/nodeX if it has
skipped X, so i'm wondering if there will be a a bunch of warnings:

else if (errno != ENOENT) {
            VLOG_WARN("opendir(%s) failed (%s)", path,
                      ovs_strerror(errno));

>          free(path);
> -        if (!dir || !numa_supported) {
> +        if (!numa_supported) {
>              break;
>          }
>      }
> diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at
> index 57cae38..f8782e6 100644
> --- a/tests/dpif-netdev.at
> +++ b/tests/dpif-netdev.at
> @@ -85,7 +85,7 @@ AT_CLEANUP
>  

I'm not really sure there is a need to have every test using both contig
and non-contig numa nodes. It will use the same numa discovery code in
each one, so unless there is some other difference in the test later, it
seems like it could be a bit overkill.

>  
>  m4_define([DPIF_NETDEV_DUMMY_IFACE],
> -  [AT_SETUP([dpif-netdev - $1 interface])
> +  [AT_SETUP([dpif-netdev - $1 interface $2])
>     # Create br0 with interfaces p1 and p7
>     #    and br1 with interfaces p2 and p8
>     # with p1 and p2 connected via unix domain socket
> @@ -98,7 +98,7 @@ m4_define([DPIF_NETDEV_DUMMY_IFACE],
>                       fail-mode=secure -- \
>        add-port br1 p2 -- set interface p2 type=$1 options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \
>        add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], [],
> -      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
> +      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
>     AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
>  
>     AT_CHECK([ovs-ofctl add-flow br0 action=normal])
> @@ -120,17 +120,18 @@ recirc_id(0),in_port(8),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:
>     OVS_VSWITCHD_STOP
>     AT_CLEANUP])
>  
> -DPIF_NETDEV_DUMMY_IFACE([dummy])
> -DPIF_NETDEV_DUMMY_IFACE([dummy-pmd])
> +DPIF_NETDEV_DUMMY_IFACE([dummy], [])
> +DPIF_NETDEV_DUMMY_IFACE([dummy-pmd], [(contiguous NUMA nodes)])
> +DPIF_NETDEV_DUMMY_IFACE([dummy-pmd], [(non-contiguous NUMA nodes)])
>  
>  m4_define([DPIF_NETDEV_MISS_FLOW_INSTALL],
> -  [AT_SETUP([dpif-netdev - miss upcall key matches flow_install - $1])
> +  [AT_SETUP([dpif-netdev - miss upcall key matches flow_install - $1 $2])
>     OVS_VSWITCHD_START(
>       [add-port br0 p1 \
>        -- set interface p1 type=$1 options:pstream=punix:$OVS_RUNDIR/p0.sock \
>        -- set bridge br0 datapath-type=dummy \
>                          other-config:datapath-id=1234 fail-mode=secure], [], [],
> -      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
> +      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
>     AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
>  
>     AT_CHECK([ovs-ofctl add-flow br0 action=normal])
> @@ -162,17 +163,18 @@ skb_priority(0),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone
>     OVS_VSWITCHD_STOP
>     AT_CLEANUP])
>  
> -DPIF_NETDEV_MISS_FLOW_INSTALL([dummy])
> -DPIF_NETDEV_MISS_FLOW_INSTALL([dummy-pmd])
> +DPIF_NETDEV_MISS_FLOW_INSTALL([dummy], [])
> +DPIF_NETDEV_MISS_FLOW_INSTALL([dummy-pmd], [(contiguous NUMA nodes)])
> +DPIF_NETDEV_MISS_FLOW_INSTALL([dummy-pmd], [(non-contiguous NUMA nodes)])
>  
>  m4_define([DPIF_NETDEV_FLOW_PUT_MODIFY],
> -  [AT_SETUP([dpif-netdev - datapath flow modification - $1])
> +  [AT_SETUP([dpif-netdev - datapath flow modification - $1 $2])
>     OVS_VSWITCHD_START(
>       [add-port br0 p1 -- set interface p1 type=$1 ofport_request=1 options:pstream=punix:$OVS_RUNDIR/p1.sock -- \
>        add-port br0 p2 -- set interface p2 type=$1 ofport_request=2 options:pstream=punix:$OVS_RUNDIR/p2.sock -- \
>        set bridge br0 datapath-type=dummy \
>                       other-config:datapath-id=1234 fail-mode=secure], [], [],
> -      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
> +      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
>     AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg])
>  
>     # Add a flow that directs some packets received on p1 to p2 and the
> @@ -215,18 +217,20 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:
>     OVS_VSWITCHD_STOP
>     AT_CLEANUP])
>  
> -DPIF_NETDEV_FLOW_PUT_MODIFY([dummy])
> -DPIF_NETDEV_FLOW_PUT_MODIFY([dummy-pmd])
> +DPIF_NETDEV_FLOW_PUT_MODIFY([dummy], [])
> +DPIF_NETDEV_FLOW_PUT_MODIFY([dummy-pmd], [(contiguous NUMA nodes)])
> +DPIF_NETDEV_FLOW_PUT_MODIFY([dummy-pmd], [(non-contiguous NUMA nodes)])
>  
>  
>  m4_define([DPIF_NETDEV_MISS_FLOW_DUMP],
> -  [AT_SETUP([dpif-netdev - miss upcall key matches flow_dump - $1])
> +  [AT_SETUP([dpif-netdev - miss upcall key matches flow_dump - $1 $2])
>     OVS_VSWITCHD_START(
>       [add-port br0 p1 \
>        -- set interface p1 type=$1 options:pstream=punix:$OVS_RUNDIR/p0.sock \
>        -- set bridge br0 datapath-type=dummy \
>                          other-config:datapath-id=1234 fail-mode=secure], [], [],
> -      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
> +      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
> +
>     AT_CHECK([ovs-appctl upcall/disable-ufid], [0], [Datapath dumping tersely using UFID disabled
>  ], [])
>     AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
> @@ -263,8 +267,9 @@ skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label
>     OVS_VSWITCHD_STOP
>     AT_CLEANUP])
>  
> -DPIF_NETDEV_MISS_FLOW_DUMP([dummy])
> -DPIF_NETDEV_MISS_FLOW_DUMP([dummy-pmd])
> +DPIF_NETDEV_MISS_FLOW_DUMP([dummy], [])
> +DPIF_NETDEV_MISS_FLOW_DUMP([dummy-pmd], [(contiguous NUMA nodes)])
> +DPIF_NETDEV_MISS_FLOW_DUMP([dummy-pmd], [(non-contiguous NUMA nodes)])
>  
>  AT_SETUP([dpif-netdev - meters])
>  # Create br0 with interfaces p1 and p7
> @@ -378,13 +383,13 @@ OVS_VSWITCHD_STOP
>  AT_CLEANUP
>  
>  m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD],
> -  [AT_SETUP([dpif-netdev - partial hw offload - $1])
> +  [AT_SETUP([dpif-netdev - partial hw offload - $1 $2])
>     OVS_VSWITCHD_START(
>       [add-port br0 p1 -- \
>        set interface p1 type=$1 ofport_request=1 options:pstream=punix:$OVS_RUNDIR/p1.sock options:ifindex=1100 -- \
>        set bridge br0 datapath-type=dummy \
>                       other-config:datapath-id=1234 fail-mode=secure], [], [],
> -      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
> +      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
>     AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg netdev_dummy:file:dbg])
>  
>     AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=true])
> @@ -436,18 +441,19 @@ p1: flow del: mark: 1
>     OVS_VSWITCHD_STOP
>     AT_CLEANUP])
>  
> -DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy])
> -DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy-pmd])
> +DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy], [])
> +DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy-pmd], [(contiguous NUMA nodes)])
> +DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy-pmd], [(non-contiguous NUMA nodes)])
>  
>  
>  m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS],
> -  [AT_SETUP([dpif-netdev - partial hw offload with packet modifications - $1])
> +  [AT_SETUP([dpif-netdev - partial hw offload with packet modifications - $1 $2])
>     OVS_VSWITCHD_START(
>       [add-port br0 p1 -- \
>        set interface p1 type=$1 ofport_request=1 options:pcap=p1.pcap options:ifindex=1101 -- \
>        set bridge br0 datapath-type=dummy \
>                       other-config:datapath-id=1234 fail-mode=secure], [], [],
> -      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
> +      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
>     AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg netdev_dummy:file:dbg])
>  
>     AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=true])
> @@ -514,17 +520,18 @@ udp,in_port=ANY,dl_vlan=99,dl_vlan_pcp=7,vlan_tci1=0x0000,dl_src=00:06:07:08:09:
>     OVS_VSWITCHD_STOP
>     AT_CLEANUP])
>  
> -DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy])
> -DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy-pmd])
> +DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy], [])
> +DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy-pmd], [(contiguous NUMA nodes)])
> +DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy-pmd], [(non-contiguous NUMA nodes)])
>  
>  m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP],
> -  [AT_SETUP([dpif-netdev - partial hw offload with arp vlan id packet modifications - $1])
> +  [AT_SETUP([dpif-netdev - partial hw offload with arp vlan id packet modifications - $1 $2])
>     OVS_VSWITCHD_START(
>       [add-port br0 p1 -- \
>        set interface p1 type=$1 ofport_request=1 options:pcap=p1.pcap options:ifindex=1102 -- \
>        set bridge br0 datapath-type=dummy \
>                       other-config:datapath-id=1234 fail-mode=secure], [], [],
> -      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
> +      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
>     AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg netdev_dummy:file:dbg])
>  
>     AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=true])
> @@ -591,8 +598,9 @@ arp,in_port=ANY,dl_vlan=11,dl_vlan_pcp=7,vlan_tci1=0x0000,dl_src=00:06:07:08:09:
>     OVS_VSWITCHD_STOP
>     AT_CLEANUP])
>  
> -DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy])
> -DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy-pmd])
> +DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy], [])
> +DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy-pmd], [(contiguous NUMA nodes)])
> +DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy-pmd], [(non-contiguous NUMA nodes)])
>  
>  AT_SETUP([dpif-netdev - check dpctl/add-flow in_port exact match])
>  OVS_VSWITCHD_START(
> diff --git a/tests/pmd.at b/tests/pmd.at
> index cc5371d..fa58ef2 100644
> --- a/tests/pmd.at
> +++ b/tests/pmd.at

It is a bit useful to have non-contig numa here as the numa id is also
used during the rxq_scheduling(). However, now there is no tests with
contig numa left, and probably just one test with both contig/non-contig
combinations would be enough. Changing other tests probably won't add
any more code coverage.

> @@ -362,7 +362,7 @@ OVS_VSWITCHD_START(
>    [add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \
>     add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \
>     set Open_vSwitch . other_config:pmd-cpu-mask=3
> -], [], [], [--dummy-numa 0,1])
> +], [], [], [--dummy-numa 0,8])
>  AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
>  
>  AT_CHECK([ovs-ofctl add-flow br0 action=controller])
> @@ -399,13 +399,13 @@ NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_l
>  icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
>  ])
>  
> -AT_CHECK([ovs-vsctl set Interface p2 options:numa_id=1])
> +AT_CHECK([ovs-vsctl set Interface p2 options:numa_id=8])
>  
>  AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
>  p1 0 0 0
>  p1 1 0 0
> -p2 0 1 1
> -p2 1 1 1
> +p2 0 8 1
> +p2 1 8 1
>  ])
>  
>  AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
> @@ -584,7 +584,7 @@ AT_CLEANUP
>  
>  AT_SETUP([PMD - rxq affinity - NUMA])
>  OVS_VSWITCHD_START(
> -  [], [], [], [--dummy-numa 0,0,0,1,1])
> +  [], [], [], [--dummy-numa 0,0,0,8,8])
>  AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
>  
>  AT_CHECK([ovs-ofctl add-flow br0 actions=controller])
> @@ -604,15 +604,15 @@ AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"])
>  dnl We moved the queues to different numa node. Expecting threads on
>  dnl NUMA node 1 to be created.
>  AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
> -p1 0 1 3
> -p1 1 1 4
> +p1 0 8 3
> +p1 1 8 4
>  ])
>  
>  AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"])
>  
>  dnl Queues splitted between NUMA nodes.
>  AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
> -p1 0 1 3
> +p1 0 8 3
>  p1 1 0 1
>  ])
>  
> @@ -628,10 +628,10 @@ p1 1
>  
>  AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity='0:3'])
>  
> -dnl We explicitly requesting NUMA node 1 for queue 0.
> +dnl We explicitly requesting NUMA node 8 for queue 0.
>  dnl Queue 1 should be polled by thread from NUMA node 0.
>  AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show | cut -f 1,2,3 -d ' '], [0], [dnl
> -p1 0 1
> +p1 0 8
>  p1 1 0
>  ])
>  
>
diff mbox series

Patch

diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
index 6d0a685..d2e7cc6 100644
--- a/lib/ovs-numa.c
+++ b/lib/ovs-numa.c
@@ -42,16 +42,18 @@  VLOG_DEFINE_THIS_MODULE(ovs_numa);
  * This module stores the affinity information of numa nodes and cpu cores.
  * It also provides functions to bookkeep the pin of threads on cpu cores.
  *
- * It is assumed that the numa node ids and cpu core ids all start from 0 and
- * range continuously.  So, for example, if 'ovs_numa_get_n_cores()' returns N,
- * user can assume core ids from 0 to N-1 are all valid and there is a
- * 'struct cpu_core' for each id.
+ * It is assumed that the numa node ids and cpu core ids all start from 0.
+ * There is no guarantee that node and cpu ids are numbered consecutively
+ * (this is a change from earlier version of the code). So, for example,
+ * if two nodes exist with ids 0 and 8, 'ovs_numa_get_n_nodes()' will
+ * return 2, no assumption of node numbering should be made.
  *
  * NOTE, this module should only be used by the main thread.
  *
- * NOTE, the assumption above will fail when cpu hotplug is used.  In that
- * case ovs-numa will not function correctly.  For now, add a TODO entry
- * for addressing it in the future.
+ * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be
+ * invalidated when ever the system topology changes. Support for detecting
+ * topology changes has not been included. For now, add a TODO entry for
+ * addressing it in the future.
  *
  * TODO: Fix ovs-numa when cpu hotplug is used.
  */
@@ -130,8 +132,8 @@  insert_new_cpu_core(struct numa_node *n, unsigned core_id)
  * - "0,0,0,0": four cores on numa socket 0.
  * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets.
  * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets.
- *
- * The different numa ids must be consecutives or the function will abort. */
+ * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous.
+ */
 static void
 discover_numa_and_core_dummy(void)
 {
@@ -169,10 +171,27 @@  discover_numa_and_core_dummy(void)
 
     free(conf);
 
-    if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) {
-        ovs_fatal(0, "dummy numa contains non consecutive numa ids");
+}
+
+#ifdef __linux__
+/* Check if a cpu is detected and online */
+static int
+cpu_detected(unsigned int core_id)
+{
+    char path[PATH_MAX];
+    int len = snprintf(path,
+              sizeof(path), "/sys/devices/system/cpu/cpu%d/topology/core_id" ,
+              core_id);
+    if (len <= 0 || (unsigned) len >= sizeof(path)) {
+        return 0;
+    }
+    if (access(path, F_OK) != 0) {
+        return 0;
     }
+
+    return 1;
 }
+#endif /* __linux__ */
 
 /* Discovers all numa nodes and the corresponding cpu cores.
  * Constructs the 'struct numa_node' and 'struct cpu_core'. */
@@ -219,7 +238,9 @@  discover_numa_and_core(void)
                     unsigned core_id;
 
                     core_id = strtoul(subdir->d_name + 3, NULL, 10);
-                    insert_new_cpu_core(n, core_id);
+                    if (cpu_detected(core_id)) {
+                        insert_new_cpu_core(n, core_id);
+                    }
                 }
             }
             closedir(dir);
@@ -229,7 +250,7 @@  discover_numa_and_core(void)
         }
 
         free(path);
-        if (!dir || !numa_supported) {
+        if (!numa_supported) {
             break;
         }
     }
diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at
index 57cae38..f8782e6 100644
--- a/tests/dpif-netdev.at
+++ b/tests/dpif-netdev.at
@@ -85,7 +85,7 @@  AT_CLEANUP
 
 
 m4_define([DPIF_NETDEV_DUMMY_IFACE],
-  [AT_SETUP([dpif-netdev - $1 interface])
+  [AT_SETUP([dpif-netdev - $1 interface $2])
    # Create br0 with interfaces p1 and p7
    #    and br1 with interfaces p2 and p8
    # with p1 and p2 connected via unix domain socket
@@ -98,7 +98,7 @@  m4_define([DPIF_NETDEV_DUMMY_IFACE],
                      fail-mode=secure -- \
       add-port br1 p2 -- set interface p2 type=$1 options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \
       add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
    AT_CHECK([ovs-ofctl add-flow br0 action=normal])
@@ -120,17 +120,18 @@  recirc_id(0),in_port(8),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_DUMMY_IFACE([dummy])
-DPIF_NETDEV_DUMMY_IFACE([dummy-pmd])
+DPIF_NETDEV_DUMMY_IFACE([dummy], [])
+DPIF_NETDEV_DUMMY_IFACE([dummy-pmd], [(contiguous NUMA nodes)])
+DPIF_NETDEV_DUMMY_IFACE([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 m4_define([DPIF_NETDEV_MISS_FLOW_INSTALL],
-  [AT_SETUP([dpif-netdev - miss upcall key matches flow_install - $1])
+  [AT_SETUP([dpif-netdev - miss upcall key matches flow_install - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 \
       -- set interface p1 type=$1 options:pstream=punix:$OVS_RUNDIR/p0.sock \
       -- set bridge br0 datapath-type=dummy \
                         other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
    AT_CHECK([ovs-ofctl add-flow br0 action=normal])
@@ -162,17 +163,18 @@  skb_priority(0),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_MISS_FLOW_INSTALL([dummy])
-DPIF_NETDEV_MISS_FLOW_INSTALL([dummy-pmd])
+DPIF_NETDEV_MISS_FLOW_INSTALL([dummy], [])
+DPIF_NETDEV_MISS_FLOW_INSTALL([dummy-pmd], [(contiguous NUMA nodes)])
+DPIF_NETDEV_MISS_FLOW_INSTALL([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 m4_define([DPIF_NETDEV_FLOW_PUT_MODIFY],
-  [AT_SETUP([dpif-netdev - datapath flow modification - $1])
+  [AT_SETUP([dpif-netdev - datapath flow modification - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 -- set interface p1 type=$1 ofport_request=1 options:pstream=punix:$OVS_RUNDIR/p1.sock -- \
       add-port br0 p2 -- set interface p2 type=$1 ofport_request=2 options:pstream=punix:$OVS_RUNDIR/p2.sock -- \
       set bridge br0 datapath-type=dummy \
                      other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg])
 
    # Add a flow that directs some packets received on p1 to p2 and the
@@ -215,18 +217,20 @@  recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_FLOW_PUT_MODIFY([dummy])
-DPIF_NETDEV_FLOW_PUT_MODIFY([dummy-pmd])
+DPIF_NETDEV_FLOW_PUT_MODIFY([dummy], [])
+DPIF_NETDEV_FLOW_PUT_MODIFY([dummy-pmd], [(contiguous NUMA nodes)])
+DPIF_NETDEV_FLOW_PUT_MODIFY([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 
 m4_define([DPIF_NETDEV_MISS_FLOW_DUMP],
-  [AT_SETUP([dpif-netdev - miss upcall key matches flow_dump - $1])
+  [AT_SETUP([dpif-netdev - miss upcall key matches flow_dump - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 \
       -- set interface p1 type=$1 options:pstream=punix:$OVS_RUNDIR/p0.sock \
       -- set bridge br0 datapath-type=dummy \
                         other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
+
    AT_CHECK([ovs-appctl upcall/disable-ufid], [0], [Datapath dumping tersely using UFID disabled
 ], [])
    AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
@@ -263,8 +267,9 @@  skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_MISS_FLOW_DUMP([dummy])
-DPIF_NETDEV_MISS_FLOW_DUMP([dummy-pmd])
+DPIF_NETDEV_MISS_FLOW_DUMP([dummy], [])
+DPIF_NETDEV_MISS_FLOW_DUMP([dummy-pmd], [(contiguous NUMA nodes)])
+DPIF_NETDEV_MISS_FLOW_DUMP([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 AT_SETUP([dpif-netdev - meters])
 # Create br0 with interfaces p1 and p7
@@ -378,13 +383,13 @@  OVS_VSWITCHD_STOP
 AT_CLEANUP
 
 m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD],
-  [AT_SETUP([dpif-netdev - partial hw offload - $1])
+  [AT_SETUP([dpif-netdev - partial hw offload - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 -- \
       set interface p1 type=$1 ofport_request=1 options:pstream=punix:$OVS_RUNDIR/p1.sock options:ifindex=1100 -- \
       set bridge br0 datapath-type=dummy \
                      other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg netdev_dummy:file:dbg])
 
    AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=true])
@@ -436,18 +441,19 @@  p1: flow del: mark: 1
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy])
-DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy-pmd])
+DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy], [])
+DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy-pmd], [(contiguous NUMA nodes)])
+DPIF_NETDEV_FLOW_HW_OFFLOAD([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 
 m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS],
-  [AT_SETUP([dpif-netdev - partial hw offload with packet modifications - $1])
+  [AT_SETUP([dpif-netdev - partial hw offload with packet modifications - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 -- \
       set interface p1 type=$1 ofport_request=1 options:pcap=p1.pcap options:ifindex=1101 -- \
       set bridge br0 datapath-type=dummy \
                      other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg netdev_dummy:file:dbg])
 
    AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=true])
@@ -514,17 +520,18 @@  udp,in_port=ANY,dl_vlan=99,dl_vlan_pcp=7,vlan_tci1=0x0000,dl_src=00:06:07:08:09:
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy])
-DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy-pmd])
+DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy], [])
+DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy-pmd], [(contiguous NUMA nodes)])
+DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP],
-  [AT_SETUP([dpif-netdev - partial hw offload with arp vlan id packet modifications - $1])
+  [AT_SETUP([dpif-netdev - partial hw offload with arp vlan id packet modifications - $1 $2])
    OVS_VSWITCHD_START(
      [add-port br0 p1 -- \
       set interface p1 type=$1 ofport_request=1 options:pcap=p1.pcap options:ifindex=1102 -- \
       set bridge br0 datapath-type=dummy \
                      other-config:datapath-id=1234 fail-mode=secure], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$2], [(contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,1,1,1,1"], [$2], [(non-contiguous NUMA nodes)], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:file:dbg dpif_netdev:file:dbg netdev_dummy:file:dbg])
 
    AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=true])
@@ -591,8 +598,9 @@  arp,in_port=ANY,dl_vlan=11,dl_vlan_pcp=7,vlan_tci1=0x0000,dl_src=00:06:07:08:09:
    OVS_VSWITCHD_STOP
    AT_CLEANUP])
 
-DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy])
-DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy-pmd])
+DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy], [])
+DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy-pmd], [(contiguous NUMA nodes)])
+DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP([dummy-pmd], [(non-contiguous NUMA nodes)])
 
 AT_SETUP([dpif-netdev - check dpctl/add-flow in_port exact match])
 OVS_VSWITCHD_START(
diff --git a/tests/pmd.at b/tests/pmd.at
index cc5371d..fa58ef2 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -362,7 +362,7 @@  OVS_VSWITCHD_START(
   [add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \
    add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \
    set Open_vSwitch . other_config:pmd-cpu-mask=3
-], [], [], [--dummy-numa 0,1])
+], [], [], [--dummy-numa 0,8])
 AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
 AT_CHECK([ovs-ofctl add-flow br0 action=controller])
@@ -399,13 +399,13 @@  NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_l
 icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
 ])
 
-AT_CHECK([ovs-vsctl set Interface p2 options:numa_id=1])
+AT_CHECK([ovs-vsctl set Interface p2 options:numa_id=8])
 
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
 p1 0 0 0
 p1 1 0 0
-p2 0 1 1
-p2 1 1 1
+p2 0 8 1
+p2 1 8 1
 ])
 
 AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
@@ -584,7 +584,7 @@  AT_CLEANUP
 
 AT_SETUP([PMD - rxq affinity - NUMA])
 OVS_VSWITCHD_START(
-  [], [], [], [--dummy-numa 0,0,0,1,1])
+  [], [], [], [--dummy-numa 0,0,0,8,8])
 AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
 AT_CHECK([ovs-ofctl add-flow br0 actions=controller])
@@ -604,15 +604,15 @@  AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"])
 dnl We moved the queues to different numa node. Expecting threads on
 dnl NUMA node 1 to be created.
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
-p1 0 1 3
-p1 1 1 4
+p1 0 8 3
+p1 1 8 4
 ])
 
 AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"])
 
 dnl Queues splitted between NUMA nodes.
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
-p1 0 1 3
+p1 0 8 3
 p1 1 0 1
 ])
 
@@ -628,10 +628,10 @@  p1 1
 
 AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity='0:3'])
 
-dnl We explicitly requesting NUMA node 1 for queue 0.
+dnl We explicitly requesting NUMA node 8 for queue 0.
 dnl Queue 1 should be polled by thread from NUMA node 0.
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show | cut -f 1,2,3 -d ' '], [0], [dnl
-p1 0 1
+p1 0 8
 p1 1 0
 ])