diff mbox series

[ovs-dev,PATCHv6] netdev-linux: Detect numa node id.

Message ID 1572549681-53246-1-git-send-email-u9012063@gmail.com
State Superseded
Headers show
Series [ovs-dev,PATCHv6] netdev-linux: Detect numa node id. | expand

Commit Message

William Tu Oct. 31, 2019, 7:21 p.m. UTC
The patch detects the numa node id from the name of the netdev,
by reading the '/sys/class/net/<devname>/device/numa_node'.
If not available, ex: virtual device, or any error happens,
return numa id 0.  Currently only the afxdp netdev type uses it,
other linux netdev types are disabled due to no use case.

Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Eelco Chaudron <echaudro@redhat.com>

---
v6:
  Feedbacks from Ilya
  - add thread safety check at netdev_linux_get_numa_id__, and
    pass netdev_linux
  - preserve numa cache on netlink updates
  - Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/605634673

v5:
  Feedbacks from Ilya
  - reafactor the error handling
  - add mutex lock
  - Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/601947245

v4:
  Feedbacks from Eelco
  - Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/599308893

v3:
  Feedbacks from Ilya and Eelco
  - update doc, afxdp.rst
  - fix coding style
  - fix limit of numa node max, by using ovs_numa_numa_id_is_valid
  - move the function to netdev-linux
  - cache the result of numa_id
  - add security check for netdev name
  - use fscanf instead of read and convert to int
  - revise some error message content

v2:
  address feedback from Eelco
	fix memory leak of xaspintf
	log using INFO instead of WARN
---
 Documentation/intro/install/afxdp.rst |  1 -
 lib/netdev-afxdp.c                    | 11 ------
 lib/netdev-linux-private.h            |  2 ++
 lib/netdev-linux.c                    | 67 +++++++++++++++++++++++++++++++++--
 4 files changed, 66 insertions(+), 15 deletions(-)

Comments

0-day Robot Oct. 31, 2019, 7:59 p.m. UTC | #1
Bleep bloop.  Greetings William Tu, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Line lacks whitespace around operator
#121 FILE: lib/netdev-linux.c:1419:
    numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);

Lines checked: 171, Warnings: 1, Errors: 0


Please check this out.  If you feel there has been an error, please email aconole@redhat.com

Thanks,
0-day Robot
William Tu Nov. 19, 2019, 9:39 p.m. UTC | #2
Hi,
Any feedback on this patch?
Thanks
William

On Thu, Oct 31, 2019 at 12:21:21PM -0700, William Tu wrote:
> The patch detects the numa node id from the name of the netdev,
> by reading the '/sys/class/net/<devname>/device/numa_node'.
> If not available, ex: virtual device, or any error happens,
> return numa id 0.  Currently only the afxdp netdev type uses it,
> other linux netdev types are disabled due to no use case.
> 
> Signed-off-by: William Tu <u9012063@gmail.com>
> Acked-by: Eelco Chaudron <echaudro@redhat.com>
> 
> ---
> v6:
>   Feedbacks from Ilya
>   - add thread safety check at netdev_linux_get_numa_id__, and
>     pass netdev_linux
>   - preserve numa cache on netlink updates
>   - Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/605634673
> 
> v5:
>   Feedbacks from Ilya
>   - reafactor the error handling
>   - add mutex lock
>   - Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/601947245
> 
> v4:
>   Feedbacks from Eelco
>   - Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/599308893
> 
> v3:
>   Feedbacks from Ilya and Eelco
>   - update doc, afxdp.rst
>   - fix coding style
>   - fix limit of numa node max, by using ovs_numa_numa_id_is_valid
>   - move the function to netdev-linux
>   - cache the result of numa_id
>   - add security check for netdev name
>   - use fscanf instead of read and convert to int
>   - revise some error message content
> 
> v2:
>   address feedback from Eelco
> 	fix memory leak of xaspintf
> 	log using INFO instead of WARN
> ---
>  Documentation/intro/install/afxdp.rst |  1 -
>  lib/netdev-afxdp.c                    | 11 ------
>  lib/netdev-linux-private.h            |  2 ++
>  lib/netdev-linux.c                    | 67 +++++++++++++++++++++++++++++++++--
>  4 files changed, 66 insertions(+), 15 deletions(-)
> 
> diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst
> index a136db0c950a..fa898912b65b 100644
> --- a/Documentation/intro/install/afxdp.rst
> +++ b/Documentation/intro/install/afxdp.rst
> @@ -317,7 +317,6 @@ Below is a script using namespaces and veth peer::
>  
>  Limitations/Known Issues
>  ------------------------
> -#. Device's numa ID is always 0, need a way to find numa id from a netdev.
>  #. No QoS support because AF_XDP netdev by-pass the Linux TC layer. A possible
>     work-around is to use OpenFlow meter action.
>  #. Most of the tests are done using i40e single port. Multiple ports and
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index af654d498a88..270a5ab0c69b 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -626,17 +626,6 @@ out:
>      return err;
>  }
>  
> -int
> -netdev_afxdp_get_numa_id(const struct netdev *netdev)
> -{
> -    /* FIXME: Get netdev's PCIe device ID, then find
> -     * its NUMA node id.
> -     */
> -    VLOG_INFO("FIXME: Device %s always use numa id 0.",
> -              netdev_get_name(netdev));
> -    return 0;
> -}
> -
>  static void
>  xsk_remove_xdp_program(uint32_t ifindex, int xdpmode)
>  {
> diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
> index c14f2fb81bb0..41079ae0c1b5 100644
> --- a/lib/netdev-linux-private.h
> +++ b/lib/netdev-linux-private.h
> @@ -96,6 +96,8 @@ struct netdev_linux {
>      /* LAG information. */
>      bool is_lag_master;         /* True if the netdev is a LAG master. */
>  
> +    int numa_id;                /* NUMA node id. */
> +
>  #ifdef HAVE_AF_XDP
>      /* AF_XDP information. */
>      struct xsk_socket_info **xsks;
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index f4819237370a..69b85a006462 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -236,6 +236,7 @@ enum {
>      VALID_VPORT_STAT_ERROR  = 1 << 5,
>      VALID_DRVINFO           = 1 << 6,
>      VALID_FEATURES          = 1 << 7,
> +    VALID_NUMA_ID           = 1 << 8,
>  };
>  
>  struct linux_lag_slave {
> @@ -820,9 +821,9 @@ netdev_linux_update__(struct netdev_linux *dev,
>  {
>      if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
>          if (change->nlmsg_type == RTM_NEWLINK) {
> -            /* Keep drv-info, and ip addresses. */
> +            /* Keep drv-info, ip addresses, and NUMA id. */
>              netdev_linux_changed(dev, change->ifi_flags,
> -                                 VALID_DRVINFO | VALID_IN);
> +                                 VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);
>  
>              /* Update netdev from rtnl-change msg. */
>              if (change->mtu) {
> @@ -1391,6 +1392,66 @@ netdev_linux_tap_batch_send(struct netdev *netdev_,
>      return 0;
>  }
>  
> +static int
> +netdev_linux_get_numa_id__(struct netdev_linux *netdev)
> +    OVS_REQUIRES(netdev->mutex)
> +{
> +    char *numa_node_path;
> +    const char *name;
> +    int node_id;
> +    FILE *stream;
> +
> +    if (netdev->cache_valid & VALID_NUMA_ID) {
> +        return netdev->numa_id;
> +    }
> +
> +    netdev->numa_id = 0;
> +    netdev->cache_valid |= VALID_NUMA_ID;
> +
> +    name = netdev_get_name(&netdev->up);
> +    if (strpbrk(name, "/\\")) {
> +        VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
> +                    "A valid name must not include '/' or '\\'."
> +                    "Using numa_id 0", name);
> +        return 0;
> +    }
> +
> +    numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);
> +
> +    stream = fopen(numa_node_path, "r");
> +    if (!stream) {
> +        /* Virtual device does not have this info. */
> +        VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 0",
> +                     name, numa_node_path, ovs_strerror(errno));
> +        free(numa_node_path);
> +        return 0;
> +    }
> +
> +    if (fscanf(stream, "%d", &node_id) != 1
> +        || !ovs_numa_numa_id_is_valid(node_id))  {
> +        VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 0", name);
> +        node_id = 0;
> +    }
> +
> +    netdev->numa_id = node_id;
> +    fclose(stream);
> +    free(numa_node_path);
> +    return node_id;
> +}
> +
> +static int OVS_UNUSED
> +netdev_linux_get_numa_id(const struct netdev *netdev_)
> +{
> +    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
> +    int numa_id;
> +
> +    ovs_mutex_lock(&netdev->mutex);
> +    numa_id = netdev_linux_get_numa_id__(netdev);
> +    ovs_mutex_unlock(&netdev->mutex);
> +
> +    return numa_id;
> +}
> +
>  /* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
>   * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
>   * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
> @@ -3298,7 +3359,7 @@ const struct netdev_class netdev_afxdp_class = {
>      .set_config = netdev_afxdp_set_config,
>      .get_config = netdev_afxdp_get_config,
>      .reconfigure = netdev_afxdp_reconfigure,
> -    .get_numa_id = netdev_afxdp_get_numa_id,
> +    .get_numa_id = netdev_linux_get_numa_id,
>      .send = netdev_afxdp_batch_send,
>      .rxq_construct = netdev_afxdp_rxq_construct,
>      .rxq_destruct = netdev_afxdp_rxq_destruct,
> -- 
> 2.7.4
>
Eelco Chaudron Nov. 20, 2019, 7:39 a.m. UTC | #3
On 19 Nov 2019, at 22:39, William Tu wrote:

> Hi,
> Any feedback on this patch?

LGTM!

Acked-by: Eelco Chaudron <echaudro@redhat.com>
>
> On Thu, Oct 31, 2019 at 12:21:21PM -0700, William Tu wrote:
>> The patch detects the numa node id from the name of the netdev,
>> by reading the '/sys/class/net/<devname>/device/numa_node'.
>> If not available, ex: virtual device, or any error happens,
>> return numa id 0.  Currently only the afxdp netdev type uses it,
>> other linux netdev types are disabled due to no use case.
>>
>> Signed-off-by: William Tu <u9012063@gmail.com>
>> Acked-by: Eelco Chaudron <echaudro@redhat.com>
>>
>> ---
>> v6:
>>   Feedbacks from Ilya
>>   - add thread safety check at netdev_linux_get_numa_id__, and
>>     pass netdev_linux
>>   - preserve numa cache on netlink updates
>>   - Tested-at: 
>> https://travis-ci.org/williamtu/ovs-travis/builds/605634673
>>
>> v5:
>>   Feedbacks from Ilya
>>   - reafactor the error handling
>>   - add mutex lock
>>   - Tested-at: 
>> https://travis-ci.org/williamtu/ovs-travis/builds/601947245
>>
>> v4:
>>   Feedbacks from Eelco
>>   - Tested-at: 
>> https://travis-ci.org/williamtu/ovs-travis/builds/599308893
>>
>> v3:
>>   Feedbacks from Ilya and Eelco
>>   - update doc, afxdp.rst
>>   - fix coding style
>>   - fix limit of numa node max, by using ovs_numa_numa_id_is_valid
>>   - move the function to netdev-linux
>>   - cache the result of numa_id
>>   - add security check for netdev name
>>   - use fscanf instead of read and convert to int
>>   - revise some error message content
>>
>> v2:
>>   address feedback from Eelco
>> 	fix memory leak of xaspintf
>> 	log using INFO instead of WARN
>> ---
>>  Documentation/intro/install/afxdp.rst |  1 -
>>  lib/netdev-afxdp.c                    | 11 ------
>>  lib/netdev-linux-private.h            |  2 ++
>>  lib/netdev-linux.c                    | 67 
>> +++++++++++++++++++++++++++++++++--
>>  4 files changed, 66 insertions(+), 15 deletions(-)
>>
>> diff --git a/Documentation/intro/install/afxdp.rst 
>> b/Documentation/intro/install/afxdp.rst
>> index a136db0c950a..fa898912b65b 100644
>> --- a/Documentation/intro/install/afxdp.rst
>> +++ b/Documentation/intro/install/afxdp.rst
>> @@ -317,7 +317,6 @@ Below is a script using namespaces and veth 
>> peer::
>>
>>  Limitations/Known Issues
>>  ------------------------
>> -#. Device's numa ID is always 0, need a way to find numa id from a 
>> netdev.
>>  #. No QoS support because AF_XDP netdev by-pass the Linux TC layer. 
>> A possible
>>     work-around is to use OpenFlow meter action.
>>  #. Most of the tests are done using i40e single port. Multiple ports 
>> and
>> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
>> index af654d498a88..270a5ab0c69b 100644
>> --- a/lib/netdev-afxdp.c
>> +++ b/lib/netdev-afxdp.c
>> @@ -626,17 +626,6 @@ out:
>>      return err;
>>  }
>>
>> -int
>> -netdev_afxdp_get_numa_id(const struct netdev *netdev)
>> -{
>> -    /* FIXME: Get netdev's PCIe device ID, then find
>> -     * its NUMA node id.
>> -     */
>> -    VLOG_INFO("FIXME: Device %s always use numa id 0.",
>> -              netdev_get_name(netdev));
>> -    return 0;
>> -}
>> -
>>  static void
>>  xsk_remove_xdp_program(uint32_t ifindex, int xdpmode)
>>  {
>> diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
>> index c14f2fb81bb0..41079ae0c1b5 100644
>> --- a/lib/netdev-linux-private.h
>> +++ b/lib/netdev-linux-private.h
>> @@ -96,6 +96,8 @@ struct netdev_linux {
>>      /* LAG information. */
>>      bool is_lag_master;         /* True if the netdev is a LAG 
>> master. */
>>
>> +    int numa_id;                /* NUMA node id. */
>> +
>>  #ifdef HAVE_AF_XDP
>>      /* AF_XDP information. */
>>      struct xsk_socket_info **xsks;
>> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
>> index f4819237370a..69b85a006462 100644
>> --- a/lib/netdev-linux.c
>> +++ b/lib/netdev-linux.c
>> @@ -236,6 +236,7 @@ enum {
>>      VALID_VPORT_STAT_ERROR  = 1 << 5,
>>      VALID_DRVINFO           = 1 << 6,
>>      VALID_FEATURES          = 1 << 7,
>> +    VALID_NUMA_ID           = 1 << 8,
>>  };
>>  
>>  struct linux_lag_slave {
>> @@ -820,9 +821,9 @@ netdev_linux_update__(struct netdev_linux *dev,
>>  {
>>      if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
>>          if (change->nlmsg_type == RTM_NEWLINK) {
>> -            /* Keep drv-info, and ip addresses. */
>> +            /* Keep drv-info, ip addresses, and NUMA id. */
>>              netdev_linux_changed(dev, change->ifi_flags,
>> -                                 VALID_DRVINFO | VALID_IN);
>> +                                 VALID_DRVINFO | VALID_IN | 
>> VALID_NUMA_ID);
>>
>>              /* Update netdev from rtnl-change msg. */
>>              if (change->mtu) {
>> @@ -1391,6 +1392,66 @@ netdev_linux_tap_batch_send(struct netdev 
>> *netdev_,
>>      return 0;
>>  }
>>
>> +static int
>> +netdev_linux_get_numa_id__(struct netdev_linux *netdev)
>> +    OVS_REQUIRES(netdev->mutex)
>> +{
>> +    char *numa_node_path;
>> +    const char *name;
>> +    int node_id;
>> +    FILE *stream;
>> +
>> +    if (netdev->cache_valid & VALID_NUMA_ID) {
>> +        return netdev->numa_id;
>> +    }
>> +
>> +    netdev->numa_id = 0;
>> +    netdev->cache_valid |= VALID_NUMA_ID;
>> +
>> +    name = netdev_get_name(&netdev->up);
>> +    if (strpbrk(name, "/\\")) {
>> +        VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
>> +                    "A valid name must not include '/' or '\\'."
>> +                    "Using numa_id 0", name);
>> +        return 0;
>> +    }
>> +
>> +    numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", 
>> name);
>> +
>> +    stream = fopen(numa_node_path, "r");
>> +    if (!stream) {
>> +        /* Virtual device does not have this info. */
>> +        VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 
>> 0",
>> +                     name, numa_node_path, ovs_strerror(errno));
>> +        free(numa_node_path);
>> +        return 0;
>> +    }
>> +
>> +    if (fscanf(stream, "%d", &node_id) != 1
>> +        || !ovs_numa_numa_id_is_valid(node_id))  {
>> +        VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 
>> 0", name);
>> +        node_id = 0;
>> +    }
>> +
>> +    netdev->numa_id = node_id;
>> +    fclose(stream);
>> +    free(numa_node_path);
>> +    return node_id;
>> +}
>> +
>> +static int OVS_UNUSED
>> +netdev_linux_get_numa_id(const struct netdev *netdev_)
>> +{
>> +    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
>> +    int numa_id;
>> +
>> +    ovs_mutex_lock(&netdev->mutex);
>> +    numa_id = netdev_linux_get_numa_id__(netdev);
>> +    ovs_mutex_unlock(&netdev->mutex);
>> +
>> +    return numa_id;
>> +}
>> +
>>  /* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a 
>> positive
>>   * errno value.  Returns EAGAIN without blocking if the packet 
>> cannot be queued
>>   * immediately.  Returns EMSGSIZE if a partial packet was 
>> transmitted or if
>> @@ -3298,7 +3359,7 @@ const struct netdev_class netdev_afxdp_class = 
>> {
>>      .set_config = netdev_afxdp_set_config,
>>      .get_config = netdev_afxdp_get_config,
>>      .reconfigure = netdev_afxdp_reconfigure,
>> -    .get_numa_id = netdev_afxdp_get_numa_id,
>> +    .get_numa_id = netdev_linux_get_numa_id,
>>      .send = netdev_afxdp_batch_send,
>>      .rxq_construct = netdev_afxdp_rxq_construct,
>>      .rxq_destruct = netdev_afxdp_rxq_destruct,
>> -- 
>> 2.7.4
>>
Ilya Maximets Nov. 20, 2019, 4:06 p.m. UTC | #4
On 31.10.2019 20:21, William Tu wrote:
> The patch detects the numa node id from the name of the netdev,
> by reading the '/sys/class/net/<devname>/device/numa_node'.
> If not available, ex: virtual device, or any error happens,
> return numa id 0.  Currently only the afxdp netdev type uses it,
> other linux netdev types are disabled due to no use case.

Hi.

This version looks good in general, but I'm concerned a bit with
the global effect it will have.  Let me explain, since this patch
doesn't manage any memory allocations and umem/pools allocations
are happened in OVS main thread, all the memory will be still
on original NUMA node (assuming NUMA id 0 in most cases).  umem
in native cases will be locked in kernel and will not be able to
migrate.  So, without this patch we always have all devices polled
by threads from NUMA 0 and all the memory will be on NUMA 0.
The only cross-NUMA access will be done by device DMA, which is OK
and usually the fastest cross-NUMA scenario.  But with this patch
applied device will be polled by the thread from NUMA 1, part of
the memory will migrate creating random performance spikes, but
locked umem will remain on NUMA 0.  In this case both DMA and PMD
will perform cross-NUMA memory accesses all the time stressing
the QPI and degrading performance significantly.

So, the question is: Should we merge this now or wait and merge
along with correct NUMA-aware memory allocations?
Thoughts?

Best regards, Ilya Maximets.
Eelco Chaudron Nov. 20, 2019, 4:09 p.m. UTC | #5
On 20 Nov 2019, at 17:06, Ilya Maximets wrote:

> On 31.10.2019 20:21, William Tu wrote:
>> The patch detects the numa node id from the name of the netdev,
>> by reading the '/sys/class/net/<devname>/device/numa_node'.
>> If not available, ex: virtual device, or any error happens,
>> return numa id 0.  Currently only the afxdp netdev type uses it,
>> other linux netdev types are disabled due to no use case.
>
> Hi.
>
> This version looks good in general, but I'm concerned a bit with
> the global effect it will have.  Let me explain, since this patch
> doesn't manage any memory allocations and umem/pools allocations
> are happened in OVS main thread, all the memory will be still
> on original NUMA node (assuming NUMA id 0 in most cases).  umem
> in native cases will be locked in kernel and will not be able to
> migrate.  So, without this patch we always have all devices polled
> by threads from NUMA 0 and all the memory will be on NUMA 0.
> The only cross-NUMA access will be done by device DMA, which is OK
> and usually the fastest cross-NUMA scenario.  But with this patch
> applied device will be polled by the thread from NUMA 1, part of
> the memory will migrate creating random performance spikes, but
> locked umem will remain on NUMA 0.  In this case both DMA and PMD
> will perform cross-NUMA memory accesses all the time stressing
> the QPI and degrading performance significantly.
>
> So, the question is: Should we merge this now or wait and merge
> along with correct NUMA-aware memory allocations?
> Thoughts?

Mentioned this in the v1 (or v2) patch and William was working on the 
umem allocation on the correct numa in a different patch. William any 
update on this?
William Tu Nov. 20, 2019, 6:19 p.m. UTC | #6
On Wed, Nov 20, 2019 at 05:09:10PM +0100, Eelco Chaudron wrote:
> 
> 
> On 20 Nov 2019, at 17:06, Ilya Maximets wrote:
> 
> >On 31.10.2019 20:21, William Tu wrote:
> >>The patch detects the numa node id from the name of the netdev,
> >>by reading the '/sys/class/net/<devname>/device/numa_node'.
> >>If not available, ex: virtual device, or any error happens,
> >>return numa id 0.  Currently only the afxdp netdev type uses it,
> >>other linux netdev types are disabled due to no use case.
> >
> >Hi.
> >
> >This version looks good in general, but I'm concerned a bit with
> >the global effect it will have.  Let me explain, since this patch
> >doesn't manage any memory allocations and umem/pools allocations
> >are happened in OVS main thread, all the memory will be still
> >on original NUMA node (assuming NUMA id 0 in most cases).  umem
> >in native cases will be locked in kernel and will not be able to
> >migrate.  So, without this patch we always have all devices polled
> >by threads from NUMA 0 and all the memory will be on NUMA 0.
> >The only cross-NUMA access will be done by device DMA, which is OK
> >and usually the fastest cross-NUMA scenario.  But with this patch
> >applied device will be polled by the thread from NUMA 1, part of
> >the memory will migrate creating random performance spikes, but
> >locked umem will remain on NUMA 0.  In this case both DMA and PMD
> >will perform cross-NUMA memory accesses all the time stressing
> >the QPI and degrading performance significantly.
> >
> >So, the question is: Should we merge this now or wait and merge
> >along with correct NUMA-aware memory allocations?
> >Thoughts?
> 
> Mentioned this in the v1 (or v2) patch and William was working on the umem
> allocation on the correct numa in a different patch. William any update on
> this?
> 
Hi Eelco and Ilya,

Thanks for explaining the concern. Let's make it correct at once.
So put on hold this patch and let me finish the NUMA-aware memory
allocation.

--William
diff mbox series

Patch

diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst
index a136db0c950a..fa898912b65b 100644
--- a/Documentation/intro/install/afxdp.rst
+++ b/Documentation/intro/install/afxdp.rst
@@ -317,7 +317,6 @@  Below is a script using namespaces and veth peer::
 
 Limitations/Known Issues
 ------------------------
-#. Device's numa ID is always 0, need a way to find numa id from a netdev.
 #. No QoS support because AF_XDP netdev by-pass the Linux TC layer. A possible
    work-around is to use OpenFlow meter action.
 #. Most of the tests are done using i40e single port. Multiple ports and
diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index af654d498a88..270a5ab0c69b 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -626,17 +626,6 @@  out:
     return err;
 }
 
-int
-netdev_afxdp_get_numa_id(const struct netdev *netdev)
-{
-    /* FIXME: Get netdev's PCIe device ID, then find
-     * its NUMA node id.
-     */
-    VLOG_INFO("FIXME: Device %s always use numa id 0.",
-              netdev_get_name(netdev));
-    return 0;
-}
-
 static void
 xsk_remove_xdp_program(uint32_t ifindex, int xdpmode)
 {
diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
index c14f2fb81bb0..41079ae0c1b5 100644
--- a/lib/netdev-linux-private.h
+++ b/lib/netdev-linux-private.h
@@ -96,6 +96,8 @@  struct netdev_linux {
     /* LAG information. */
     bool is_lag_master;         /* True if the netdev is a LAG master. */
 
+    int numa_id;                /* NUMA node id. */
+
 #ifdef HAVE_AF_XDP
     /* AF_XDP information. */
     struct xsk_socket_info **xsks;
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index f4819237370a..69b85a006462 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -236,6 +236,7 @@  enum {
     VALID_VPORT_STAT_ERROR  = 1 << 5,
     VALID_DRVINFO           = 1 << 6,
     VALID_FEATURES          = 1 << 7,
+    VALID_NUMA_ID           = 1 << 8,
 };
 
 struct linux_lag_slave {
@@ -820,9 +821,9 @@  netdev_linux_update__(struct netdev_linux *dev,
 {
     if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
         if (change->nlmsg_type == RTM_NEWLINK) {
-            /* Keep drv-info, and ip addresses. */
+            /* Keep drv-info, ip addresses, and NUMA id. */
             netdev_linux_changed(dev, change->ifi_flags,
-                                 VALID_DRVINFO | VALID_IN);
+                                 VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);
 
             /* Update netdev from rtnl-change msg. */
             if (change->mtu) {
@@ -1391,6 +1392,66 @@  netdev_linux_tap_batch_send(struct netdev *netdev_,
     return 0;
 }
 
+static int
+netdev_linux_get_numa_id__(struct netdev_linux *netdev)
+    OVS_REQUIRES(netdev->mutex)
+{
+    char *numa_node_path;
+    const char *name;
+    int node_id;
+    FILE *stream;
+
+    if (netdev->cache_valid & VALID_NUMA_ID) {
+        return netdev->numa_id;
+    }
+
+    netdev->numa_id = 0;
+    netdev->cache_valid |= VALID_NUMA_ID;
+
+    name = netdev_get_name(&netdev->up);
+    if (strpbrk(name, "/\\")) {
+        VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
+                    "A valid name must not include '/' or '\\'."
+                    "Using numa_id 0", name);
+        return 0;
+    }
+
+    numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);
+
+    stream = fopen(numa_node_path, "r");
+    if (!stream) {
+        /* Virtual device does not have this info. */
+        VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 0",
+                     name, numa_node_path, ovs_strerror(errno));
+        free(numa_node_path);
+        return 0;
+    }
+
+    if (fscanf(stream, "%d", &node_id) != 1
+        || !ovs_numa_numa_id_is_valid(node_id))  {
+        VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 0", name);
+        node_id = 0;
+    }
+
+    netdev->numa_id = node_id;
+    fclose(stream);
+    free(numa_node_path);
+    return node_id;
+}
+
+static int OVS_UNUSED
+netdev_linux_get_numa_id(const struct netdev *netdev_)
+{
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    int numa_id;
+
+    ovs_mutex_lock(&netdev->mutex);
+    numa_id = netdev_linux_get_numa_id__(netdev);
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return numa_id;
+}
+
 /* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
  * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
  * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
@@ -3298,7 +3359,7 @@  const struct netdev_class netdev_afxdp_class = {
     .set_config = netdev_afxdp_set_config,
     .get_config = netdev_afxdp_get_config,
     .reconfigure = netdev_afxdp_reconfigure,
-    .get_numa_id = netdev_afxdp_get_numa_id,
+    .get_numa_id = netdev_linux_get_numa_id,
     .send = netdev_afxdp_batch_send,
     .rxq_construct = netdev_afxdp_rxq_construct,
     .rxq_destruct = netdev_afxdp_rxq_destruct,