
[ovs-dev,RFC] netdev-afxdp: Enable shared umem support.

Message ID 1572992200-44323-1-git-send-email-u9012063@gmail.com
State RFC

Commit Message

William Tu Nov. 5, 2019, 10:16 p.m. UTC
This RFC patch enables shared umem support. It requires kernel and
libbpf changes, which I will post in another thread. I tested with
multiple afxdp ports using skb mode.  For example:
  ovs-vsctl -- set interface afxdp-p0 options:n_rxq=1 type="afxdp" options:xdpmode=skb
  ovs-vsctl -- set interface afxdp-p1 options:n_rxq=1 type="afxdp" options:xdpmode=skb
The two ports will then share one umem instead of using two.

Note that once a shared umem is created with a specific mode (e.g.,
XDP_COPY), a netdev that shares this umem cannot change its mode.  So
I'm thinking about using just one shared umem for all skb-mode netdevs
and keeping a dedicated umem for each drv-mode netdev.  Or should we
create one umem per mode, so that the drv-mode netdevs also share one
umem?
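
To make the one-umem-per-mode idea concrete, here is a minimal sketch
(the enum and lookup helper are hypothetical, not part of this patch);
it reuses xsk_configure_umem() and the refcount field from the diff
below:

    /* Hypothetical per-mode umem table replacing the single
     * shared_umem pointer.  Names are illustrative only. */
    enum { UMEM_MODE_SKB, UMEM_MODE_DRV, UMEM_MODE_MAX };

    static struct xsk_umem_info *shared_umems[UMEM_MODE_MAX];

    static struct xsk_umem_info *
    shared_umem_get(int xdpmode)
    {
        int i = xdpmode == XDP_ZEROCOPY ? UMEM_MODE_DRV : UMEM_MODE_SKB;

        if (!shared_umems[i]) {
            void *bufs = xmalloc_pagealign(NUM_FRAMES * FRAME_SIZE);

            memset(bufs, 0, NUM_FRAMES * FRAME_SIZE);
            shared_umems[i] = xsk_configure_umem(bufs,
                                                 NUM_FRAMES * FRAME_SIZE,
                                                 xdpmode);
            if (!shared_umems[i]) {
                free_pagealign(bufs);
                return NULL;
            }
        }
        shared_umems[i]->refcount++;
        return shared_umems[i];
    }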

Any comments are welcome.

Suggested-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: William Tu <u9012063@gmail.com>
---
 lib/netdev-afxdp.c | 97 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 27 deletions(-)

Comments

Eelco Chaudron Nov. 6, 2019, 7:49 a.m. UTC | #1
On 5 Nov 2019, at 23:16, William Tu wrote:

> This RFC patch enables shared umem support. It requires kernel and
> libbpf changes, which I will post in another thread. I tested with
> multiple afxdp ports using skb mode.  For example:
>   ovs-vsctl -- set interface afxdp-p0 options:n_rxq=1 type="afxdp" options:xdpmode=skb
>   ovs-vsctl -- set interface afxdp-p1 options:n_rxq=1 type="afxdp" options:xdpmode=skb
> The two ports will then share one umem instead of using two.
>
> Note that once a shared umem is created with a specific mode (e.g.,
> XDP_COPY), a netdev that shares this umem cannot change its mode.  So
> I'm thinking about using just one shared umem for all skb-mode netdevs
> and keeping a dedicated umem for each drv-mode netdev.  Or should we
> create one umem per mode, so that the drv-mode netdevs also share one
> umem?


Hi William,

Did not go through the entire patch, but wasn't Magnus hinting at not
using the shared umem rings but just the buffers they point to?

This way you do not need any locking when doing the ring operations,
only when getting buffers. I guess this is more like the mbuf
implementation in DPDK.
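
Something like the rough sketch below (hypothetical use of the patch's
umem_elem_pop_n(); not part of the patch): ring operations stay
per-socket and lock-free, and only the shared buffer pool takes the
mutex.

    /* Sketch only: lock the shared buffer pool, not the rings. */
    static void *
    shared_buf_get(struct xsk_umem_info *umem)
    {
        void *elem = NULL;

        ovs_mutex_lock(&umem->mutex);
        if (umem_elem_pop_n(&umem->mpool, 1, &elem)) {
            elem = NULL;    /* Pool empty. */
        }
        ovs_mutex_unlock(&umem->mutex);
        return elem;
    }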

//Eelco

William Tu Nov. 6, 2019, 6:17 p.m. UTC | #2
On Wed, Nov 06, 2019 at 08:49:53AM +0100, Eelco Chaudron wrote:
> 
> 
> On 5 Nov 2019, at 23:16, William Tu wrote:
> 
> >This RFC patch enables shared umem support. It requires kernel and
> >libbpf changes, which I will post in another thread. I tested with
> >multiple afxdp ports using skb mode.  For example:
> >  ovs-vsctl -- set interface afxdp-p0 options:n_rxq=1 type="afxdp" options:xdpmode=skb
> >  ovs-vsctl -- set interface afxdp-p1 options:n_rxq=1 type="afxdp" options:xdpmode=skb
> >The two ports will then share one umem instead of using two.
> >
> >Note that once a shared umem is created with a specific mode (e.g.,
> >XDP_COPY), a netdev that shares this umem cannot change its mode.  So
> >I'm thinking about using just one shared umem for all skb-mode netdevs
> >and keeping a dedicated umem for each drv-mode netdev.  Or should we
> >create one umem per mode, so that the drv-mode netdevs also share one
> >umem?
> 
> 
> Hi William,
> 
> Did not go through the entire patch, but wasn't Magnus hinting at not
> using the shared umem rings but just the buffers they point to?
> 
> This way you do not need any locking when doing the ring operations,
> only when getting buffers. I guess this is more like the mbuf
> implementation in DPDK.
Hi Eelco,

You're right, thanks! I was following the old sample code in the
kernel, xdpsock_user.c, which shares a single cq and fq among multiple
xsks.

Based on Magnus's suggestion, we could do:
"
you can register the same umem
area multiple times (creating multiple umem handles and multiple fqs
and cqs) to be able to support xsk sockets that have different queue
ids, but the same umem area. In both cases you need a mempool that can
handle multiple threads.
"

Magnus has replied in another email thread; I will wait for his patch
set.

Regards,
William

Patch

diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index 3037770b27cb..42767e1e27f3 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -95,6 +95,8 @@  static void xsk_destroy(struct xsk_socket_info *xsk);
 static int xsk_configure_all(struct netdev *netdev);
 static void xsk_destroy_all(struct netdev *netdev);
 
+static struct xsk_umem_info *shared_umem;
+
 struct unused_pool {
     struct xsk_umem_info *umem_info;
     int lost_in_rings; /* Number of packets left in tx, rx, cq and fq. */
@@ -112,6 +114,8 @@  struct xsk_umem_info {
     struct xsk_ring_cons cq;
     struct xsk_umem *umem;
     void *buffer;
+    int refcount;
+    struct ovs_mutex mutex;
 };
 
 struct xsk_socket_info {
@@ -228,6 +232,7 @@  xsk_configure_umem(void *buffer, uint64_t size, int xdpmode)
     uconfig.comp_size = CONS_NUM_DESCS;
     uconfig.frame_size = FRAME_SIZE;
     uconfig.frame_headroom = OVS_XDP_HEADROOM;
+    ovs_mutex_init(&umem->mutex);
 
     ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
                            &uconfig);
@@ -296,6 +301,7 @@  xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
     struct xsk_socket_info *xsk;
     char devname[IF_NAMESIZE];
     uint32_t idx = 0, prog_id;
+    bool shared;
     int ret;
     int i;
 
@@ -304,6 +310,7 @@  xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
     cfg.rx_size = CONS_NUM_DESCS;
     cfg.tx_size = PROD_NUM_DESCS;
     cfg.libbpf_flags = 0;
+    shared = umem->refcount > 1;
 
     if (xdpmode == XDP_ZEROCOPY) {
         cfg.bind_flags = XDP_ZEROCOPY;
@@ -319,6 +326,10 @@  xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
     }
 #endif
 
+    if (shared) {
+        cfg.bind_flags = XDP_SHARED_UMEM;
+    }
+
     if (if_indextoname(ifindex, devname) == NULL) {
         VLOG_ERR("ifindex %d to devname failed (%s)",
                  ifindex, ovs_strerror(errno));
@@ -352,6 +363,11 @@  xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
         return NULL;
     }
 
+    if (shared) {
+        return xsk;
+    }
+
+    /* Only the first socket on the umem needs the FILL queue setup below. */
     while (!xsk_ring_prod__reserve(&xsk->umem->fq,
                                    PROD_NUM_DESCS, &idx)) {
         VLOG_WARN_RL(&rl, "Retry xsk_ring_prod__reserve to FILL queue");
@@ -380,33 +396,43 @@  xsk_configure(int ifindex, int xdp_queue_id, int xdpmode,
 {
     struct xsk_socket_info *xsk;
     struct xsk_umem_info *umem;
-    void *bufs;
+    void *bufs = NULL;
 
     netdev_afxdp_sweep_unused_pools(NULL);
 
-    /* Umem memory region. */
-    bufs = xmalloc_pagealign(NUM_FRAMES * FRAME_SIZE);
-    memset(bufs, 0, NUM_FRAMES * FRAME_SIZE);
+    if (!shared_umem) {
+        /* Umem memory region. */
+        bufs = xmalloc_pagealign(NUM_FRAMES * FRAME_SIZE);
+        memset(bufs, 0, NUM_FRAMES * FRAME_SIZE);
+
+        /* Create AF_XDP socket. */
+        umem = xsk_configure_umem(bufs,
+                                  NUM_FRAMES * FRAME_SIZE,
+                                  xdpmode);
+        if (!umem) {
+            free_pagealign(bufs);
+            return NULL;
+        }
 
-    /* Create AF_XDP socket. */
-    umem = xsk_configure_umem(bufs,
-                              NUM_FRAMES * FRAME_SIZE,
-                              xdpmode);
-    if (!umem) {
-        free_pagealign(bufs);
-        return NULL;
+        shared_umem = umem;
+        umem->refcount++;
+        VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);
+    } else {
+        umem = shared_umem;
+        umem->refcount++;
     }
 
-    VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);
-
     xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, xdpmode,
                                use_need_wakeup);
-    if (!xsk) {
+    if (!xsk && !--umem->refcount) {
         /* Clean up umem and xpacket pool. */
+        shared_umem = NULL;
         if (xsk_umem__delete(umem->umem)) {
             VLOG_ERR("xsk_umem__delete failed.");
         }
-        free_pagealign(bufs);
+        if (bufs) {
+            free_pagealign(bufs);
+        }
         umem_pool_cleanup(&umem->mpool);
         xpacket_pool_cleanup(&umem->xpool);
         free(umem);
@@ -472,21 +498,29 @@  xsk_destroy(struct xsk_socket_info *xsk_info)
     xsk_info->xsk = NULL;
 
     umem = xsk_info->umem->umem;
-    if (xsk_umem__delete(umem)) {
-        VLOG_ERR("xsk_umem__delete failed.");
-    }
+    xsk_info->umem->refcount--;
 
-    pool = xzalloc(sizeof *pool);
-    pool->umem_info = xsk_info->umem;
-    pool->lost_in_rings = xsk_info->outstanding_tx + xsk_info->available_rx;
+    if (!xsk_info->umem->refcount) {
+        VLOG_WARN("destroy umem");
+        shared_umem = NULL;
+        if (xsk_umem__delete(umem)) {
+            VLOG_ERR("xsk_umem__delete failed.");
+        }
+        ovs_mutex_destroy(&xsk_info->umem->mutex);
 
-    ovs_mutex_lock(&unused_pools_mutex);
-    ovs_list_push_back(&unused_pools, &pool->list_node);
-    ovs_mutex_unlock(&unused_pools_mutex);
+        pool = xzalloc(sizeof *pool);
+        pool->umem_info = xsk_info->umem;
+        pool->lost_in_rings = xsk_info->outstanding_tx +
+                              xsk_info->available_rx;
 
-    free(xsk_info);
+        ovs_mutex_lock(&unused_pools_mutex);
+        ovs_list_push_back(&unused_pools, &pool->list_node);
+        ovs_mutex_unlock(&unused_pools_mutex);
 
-    netdev_afxdp_sweep_unused_pools(NULL);
+        free(xsk_info);
+
+        netdev_afxdp_sweep_unused_pools(NULL);
+    }
 }
 
 static void
@@ -733,11 +767,14 @@  netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
     prepare_fill_queue(xsk_info);
 
     umem = xsk_info->umem;
+    ovs_mutex_lock(&umem->mutex);
+
     rx->fd = xsk_socket__fd(xsk_info->xsk);
 
     rcvd = xsk_ring_cons__peek(&xsk_info->rx, BATCH_SIZE, &idx_rx);
     if (!rcvd) {
         xsk_rx_wakeup_if_needed(umem, netdev, rx->fd);
+        ovs_mutex_unlock(&umem->mutex);
         return EAGAIN;
     }
 
@@ -778,6 +815,7 @@  netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
         /* TODO: return the number of remaining packets in the queue. */
         *qfill = 0;
     }
+    ovs_mutex_unlock(&umem->mutex);
     return 0;
 }
 
@@ -924,7 +962,7 @@  __netdev_afxdp_batch_send(struct netdev *netdev, int qid,
     struct netdev_linux *dev = netdev_linux_cast(netdev);
     struct xsk_socket_info *xsk_info;
     void *elems_pop[BATCH_SIZE];
-    struct xsk_umem_info *umem;
+    struct xsk_umem_info *umem = NULL;
     struct dp_packet *packet;
     bool free_batch = false;
     unsigned long orig;
@@ -942,6 +980,8 @@  __netdev_afxdp_batch_send(struct netdev *netdev, int qid,
     free_batch = check_free_batch(batch);
 
     umem = xsk_info->umem;
+    ovs_mutex_lock(&umem->mutex);
+
     ret = umem_elem_pop_n(&umem->mpool, dp_packet_batch_size(batch),
                           elems_pop);
     if (OVS_UNLIKELY(ret)) {
@@ -993,6 +1033,9 @@  __netdev_afxdp_batch_send(struct netdev *netdev, int qid,
     }
 
 out:
+    if (umem) {
+        ovs_mutex_unlock(&umem->mutex);
+    }
     if (free_batch) {
         free_afxdp_buf_batch(batch);
     } else {