diff mbox series

[ovs-dev] ovs-thread: Avoid huge alignment on a base spinlock structure.

Message ID 20191216142200.16749-1-i.maximets@ovn.org
State Accepted
Delegated to: Ilya Maximets
Headers show
Series [ovs-dev] ovs-thread: Avoid huge alignment on a base spinlock structure. | expand

Commit Message

Ilya Maximets Dec. 16, 2019, 2:22 p.m. UTC
Marking the structure as 64 bytes aligned forces the compiler to produce
big holes in the containing structures in order to fulfill this
requirement.  Also, any structure that contains this one as a member
automatically inherits this huge alignment, making the resulting memory
layout inefficient.  For example, 'struct umem_pool' currently
uses 3 full cache lines (192 bytes) with only 32 bytes of actual data:

  struct umem_pool {
    int                        index;                /*  0   4 */
    unsigned int               size;                 /*  4   4 */

    /* XXX 56 bytes hole, try to pack */

    /* --- cacheline 1 boundary (64 bytes) --- */
    struct ovs_spin lock __attribute__((__aligned__(64))); /* 64  64 */

    /* XXX last struct has 48 bytes of padding */

    /* --- cacheline 2 boundary (128 bytes) --- */
    void * *                   array;                /* 128  8 */

    /* size: 192, cachelines: 3, members: 4 */
    /* sum members: 80, holes: 1, sum holes: 56 */
    /* padding: 56 */
    /* paddings: 1, sum paddings: 48 */
    /* forced alignments: 1, forced holes: 1, sum forced holes: 56 */
  } __attribute__((__aligned__(64)));

Actual alignment of a spin lock is required only for Tx queue locks
inside netdev-afxdp to avoid false sharing, in all other cases
alignment only produces inefficient memory usage.

Also, CACHE_LINE_SIZE macro should be used instead of 64 as different
platforms may have different cache line sizes.

Using PADDED_MEMBERS to avoid alignment inheritance.

CC: William Tu <u9012063@gmail.com>
Fixes: ae36d63d7e3c ("ovs-thread: Make struct spin lock cache aligned.")
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
---
 include/openvswitch/thread.h |  2 +-
 lib/netdev-afxdp.c           | 20 ++++++++++++++------
 lib/netdev-afxdp.h           | 13 +++++++------
 lib/netdev-linux-private.h   |  2 +-
 4 files changed, 23 insertions(+), 14 deletions(-)

Comments

William Tu Dec. 16, 2019, 6:21 p.m. UTC | #1
On Mon, Dec 16, 2019 at 03:22:00PM +0100, Ilya Maximets wrote:
> Marking the structure as 64 bytes aligned forces compiler to produce
> big holes in the containing structures in order to fulfill this
> requirement.  Also, any structure that contains this one as a member
> automatically inherits this huge alignment making resulted memory
> layout not efficient.  For example, 'struct umem_pool' currently
> uses 3 full cache lines (192 bytes) with only 32 bytes of actual data:
> 
>   struct umem_pool {
>     int                        index;                /*  0   4 */
>     unsigned int               size;                 /*  4   4 */
> 
>     /* XXX 56 bytes hole, try to pack */
> 
>     /* --- cacheline 1 boundary (64 bytes) --- */
>     struct ovs_spin lock __attribute__((__aligned__(64))); /* 64  64 */
> 
>     /* XXX last struct has 48 bytes of padding */
> 
>     /* --- cacheline 2 boundary (128 bytes) --- */
>     void * *                   array;                /* 128  8 */
> 
>     /* size: 192, cachelines: 3, members: 4 */
>     /* sum members: 80, holes: 1, sum holes: 56 */
>     /* padding: 56 */
>     /* paddings: 1, sum paddings: 48 */
>     /* forced alignments: 1, forced holes: 1, sum forced holes: 56 */
>   } __attribute__((__aligned__(64)));
> 
> Actual alignment of a spin lock is required only for Tx queue locks
> inside netdev-afxdp to avoid false sharing, in all other cases
> alignment only produces inefficient memory usage.

yes, now I realize umem->lock does not have the false
sharing problem.

> 
> Also, CACHE_LINE_SIZE macro should be used instead of 64 as different
> platforms may have different cache line sizes.
> 
> Using PADDED_MEMBERS to avoid alignment inheritance.
> 
> CC: William Tu <u9012063@gmail.com>
> Fixes: ae36d63d7e3c ("ovs-thread: Make struct spin lock cache aligned.")
> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
> ---

Somehow my pahole stops working, but the patch looks
good to me, and thanks for also fixing the include ordering.

Acked-by: William Tu <u9012063@gmail.com>

>  include/openvswitch/thread.h |  2 +-
>  lib/netdev-afxdp.c           | 20 ++++++++++++++------
>  lib/netdev-afxdp.h           | 13 +++++++------
>  lib/netdev-linux-private.h   |  2 +-
>  4 files changed, 23 insertions(+), 14 deletions(-)
> 
> diff --git a/include/openvswitch/thread.h b/include/openvswitch/thread.h
> index 5053cb309..acc822904 100644
> --- a/include/openvswitch/thread.h
> +++ b/include/openvswitch/thread.h
> @@ -34,7 +34,7 @@ struct OVS_LOCKABLE ovs_mutex {
>  };
>  
>  #ifdef HAVE_PTHREAD_SPIN_LOCK
> -OVS_ALIGNED_STRUCT(64, OVS_LOCKABLE ovs_spin) {
> +struct OVS_LOCKABLE ovs_spin {
>      pthread_spinlock_t lock;
>      const char *where;          /* NULL if and only if uninitialized. */
>  };
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index ca2dfd005..c59a671e7 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -40,6 +40,7 @@
>  #include "openvswitch/compiler.h"
>  #include "openvswitch/dynamic-string.h"
>  #include "openvswitch/list.h"
> +#include "openvswitch/thread.h"
>  #include "openvswitch/vlog.h"
>  #include "packets.h"
>  #include "socket-util.h"
> @@ -158,6 +159,13 @@ struct xsk_socket_info {
>      atomic_uint64_t tx_dropped;
>  };
>  
> +struct netdev_afxdp_tx_lock {
> +    /* Padding to make netdev_afxdp_tx_lock exactly one cache line long. */
> +    PADDED_MEMBERS(CACHE_LINE_SIZE,
> +        struct ovs_spin lock;
> +    );
> +};
> +
>  #ifdef HAVE_XDP_NEED_WAKEUP
>  static inline void
>  xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
> @@ -512,10 +520,10 @@ xsk_configure_all(struct netdev *netdev)
>      }
>  
>      n_txq = netdev_n_txq(netdev);
> -    dev->tx_locks = xcalloc(n_txq, sizeof *dev->tx_locks);
> +    dev->tx_locks = xzalloc_cacheline(n_txq * sizeof *dev->tx_locks);
>  
>      for (i = 0; i < n_txq; i++) {
> -        ovs_spin_init(&dev->tx_locks[i]);
> +        ovs_spin_init(&dev->tx_locks[i].lock);
>      }
>  
>      return 0;
> @@ -577,9 +585,9 @@ xsk_destroy_all(struct netdev *netdev)
>  
>      if (dev->tx_locks) {
>          for (i = 0; i < netdev_n_txq(netdev); i++) {
> -            ovs_spin_destroy(&dev->tx_locks[i]);
> +            ovs_spin_destroy(&dev->tx_locks[i].lock);
>          }
> -        free(dev->tx_locks);
> +        free_cacheline(dev->tx_locks);
>          dev->tx_locks = NULL;
>      }
>  }
> @@ -1076,9 +1084,9 @@ netdev_afxdp_batch_send(struct netdev *netdev, int qid,
>          dev = netdev_linux_cast(netdev);
>          qid = qid % netdev_n_txq(netdev);
>  
> -        ovs_spin_lock(&dev->tx_locks[qid]);
> +        ovs_spin_lock(&dev->tx_locks[qid].lock);
>          ret = __netdev_afxdp_batch_send(netdev, qid, batch);
> -        ovs_spin_unlock(&dev->tx_locks[qid]);
> +        ovs_spin_unlock(&dev->tx_locks[qid].lock);
>      } else {
>          ret = __netdev_afxdp_batch_send(netdev, qid, batch);
>      }
> diff --git a/lib/netdev-afxdp.h b/lib/netdev-afxdp.h
> index 8188bc669..ae6971efd 100644
> --- a/lib/netdev-afxdp.h
> +++ b/lib/netdev-afxdp.h
> @@ -34,15 +34,16 @@ enum afxdp_mode {
>      OVS_AF_XDP_MODE_MAX,
>  };
>  
> -struct netdev;
> -struct xsk_socket_info;
> -struct xdp_umem;
> -struct dp_packet_batch;
> -struct smap;
>  struct dp_packet;
> +struct dp_packet_batch;
> +struct netdev;
> +struct netdev_afxdp_tx_lock;
> +struct netdev_custom_stats;
>  struct netdev_rxq;
>  struct netdev_stats;
> -struct netdev_custom_stats;
> +struct smap;
> +struct xdp_umem;
> +struct xsk_socket_info;
>  
>  int netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_);
>  void netdev_afxdp_rxq_destruct(struct netdev_rxq *rxq_);
> diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
> index 8873caa9d..f08159aa7 100644
> --- a/lib/netdev-linux-private.h
> +++ b/lib/netdev-linux-private.h
> @@ -108,7 +108,7 @@ struct netdev_linux {
>      bool use_need_wakeup;
>      bool requested_need_wakeup;
>  
> -    struct ovs_spin *tx_locks;  /* spin lock array for TX queues. */
> +    struct netdev_afxdp_tx_lock *tx_locks;  /* Array of locks for TX queues. */
>  #endif
>  };
>  
> -- 
> 2.17.1
>
Ilya Maximets Dec. 19, 2019, 8:19 a.m. UTC | #2
On 16.12.2019 19:21, William Tu wrote:
> On Mon, Dec 16, 2019 at 03:22:00PM +0100, Ilya Maximets wrote:
>> Marking the structure as 64 bytes aligned forces compiler to produce
>> big holes in the containing structures in order to fulfill this
>> requirement.  Also, any structure that contains this one as a member
>> automatically inherits this huge alignment making resulted memory
>> layout not efficient.  For example, 'struct umem_pool' currently
>> uses 3 full cache lines (192 bytes) with only 32 bytes of actual data:
>>
>>   struct umem_pool {
>>     int                        index;                /*  0   4 */
>>     unsigned int               size;                 /*  4   4 */
>>
>>     /* XXX 56 bytes hole, try to pack */
>>
>>     /* --- cacheline 1 boundary (64 bytes) --- */
>>     struct ovs_spin lock __attribute__((__aligned__(64))); /* 64  64 */
>>
>>     /* XXX last struct has 48 bytes of padding */
>>
>>     /* --- cacheline 2 boundary (128 bytes) --- */
>>     void * *                   array;                /* 128  8 */
>>
>>     /* size: 192, cachelines: 3, members: 4 */
>>     /* sum members: 80, holes: 1, sum holes: 56 */
>>     /* padding: 56 */
>>     /* paddings: 1, sum paddings: 48 */
>>     /* forced alignments: 1, forced holes: 1, sum forced holes: 56 */
>>   } __attribute__((__aligned__(64)));
>>
>> Actual alignment of a spin lock is required only for Tx queue locks
>> inside netdev-afxdp to avoid false sharing, in all other cases
>> alignment only produces inefficient memory usage.
> 
> yes, now I realize umem->lock does not have the false
> sharing problem.
> 
>>
>> Also, CACHE_LINE_SIZE macro should be used instead of 64 as different
>> platforms may have different cache line sizes.
>>
>> Using PADDED_MEMBERS to avoid alignment inheritance.
>>
>> CC: William Tu <u9012063@gmail.com>
>> Fixes: ae36d63d7e3c ("ovs-thread: Make struct spin lock cache aligned.")
>> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
>> ---
> 
> Somehow my pahole stops working, but the patch looks
> good to me, and thanks for also fixing the include ordering.

pahole is broken on Ubuntu 18.04 at least.  Had to copy binary with
dependencies from the fedora package.

> 
> Acked-by: William Tu <u9012063@gmail.com>

Thanks! Applied to master.

Best regards, Ilya Maximets.
diff mbox series

Patch

diff --git a/include/openvswitch/thread.h b/include/openvswitch/thread.h
index 5053cb309..acc822904 100644
--- a/include/openvswitch/thread.h
+++ b/include/openvswitch/thread.h
@@ -34,7 +34,7 @@  struct OVS_LOCKABLE ovs_mutex {
 };
 
 #ifdef HAVE_PTHREAD_SPIN_LOCK
-OVS_ALIGNED_STRUCT(64, OVS_LOCKABLE ovs_spin) {
+struct OVS_LOCKABLE ovs_spin {
     pthread_spinlock_t lock;
     const char *where;          /* NULL if and only if uninitialized. */
 };
diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index ca2dfd005..c59a671e7 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -40,6 +40,7 @@ 
 #include "openvswitch/compiler.h"
 #include "openvswitch/dynamic-string.h"
 #include "openvswitch/list.h"
+#include "openvswitch/thread.h"
 #include "openvswitch/vlog.h"
 #include "packets.h"
 #include "socket-util.h"
@@ -158,6 +159,13 @@  struct xsk_socket_info {
     atomic_uint64_t tx_dropped;
 };
 
+struct netdev_afxdp_tx_lock {
+    /* Padding to make netdev_afxdp_tx_lock exactly one cache line long. */
+    PADDED_MEMBERS(CACHE_LINE_SIZE,
+        struct ovs_spin lock;
+    );
+};
+
 #ifdef HAVE_XDP_NEED_WAKEUP
 static inline void
 xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
@@ -512,10 +520,10 @@  xsk_configure_all(struct netdev *netdev)
     }
 
     n_txq = netdev_n_txq(netdev);
-    dev->tx_locks = xcalloc(n_txq, sizeof *dev->tx_locks);
+    dev->tx_locks = xzalloc_cacheline(n_txq * sizeof *dev->tx_locks);
 
     for (i = 0; i < n_txq; i++) {
-        ovs_spin_init(&dev->tx_locks[i]);
+        ovs_spin_init(&dev->tx_locks[i].lock);
     }
 
     return 0;
@@ -577,9 +585,9 @@  xsk_destroy_all(struct netdev *netdev)
 
     if (dev->tx_locks) {
         for (i = 0; i < netdev_n_txq(netdev); i++) {
-            ovs_spin_destroy(&dev->tx_locks[i]);
+            ovs_spin_destroy(&dev->tx_locks[i].lock);
         }
-        free(dev->tx_locks);
+        free_cacheline(dev->tx_locks);
         dev->tx_locks = NULL;
     }
 }
@@ -1076,9 +1084,9 @@  netdev_afxdp_batch_send(struct netdev *netdev, int qid,
         dev = netdev_linux_cast(netdev);
         qid = qid % netdev_n_txq(netdev);
 
-        ovs_spin_lock(&dev->tx_locks[qid]);
+        ovs_spin_lock(&dev->tx_locks[qid].lock);
         ret = __netdev_afxdp_batch_send(netdev, qid, batch);
-        ovs_spin_unlock(&dev->tx_locks[qid]);
+        ovs_spin_unlock(&dev->tx_locks[qid].lock);
     } else {
         ret = __netdev_afxdp_batch_send(netdev, qid, batch);
     }
diff --git a/lib/netdev-afxdp.h b/lib/netdev-afxdp.h
index 8188bc669..ae6971efd 100644
--- a/lib/netdev-afxdp.h
+++ b/lib/netdev-afxdp.h
@@ -34,15 +34,16 @@  enum afxdp_mode {
     OVS_AF_XDP_MODE_MAX,
 };
 
-struct netdev;
-struct xsk_socket_info;
-struct xdp_umem;
-struct dp_packet_batch;
-struct smap;
 struct dp_packet;
+struct dp_packet_batch;
+struct netdev;
+struct netdev_afxdp_tx_lock;
+struct netdev_custom_stats;
 struct netdev_rxq;
 struct netdev_stats;
-struct netdev_custom_stats;
+struct smap;
+struct xdp_umem;
+struct xsk_socket_info;
 
 int netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_);
 void netdev_afxdp_rxq_destruct(struct netdev_rxq *rxq_);
diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
index 8873caa9d..f08159aa7 100644
--- a/lib/netdev-linux-private.h
+++ b/lib/netdev-linux-private.h
@@ -108,7 +108,7 @@  struct netdev_linux {
     bool use_need_wakeup;
     bool requested_need_wakeup;
 
-    struct ovs_spin *tx_locks;  /* spin lock array for TX queues. */
+    struct netdev_afxdp_tx_lock *tx_locks;  /* Array of locks for TX queues. */
 #endif
 };