diff mbox series

[ovs-dev,v3] dpif-netdev: Expand the meter capacity.

Message ID 20210512091736.48172-1-xiangxia.m.yue@gmail.com
State Accepted
Headers show
Series [ovs-dev,v3] dpif-netdev: Expand the meter capacity. | expand

Commit Message

Tonghao Zhang May 12, 2021, 9:17 a.m. UTC
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

For now, ovs-vswitchd use the array of the dp_meter struct
to store meter's data, and at most, there are only 65536
(defined by MAX_METERS) meters that can be used. But in some
case, for example, in the edge gateway, we should use 200,000+,
at least, meters for IP address bandwidth limitation.
Every one IP address will use two meters for its rx and tx
path[1]. In other way, ovs-vswitchd should support meter-offload
(rte_mtr_xxx api introduced by dpdk.), but there are more than
65536 meters in the hardware, such as Mellanox ConnectX-6.

This patch use cmap to manage the meter, instead of the array.

* Insertion performance, ovs-ofctl add-meter 1000+ meters,
  the cmap takes abount 4000ms, as same as previous implementation.
* Lookup performance in datapath, we add 1000+ meters which rate limit
  are 10Gbps (the NIC cards are 10Gbps, so netdev-datapath will not
  drop the packets.), and a flow which only forward packets from p0
  to p1, with meter action[2]. On other machine, pktgen-dpdk will
  generate 64B packets to p0.

  The forwarding performance always is 1324 Kpps on my server
  which CPU is Intel E5-2650, 2.00GHz.

[1].
$ in_port=p0,ip,ip_dst=1.1.1.x action=meter:n,output:p1
$ in_port=p1,ip,ip_src=1.1.1.x action=meter:m,output:p0

[2].
$ in_port=p0 action=meter:100,output:p1

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
v3:
* update the commit message
* remove dp_netdev_meter struct
* remove create_dp_netdev function
* don't use the hash_basis
* use the meter_id as a hash instead of hash_xxx function. see *dp_meter_hash for details
* fix coding style
* v2: http://patchwork.ozlabs.org/project/openvswitch/patch/1584254601-7321-1-git-send-email-xiangxia.m.yue@gmail.com/
---
 lib/dpif-netdev.c | 158 ++++++++++++++++++++++++++++------------------
 1 file changed, 97 insertions(+), 61 deletions(-)

Comments

0-day Robot May 12, 2021, 10 a.m. UTC | #1
Bleep bloop.  Greetings Tonghao Zhang, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Comment with 'xxx' marker
#107 FILE: lib/dpif-netdev.c:1745:
     * Use them directly instead of hash_xxx function

Lines checked: 333, Warnings: 1, Errors: 0


Please check this out.  If you feel there has been an error, please email aconole@redhat.com

Thanks,
0-day Robot
Tonghao Zhang June 11, 2021, 8:33 a.m. UTC | #2
On Wed, May 12, 2021 at 5:18 PM <xiangxia.m.yue@gmail.com> wrote:
>
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> For now, ovs-vswitchd use the array of the dp_meter struct
> to store meter's data, and at most, there are only 65536
> (defined by MAX_METERS) meters that can be used. But in some
> case, for example, in the edge gateway, we should use 200,000+,
> at least, meters for IP address bandwidth limitation.
> Every one IP address will use two meters for its rx and tx
> path[1]. In other way, ovs-vswitchd should support meter-offload
> (rte_mtr_xxx api introduced by dpdk.), but there are more than
> 65536 meters in the hardware, such as Mellanox ConnectX-6.
>
> This patch use cmap to manage the meter, instead of the array.
>
> * Insertion performance, ovs-ofctl add-meter 1000+ meters,
>   the cmap takes abount 4000ms, as same as previous implementation.
> * Lookup performance in datapath, we add 1000+ meters which rate limit
>   are 10Gbps (the NIC cards are 10Gbps, so netdev-datapath will not
>   drop the packets.), and a flow which only forward packets from p0
>   to p1, with meter action[2]. On other machine, pktgen-dpdk will
>   generate 64B packets to p0.
>
>   The forwarding performance always is 1324 Kpps on my server
>   which CPU is Intel E5-2650, 2.00GHz.
>
> [1].
> $ in_port=p0,ip,ip_dst=1.1.1.x action=meter:n,output:p1
> $ in_port=p1,ip,ip_src=1.1.1.x action=meter:m,output:p0
>
> [2].
> $ in_port=p0 action=meter:100,output:p1
>
> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Hi, friendly ping
> ---
> v3:
> * update the commit message
> * remove dp_netdev_meter struct
> * remove create_dp_netdev function
> * don't use the hash_basis
> * use the meter_id as a hash instead of hash_xxx function. see *dp_meter_hash for details
> * fix coding style
> * v2: http://patchwork.ozlabs.org/project/openvswitch/patch/1584254601-7321-1-git-send-email-xiangxia.m.yue@gmail.com/
> ---
>  lib/dpif-netdev.c | 158 ++++++++++++++++++++++++++++------------------
>  1 file changed, 97 insertions(+), 61 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 251788b04965..f800c8c65e13 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -99,9 +99,8 @@ DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
>  #define DEFAULT_TX_FLUSH_INTERVAL 0
>
>  /* Configuration parameters. */
> -enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
> +enum { MAX_METERS = 1 << 18 };    /* Maximum number of meters. */
>  enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
> -enum { N_METER_LOCKS = 64 };    /* Maximum number of meters. */
>
>  COVERAGE_DEFINE(datapath_drop_meter);
>  COVERAGE_DEFINE(datapath_drop_upcall_error);
> @@ -287,6 +286,9 @@ struct dp_meter_band {
>  };
>
>  struct dp_meter {
> +    struct cmap_node node;
> +    struct ovs_mutex lock;
> +    uint32_t id;
>      uint16_t flags;
>      uint16_t n_bands;
>      uint32_t max_delta_t;
> @@ -339,8 +341,8 @@ struct dp_netdev {
>      atomic_uint32_t tx_flush_interval;
>
>      /* Meters. */
> -    struct ovs_mutex meter_locks[N_METER_LOCKS];
> -    struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
> +    struct ovs_mutex meters_lock;
> +    struct cmap meters OVS_GUARDED;
>
>      /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
>      OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
> @@ -392,19 +394,6 @@ struct dp_netdev {
>      struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
>  };
>
> -static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
> -    OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> -    ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
> -}
> -
> -static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
> -    OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> -    ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
> -}
> -
> -
>  static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
>                                                      odp_port_t)
>      OVS_REQUIRES(dp->port_mutex);
> @@ -1747,6 +1736,68 @@ choose_port(struct dp_netdev *dp, const char *name)
>      return ODPP_NONE;
>  }
>
> +static uint32_t
> +dp_meter_hash(uint32_t meter_id)
> +{
> +    /* In the ofproto-dpif layer, we use the id-pool
> +     * to alloc meter id orderly (e.g. 1, 2, ... N.),
> +     * which provide a better hash distribution.
> +     * Use them directly instead of hash_xxx function
> +     * for achieving high-performance.
> +     */
> +    return meter_id;
> +}
> +
> +static void
> +dp_netdev_meter_destroy(struct dp_netdev *dp)
> +{
> +    struct dp_meter *m;
> +
> +    ovs_mutex_lock(&dp->meters_lock);
> +    CMAP_FOR_EACH (m, node, &dp->meters) {
> +        cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id));
> +        ovsrcu_postpone(free, m);
> +    }
> +
> +    cmap_destroy(&dp->meters);
> +    ovs_mutex_unlock(&dp->meters_lock);
> +
> +    ovs_mutex_destroy(&dp->meters_lock);
> +}
> +
> +static struct dp_meter*
> +dp_meter_lookup(struct cmap *meters, uint32_t meter_id)
> +{
> +    uint32_t hash = dp_meter_hash(meter_id);
> +    struct dp_meter *m;
> +
> +    CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) {
> +        if (m->id == meter_id) {
> +            return m;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +static void
> +dp_meter_detach_free(struct cmap *meters, uint32_t meter_id)
> +{
> +    struct dp_meter *m;
> +
> +    m = dp_meter_lookup(meters, meter_id);
> +    if (m) {
> +            cmap_remove(meters, &m->node, dp_meter_hash(meter_id));
> +            ovsrcu_postpone(free, m);
> +    }
> +}
> +
> +static void
> +dp_meter_attach(struct cmap *meters, struct dp_meter *meter)
> +{
> +    cmap_insert(meters, &meter->node, dp_meter_hash(meter->id));
> +}
> +
>  static int
>  create_dp_netdev(const char *name, const struct dpif_class *class,
>                   struct dp_netdev **dpp)
> @@ -1783,9 +1834,9 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
>      dp->reconfigure_seq = seq_create();
>      dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
>
> -    for (int i = 0; i < N_METER_LOCKS; ++i) {
> -        ovs_mutex_init_adaptive(&dp->meter_locks[i]);
> -    }
> +    /* Init meter resources. */
> +    cmap_init(&dp->meters);
> +    ovs_mutex_init(&dp->meters_lock);
>
>      /* Disable upcalls by default. */
>      dp_netdev_disable_upcall(dp);
> @@ -1874,16 +1925,6 @@ dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
>      fat_rwlock_destroy(&dp->upcall_rwlock);
>  }
>
> -static void
> -dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
> -    OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> -    if (dp->meters[meter_id]) {
> -        free(dp->meters[meter_id]);
> -        dp->meters[meter_id] = NULL;
> -    }
> -}
> -
>  static uint32_t
>  hash_bond_id(uint32_t bond_id)
>  {
> @@ -1938,16 +1979,7 @@ dp_netdev_free(struct dp_netdev *dp)
>      /* Upcalls must be disabled at this point */
>      dp_netdev_destroy_upcall_lock(dp);
>
> -    int i;
> -
> -    for (i = 0; i < MAX_METERS; ++i) {
> -        meter_lock(dp, i);
> -        dp_delete_meter(dp, i);
> -        meter_unlock(dp, i);
> -    }
> -    for (i = 0; i < N_METER_LOCKS; ++i) {
> -        ovs_mutex_destroy(&dp->meter_locks[i]);
> -    }
> +    dp_netdev_meter_destroy(dp);
>
>      free(dp->pmd_cmask);
>      free(CONST_CAST(char *, dp->name));
> @@ -6175,10 +6207,9 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
>          return;
>      }
>
> -    meter_lock(dp, meter_id);
> -    meter = dp->meters[meter_id];
> +    meter = dp_meter_lookup(&dp->meters, meter_id);
>      if (!meter) {
> -        goto out;
> +        return;
>      }
>
>      /* Initialize as negative values. */
> @@ -6186,6 +6217,7 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
>      /* Initialize as zeroes. */
>      memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
>
> +    ovs_mutex_lock(&meter->lock);
>      /* All packets will hit the meter at the same time. */
>      long_delta_t = now / 1000 - meter->used / 1000; /* msec */
>
> @@ -6301,8 +6333,8 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
>              dp_packet_batch_refill(packets_, packet, j);
>          }
>      }
> - out:
> -    meter_unlock(dp, meter_id);
> +
> +    ovs_mutex_unlock(&meter->lock);
>  }
>
>  /* Meter set/get/del processing is still single-threaded. */
> @@ -6344,6 +6376,8 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
>      meter->n_bands = config->n_bands;
>      meter->max_delta_t = 0;
>      meter->used = time_usec();
> +    meter->id = mid;
> +    ovs_mutex_init(&meter->lock);
>
>      /* set up bands */
>      for (i = 0; i < config->n_bands; ++i) {
> @@ -6368,10 +6402,12 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
>          }
>      }
>
> -    meter_lock(dp, mid);
> -    dp_delete_meter(dp, mid); /* Free existing meter, if any */
> -    dp->meters[mid] = meter;
> -    meter_unlock(dp, mid);
> +    ovs_mutex_lock(&dp->meters_lock);
> +
> +    dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any */
> +    dp_meter_attach(&dp->meters, meter);
> +
> +    ovs_mutex_unlock(&dp->meters_lock);
>
>      return 0;
>  }
> @@ -6381,23 +6417,24 @@ dpif_netdev_meter_get(const struct dpif *dpif,
>                        ofproto_meter_id meter_id_,
>                        struct ofputil_meter_stats *stats, uint16_t n_bands)
>  {
> -    const struct dp_netdev *dp = get_dp_netdev(dpif);
> +    struct dp_netdev *dp = get_dp_netdev(dpif);
>      uint32_t meter_id = meter_id_.uint32;
> -    int retval = 0;
> +    const struct dp_meter *meter;
>
>      if (meter_id >= MAX_METERS) {
>          return EFBIG;
>      }
>
> -    meter_lock(dp, meter_id);
> -    const struct dp_meter *meter = dp->meters[meter_id];
> +    meter = dp_meter_lookup(&dp->meters, meter_id);
>      if (!meter) {
> -        retval = ENOENT;
> -        goto done;
> +        return ENOENT;
>      }
> +
>      if (stats) {
>          int i = 0;
>
> +        ovs_mutex_lock(&meter->lock);
> +
>          stats->packet_in_count = meter->packet_count;
>          stats->byte_in_count = meter->byte_count;
>
> @@ -6406,12 +6443,11 @@ dpif_netdev_meter_get(const struct dpif *dpif,
>              stats->bands[i].byte_count = meter->bands[i].byte_count;
>          }
>
> +        ovs_mutex_unlock(&meter->lock);
>          stats->n_bands = i;
>      }
>
> -done:
> -    meter_unlock(dp, meter_id);
> -    return retval;
> +    return 0;
>  }
>
>  static int
> @@ -6426,9 +6462,9 @@ dpif_netdev_meter_del(struct dpif *dpif,
>      if (!error) {
>          uint32_t meter_id = meter_id_.uint32;
>
> -        meter_lock(dp, meter_id);
> -        dp_delete_meter(dp, meter_id);
> -        meter_unlock(dp, meter_id);
> +        ovs_mutex_lock(&dp->meters_lock);
> +        dp_meter_detach_free(&dp->meters, meter_id);
> +        ovs_mutex_unlock(&dp->meters_lock);
>      }
>      return error;
>  }
> --
> 2.27.0
>
Ilya Maximets June 23, 2021, 11:07 a.m. UTC | #3
On 5/12/21 11:17 AM, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> 
> For now, ovs-vswitchd use the array of the dp_meter struct
> to store meter's data, and at most, there are only 65536
> (defined by MAX_METERS) meters that can be used. But in some
> case, for example, in the edge gateway, we should use 200,000+,
> at least, meters for IP address bandwidth limitation.
> Every one IP address will use two meters for its rx and tx
> path[1]. In other way, ovs-vswitchd should support meter-offload
> (rte_mtr_xxx api introduced by dpdk.), but there are more than
> 65536 meters in the hardware, such as Mellanox ConnectX-6.
> 
> This patch use cmap to manage the meter, instead of the array.
> 
> * Insertion performance, ovs-ofctl add-meter 1000+ meters,
>   the cmap takes abount 4000ms, as same as previous implementation.
> * Lookup performance in datapath, we add 1000+ meters which rate limit
>   are 10Gbps (the NIC cards are 10Gbps, so netdev-datapath will not
>   drop the packets.), and a flow which only forward packets from p0
>   to p1, with meter action[2]. On other machine, pktgen-dpdk will
>   generate 64B packets to p0.
> 
>   The forwarding performance always is 1324 Kpps on my server
>   which CPU is Intel E5-2650, 2.00GHz.
> 
> [1].
> $ in_port=p0,ip,ip_dst=1.1.1.x action=meter:n,output:p1
> $ in_port=p1,ip,ip_src=1.1.1.x action=meter:m,output:p0
> 
> [2].
> $ in_port=p0 action=meter:100,output:p1
> 
> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> ---
> v3:
> * update the commit message
> * remove dp_netdev_meter struct
> * remove create_dp_netdev function
> * don't use the hash_basis
> * use the meter_id as a hash instead of hash_xxx function. see *dp_meter_hash for details
> * fix coding style
> * v2: http://patchwork.ozlabs.org/project/openvswitch/patch/1584254601-7321-1-git-send-email-xiangxia.m.yue@gmail.com/
> ---
>  lib/dpif-netdev.c | 158 ++++++++++++++++++++++++++++------------------
>  1 file changed, 97 insertions(+), 61 deletions(-)

Hi.  Thanks for v3!

This version looks mostly OK to me with only one question:
In current code meter locks are adaptive mutexes, but this patch
makes them usual.  Is there particular reason to do that?

Have you tested performance in case where several threads uses
the same meter?  If not, I'd prefer to keep it adaptive, as it's
the current behavior and adaptive mutexes sometimes provides
better performance since they act like spinlocks for a short
period of time (in some cases they're worse than simple mutexes,
but extensive performance testing is needed for each particular
case to confirm).
I can change the type of meter mutexes before applying the patch.
Let me know, what do you think.

Best regards, Ilya Maximets.
Tonghao Zhang June 24, 2021, 2:59 p.m. UTC | #4
On Wed, Jun 23, 2021 at 7:07 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>
> On 5/12/21 11:17 AM, xiangxia.m.yue@gmail.com wrote:
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > For now, ovs-vswitchd use the array of the dp_meter struct
> > to store meter's data, and at most, there are only 65536
> > (defined by MAX_METERS) meters that can be used. But in some
> > case, for example, in the edge gateway, we should use 200,000+,
> > at least, meters for IP address bandwidth limitation.
> > Every one IP address will use two meters for its rx and tx
> > path[1]. In other way, ovs-vswitchd should support meter-offload
> > (rte_mtr_xxx api introduced by dpdk.), but there are more than
> > 65536 meters in the hardware, such as Mellanox ConnectX-6.
> >
> > This patch use cmap to manage the meter, instead of the array.
> >
> > * Insertion performance, ovs-ofctl add-meter 1000+ meters,
> >   the cmap takes abount 4000ms, as same as previous implementation.
> > * Lookup performance in datapath, we add 1000+ meters which rate limit
> >   are 10Gbps (the NIC cards are 10Gbps, so netdev-datapath will not
> >   drop the packets.), and a flow which only forward packets from p0
> >   to p1, with meter action[2]. On other machine, pktgen-dpdk will
> >   generate 64B packets to p0.
> >
> >   The forwarding performance always is 1324 Kpps on my server
> >   which CPU is Intel E5-2650, 2.00GHz.
> >
> > [1].
> > $ in_port=p0,ip,ip_dst=1.1.1.x action=meter:n,output:p1
> > $ in_port=p1,ip,ip_src=1.1.1.x action=meter:m,output:p0
> >
> > [2].
> > $ in_port=p0 action=meter:100,output:p1
> >
> > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> > ---
> > v3:
> > * update the commit message
> > * remove dp_netdev_meter struct
> > * remove create_dp_netdev function
> > * don't use the hash_basis
> > * use the meter_id as a hash instead of hash_xxx function. see *dp_meter_hash for details
> > * fix coding style
> > * v2: http://patchwork.ozlabs.org/project/openvswitch/patch/1584254601-7321-1-git-send-email-xiangxia.m.yue@gmail.com/
> > ---
> >  lib/dpif-netdev.c | 158 ++++++++++++++++++++++++++++------------------
> >  1 file changed, 97 insertions(+), 61 deletions(-)
>
> Hi.  Thanks for v3!
>
> This version looks mostly OK to me with only one question:
> In current code meter locks are adaptive mutexes, but this patch
> makes them usual.  Is there particular reason to do that?
No, we should use the adaptive mutexes, shown below. I tested the
forwarding performance, using dpdk-pktgen to gen flows:
range 0 dst ip start 4.4.4.200
range 0 dst ip min 4.4.4.200
range 0 dst ip max 4.4.4.200
range 0 proto udp
range 0 src port start 1
range 0 src port min 1
range 0 src port max 1000
range 0 src port inc 1
range 0 size start 64
range 0 size min 64
range 0 size max 64
enable 0 range
start 0

the usual metex: 988192 pps
the adaptive mutex: 1007872 pps

Ilya, we can change them before applying this patch. Thanks!

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index f800c8c65e13..cc173541f1c8 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -6377,7 +6377,7 @@ dpif_netdev_meter_set(struct dpif *dpif,
ofproto_meter_id meter_id,
     meter->max_delta_t = 0;
     meter->used = time_usec();
     meter->id = mid;
-    ovs_mutex_init(&meter->lock);
+    ovs_mutex_init_adaptive(&meter->lock);

     /* set up bands */
     for (i = 0; i < config->n_bands; ++i) {

> Have you tested performance in case where several threads uses
> the same meter?  If not, I'd prefer to keep it adaptive, as it's
> the current behavior and adaptive mutexes sometimes provides
> better performance since they act like spinlocks for a short
> period of time (in some cases they're worse than simple mutexes,
> but extensive performance testing is needed for each particular
> case to confirm).
> I can change the type of meter mutexes before applying the patch.
> Let me know, what do you think.
>
> Best regards, Ilya Maximets.
Ilya Maximets June 24, 2021, 8:17 p.m. UTC | #5
On 6/24/21 4:59 PM, Tonghao Zhang wrote:
> On Wed, Jun 23, 2021 at 7:07 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>>
>> On 5/12/21 11:17 AM, xiangxia.m.yue@gmail.com wrote:
>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>>>
>>> For now, ovs-vswitchd use the array of the dp_meter struct
>>> to store meter's data, and at most, there are only 65536
>>> (defined by MAX_METERS) meters that can be used. But in some
>>> case, for example, in the edge gateway, we should use 200,000+,
>>> at least, meters for IP address bandwidth limitation.
>>> Every one IP address will use two meters for its rx and tx
>>> path[1]. In other way, ovs-vswitchd should support meter-offload
>>> (rte_mtr_xxx api introduced by dpdk.), but there are more than
>>> 65536 meters in the hardware, such as Mellanox ConnectX-6.
>>>
>>> This patch use cmap to manage the meter, instead of the array.
>>>
>>> * Insertion performance, ovs-ofctl add-meter 1000+ meters,
>>>   the cmap takes abount 4000ms, as same as previous implementation.
>>> * Lookup performance in datapath, we add 1000+ meters which rate limit
>>>   are 10Gbps (the NIC cards are 10Gbps, so netdev-datapath will not
>>>   drop the packets.), and a flow which only forward packets from p0
>>>   to p1, with meter action[2]. On other machine, pktgen-dpdk will
>>>   generate 64B packets to p0.
>>>
>>>   The forwarding performance always is 1324 Kpps on my server
>>>   which CPU is Intel E5-2650, 2.00GHz.
>>>
>>> [1].
>>> $ in_port=p0,ip,ip_dst=1.1.1.x action=meter:n,output:p1
>>> $ in_port=p1,ip,ip_src=1.1.1.x action=meter:m,output:p0
>>>
>>> [2].
>>> $ in_port=p0 action=meter:100,output:p1
>>>
>>> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>>> ---
>>> v3:
>>> * update the commit message
>>> * remove dp_netdev_meter struct
>>> * remove create_dp_netdev function
>>> * don't use the hash_basis
>>> * use the meter_id as a hash instead of hash_xxx function. see *dp_meter_hash for details
>>> * fix coding style
>>> * v2: http://patchwork.ozlabs.org/project/openvswitch/patch/1584254601-7321-1-git-send-email-xiangxia.m.yue@gmail.com/
>>> ---
>>>  lib/dpif-netdev.c | 158 ++++++++++++++++++++++++++++------------------
>>>  1 file changed, 97 insertions(+), 61 deletions(-)
>>
>> Hi.  Thanks for v3!
>>
>> This version looks mostly OK to me with only one question:
>> In current code meter locks are adaptive mutexes, but this patch
>> makes them usual.  Is there particular reason to do that?
> No, we should use the adaptive mutexes, shown below. I tested the
> forwarding performance, using dpdk-pktgen to gen flows:
> range 0 dst ip start 4.4.4.200
> range 0 dst ip min 4.4.4.200
> range 0 dst ip max 4.4.4.200
> range 0 proto udp
> range 0 src port start 1
> range 0 src port min 1
> range 0 src port max 1000
> range 0 src port inc 1
> range 0 size start 64
> range 0 size min 64
> range 0 size max 64
> enable 0 range
> start 0
> 
> the usual metex: 988192 pps
> the adaptive mutex: 1007872 pps
> 
> Ilya, we can change them before applying this patch. Thanks!

Thanks for testing!  I changed the mutex type, made a couple of
small style changes and applied to master.

Best regards, Ilya Maximets.

> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index f800c8c65e13..cc173541f1c8 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -6377,7 +6377,7 @@ dpif_netdev_meter_set(struct dpif *dpif,
> ofproto_meter_id meter_id,
>      meter->max_delta_t = 0;
>      meter->used = time_usec();
>      meter->id = mid;
> -    ovs_mutex_init(&meter->lock);
> +    ovs_mutex_init_adaptive(&meter->lock);
> 
>      /* set up bands */
>      for (i = 0; i < config->n_bands; ++i) {
> 
>> Have you tested performance in case where several threads uses
>> the same meter?  If not, I'd prefer to keep it adaptive, as it's
>> the current behavior and adaptive mutexes sometimes provides
>> better performance since they act like spinlocks for a short
>> period of time (in some cases they're worse than simple mutexes,
>> but extensive performance testing is needed for each particular
>> case to confirm).
>> I can change the type of meter mutexes before applying the patch.
>> Let me know, what do you think.
>>
>> Best regards, Ilya Maximets.
> 
> 
>
diff mbox series

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 251788b04965..f800c8c65e13 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -99,9 +99,8 @@  DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
 #define DEFAULT_TX_FLUSH_INTERVAL 0
 
 /* Configuration parameters. */
-enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
+enum { MAX_METERS = 1 << 18 };    /* Maximum number of meters. */
 enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
-enum { N_METER_LOCKS = 64 };    /* Maximum number of meters. */
 
 COVERAGE_DEFINE(datapath_drop_meter);
 COVERAGE_DEFINE(datapath_drop_upcall_error);
@@ -287,6 +286,9 @@  struct dp_meter_band {
 };
 
 struct dp_meter {
+    struct cmap_node node;
+    struct ovs_mutex lock;
+    uint32_t id;
     uint16_t flags;
     uint16_t n_bands;
     uint32_t max_delta_t;
@@ -339,8 +341,8 @@  struct dp_netdev {
     atomic_uint32_t tx_flush_interval;
 
     /* Meters. */
-    struct ovs_mutex meter_locks[N_METER_LOCKS];
-    struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
+    struct ovs_mutex meters_lock;
+    struct cmap meters OVS_GUARDED;
 
     /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
     OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
@@ -392,19 +394,6 @@  struct dp_netdev {
     struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
 };
 
-static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
-    OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
-{
-    ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
-}
-
-static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
-    OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
-{
-    ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
-}
-
-
 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
                                                     odp_port_t)
     OVS_REQUIRES(dp->port_mutex);
@@ -1747,6 +1736,68 @@  choose_port(struct dp_netdev *dp, const char *name)
     return ODPP_NONE;
 }
 
+static uint32_t
+dp_meter_hash(uint32_t meter_id)
+{
+    /* In the ofproto-dpif layer, we use the id-pool
+     * to alloc meter id orderly (e.g. 1, 2, ... N.),
+     * which provide a better hash distribution.
+     * Use them directly instead of hash_xxx function
+     * for achieving high-performance.
+     */
+    return meter_id;
+}
+
+static void
+dp_netdev_meter_destroy(struct dp_netdev *dp)
+{
+    struct dp_meter *m;
+
+    ovs_mutex_lock(&dp->meters_lock);
+    CMAP_FOR_EACH (m, node, &dp->meters) {
+        cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id));
+        ovsrcu_postpone(free, m);
+    }
+
+    cmap_destroy(&dp->meters);
+    ovs_mutex_unlock(&dp->meters_lock);
+
+    ovs_mutex_destroy(&dp->meters_lock);
+}
+
+static struct dp_meter*
+dp_meter_lookup(struct cmap *meters, uint32_t meter_id)
+{
+    uint32_t hash = dp_meter_hash(meter_id);
+    struct dp_meter *m;
+
+    CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) {
+        if (m->id == meter_id) {
+            return m;
+        }
+    }
+
+    return NULL;
+}
+
+static void
+dp_meter_detach_free(struct cmap *meters, uint32_t meter_id)
+{
+    struct dp_meter *m;
+
+    m = dp_meter_lookup(meters, meter_id);
+    if (m) {
+            cmap_remove(meters, &m->node, dp_meter_hash(meter_id));
+            ovsrcu_postpone(free, m);
+    }
+}
+
+static void
+dp_meter_attach(struct cmap *meters, struct dp_meter *meter)
+{
+    cmap_insert(meters, &meter->node, dp_meter_hash(meter->id));
+}
+
 static int
 create_dp_netdev(const char *name, const struct dpif_class *class,
                  struct dp_netdev **dpp)
@@ -1783,9 +1834,9 @@  create_dp_netdev(const char *name, const struct dpif_class *class,
     dp->reconfigure_seq = seq_create();
     dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
 
-    for (int i = 0; i < N_METER_LOCKS; ++i) {
-        ovs_mutex_init_adaptive(&dp->meter_locks[i]);
-    }
+    /* Init meter resources. */
+    cmap_init(&dp->meters);
+    ovs_mutex_init(&dp->meters_lock);
 
     /* Disable upcalls by default. */
     dp_netdev_disable_upcall(dp);
@@ -1874,16 +1925,6 @@  dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
     fat_rwlock_destroy(&dp->upcall_rwlock);
 }
 
-static void
-dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
-    OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
-{
-    if (dp->meters[meter_id]) {
-        free(dp->meters[meter_id]);
-        dp->meters[meter_id] = NULL;
-    }
-}
-
 static uint32_t
 hash_bond_id(uint32_t bond_id)
 {
@@ -1938,16 +1979,7 @@  dp_netdev_free(struct dp_netdev *dp)
     /* Upcalls must be disabled at this point */
     dp_netdev_destroy_upcall_lock(dp);
 
-    int i;
-
-    for (i = 0; i < MAX_METERS; ++i) {
-        meter_lock(dp, i);
-        dp_delete_meter(dp, i);
-        meter_unlock(dp, i);
-    }
-    for (i = 0; i < N_METER_LOCKS; ++i) {
-        ovs_mutex_destroy(&dp->meter_locks[i]);
-    }
+    dp_netdev_meter_destroy(dp);
 
     free(dp->pmd_cmask);
     free(CONST_CAST(char *, dp->name));
@@ -6175,10 +6207,9 @@  dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
         return;
     }
 
-    meter_lock(dp, meter_id);
-    meter = dp->meters[meter_id];
+    meter = dp_meter_lookup(&dp->meters, meter_id);
     if (!meter) {
-        goto out;
+        return;
     }
 
     /* Initialize as negative values. */
@@ -6186,6 +6217,7 @@  dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
     /* Initialize as zeroes. */
     memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
 
+    ovs_mutex_lock(&meter->lock);
     /* All packets will hit the meter at the same time. */
     long_delta_t = now / 1000 - meter->used / 1000; /* msec */
 
@@ -6301,8 +6333,8 @@  dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
             dp_packet_batch_refill(packets_, packet, j);
         }
     }
- out:
-    meter_unlock(dp, meter_id);
+
+    ovs_mutex_unlock(&meter->lock);
 }
 
 /* Meter set/get/del processing is still single-threaded. */
@@ -6344,6 +6376,8 @@  dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
     meter->n_bands = config->n_bands;
     meter->max_delta_t = 0;
     meter->used = time_usec();
+    meter->id = mid;
+    ovs_mutex_init(&meter->lock);
 
     /* set up bands */
     for (i = 0; i < config->n_bands; ++i) {
@@ -6368,10 +6402,12 @@  dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
         }
     }
 
-    meter_lock(dp, mid);
-    dp_delete_meter(dp, mid); /* Free existing meter, if any */
-    dp->meters[mid] = meter;
-    meter_unlock(dp, mid);
+    ovs_mutex_lock(&dp->meters_lock);
+
+    dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any */
+    dp_meter_attach(&dp->meters, meter);
+
+    ovs_mutex_unlock(&dp->meters_lock);
 
     return 0;
 }
@@ -6381,23 +6417,24 @@  dpif_netdev_meter_get(const struct dpif *dpif,
                       ofproto_meter_id meter_id_,
                       struct ofputil_meter_stats *stats, uint16_t n_bands)
 {
-    const struct dp_netdev *dp = get_dp_netdev(dpif);
+    struct dp_netdev *dp = get_dp_netdev(dpif);
     uint32_t meter_id = meter_id_.uint32;
-    int retval = 0;
+    const struct dp_meter *meter;
 
     if (meter_id >= MAX_METERS) {
         return EFBIG;
     }
 
-    meter_lock(dp, meter_id);
-    const struct dp_meter *meter = dp->meters[meter_id];
+    meter = dp_meter_lookup(&dp->meters, meter_id);
     if (!meter) {
-        retval = ENOENT;
-        goto done;
+        return ENOENT;
     }
+
     if (stats) {
         int i = 0;
 
+        ovs_mutex_lock(&meter->lock);
+
         stats->packet_in_count = meter->packet_count;
         stats->byte_in_count = meter->byte_count;
 
@@ -6406,12 +6443,11 @@  dpif_netdev_meter_get(const struct dpif *dpif,
             stats->bands[i].byte_count = meter->bands[i].byte_count;
         }
 
+        ovs_mutex_unlock(&meter->lock);
         stats->n_bands = i;
     }
 
-done:
-    meter_unlock(dp, meter_id);
-    return retval;
+    return 0;
 }
 
 static int
@@ -6426,9 +6462,9 @@  dpif_netdev_meter_del(struct dpif *dpif,
     if (!error) {
         uint32_t meter_id = meter_id_.uint32;
 
-        meter_lock(dp, meter_id);
-        dp_delete_meter(dp, meter_id);
-        meter_unlock(dp, meter_id);
+        ovs_mutex_lock(&dp->meters_lock);
+        dp_meter_detach_free(&dp->meters, meter_id);
+        ovs_mutex_unlock(&dp->meters_lock);
     }
     return error;
 }