[ovs-dev,01/12] dpcls: Use 32 packet batches for lookups.

Message ID 1475857062-55311-2-git-send-email-bhanuprakash.bodireddy@intel.com
State Changes Requested
Delegated to: Daniele Di Proietto

Commit Message

Bodireddy, Bhanuprakash Oct. 7, 2016, 4:17 p.m. UTC
This patch increases the number of packets processed in a batch during a
lookup from 16 to 32. Processing batches of 32 packets improves
performance and also one of the internal loops can be avoided here.

Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
---
 lib/dpif-netdev.c | 109 +++++++++++++++++++++++-------------------------------
 1 file changed, 46 insertions(+), 63 deletions(-)

Comments

Jarno Rajahalme Oct. 7, 2016, 9:07 p.m. UTC | #1
> On Oct 7, 2016, at 9:17 AM, Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> wrote:
> 
> This patch increases the number of packets processed in a batch during a
> lookup from 16 to 32. Processing batches of 32 packets improves
> performance and also one of the internal loops can be avoided here.
> 

Can you provide some qualification of the performance test conditions? Do you believe the performance difference applies universally?

> Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
> ---
> lib/dpif-netdev.c | 109 +++++++++++++++++++++++-------------------------------
> 1 file changed, 46 insertions(+), 63 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 6e09e44..c002dd3 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -4975,23 +4975,20 @@ dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
>              int *num_lookups_p)
> {
>     /* The received 'cnt' miniflows are the search-keys that will be processed
> -     * in batches of 16 elements.  N_MAPS will contain the number of these
> -     * 16-elements batches.  i.e. for 'cnt' = 32, N_MAPS will be 2.  The batch
> -     * size 16 was experimentally found faster than 8 or 32. */
> -    typedef uint16_t map_type;
> +     * to find a matching entry into the available subtables.
> +     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */

This needs a build time assertion to catch any future change in NETDEV_MAX_BURST.

Preferably, if you can verify that the compiler is able to remove the unnecessary loop in this case, you should consider not removing the extra loop that would kick in only if NETDEV_MAX_BURST becomes larger than 32.

> +    typedef uint32_t map_type;
> #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
> 
> -#if !defined(__CHECKER__) && !defined(_WIN32)
> -    const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
> -#else
> -    enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
> -#endif
> -    map_type maps[N_MAPS];
>     struct dpcls_subtable *subtable;
> 
> -    memset(maps, 0xff, sizeof maps);
> -    if (cnt % MAP_BITS) {
> -        maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. */
> +    map_type keys_map = 0xffffffff;
> +    map_type found_map;
> +    uint32_t hashes[MAP_BITS];
> +    const struct cmap_node *nodes[MAP_BITS];
> +
> +    if (OVS_UNLIKELY(cnt != NETDEV_MAX_BURST)) {
> +        keys_map >>= NETDEV_MAX_BURST - cnt; /* Clear extra bits. */
>     }
>     memset(rules, 0, cnt * sizeof *rules);
> 
> @@ -5007,59 +5004,45 @@ dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
>     PVECTOR_FOR_EACH (subtable, &cls->subtables) {
>         const struct netdev_flow_key *mkeys = keys;
>         struct dpcls_rule **mrules = rules;
> -        map_type remains = 0;
> -        int m;
> -
> -        BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
> -
> -        /* Loops on each batch of 16 search-keys. */
> -        for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += MAP_BITS) {
> -            uint32_t hashes[MAP_BITS];
> -            const struct cmap_node *nodes[MAP_BITS];
> -            unsigned long map = maps[m];
> -            int i;
> -
> -            if (!map) {
> -                continue; /* Skip empty maps. */
> -            }
> -
> -            /* Compute hashes for the remaining keys.  Each search-key is
> -             * masked with the subtable's mask to avoid hashing the wildcarded
> -             * bits. */
> -            ULLONG_FOR_EACH_1(i, map) {
> -                hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
> -                                                         &subtable->mask);
> -            }
> -            /* Lookup. */
> -            map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
> -            /* Check results.  When the i-th bit of map is set, it means that a
> -             * set of nodes with a matching hash value was found for the i-th
> -             * search-key.  Due to possible hash collisions we need to check
> -             * which of the found rules, if any, really matches our masked
> -             * search-key. */
> -            ULLONG_FOR_EACH_1(i, map) {
> -                struct dpcls_rule *rule;
> -
> -                CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> -                    if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
> -                        mrules[i] = rule;
> -                        /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
> -                         * within one second optimization interval  */
> -                        subtable->hit_cnt++;
> -                        lookups_match += subtable_pos;
> -                        goto next;
> -                    }
> +        int i;
> +        found_map = keys_map;
> +
> +        /* Compute hashes for the remaining keys.  Each search-key is
> +         * masked with the subtable's mask to avoid hashing the wildcarded
> +         * bits. */
> +        ULLONG_FOR_EACH_1(i, keys_map) {
> +            hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
> +                                                     &subtable->mask);
> +        }
> +        /* Lookup. */
> +        found_map = cmap_find_batch(&subtable->rules, found_map, hashes,
> +                                    nodes);
> +        /* Check results.  When the i-th bit of found_map is set, it means
> +         * that a set of nodes with a matching hash value was found for the
> +         * i-th search-key.  Due to possible hash collisions we need to check
> +         * which of the found rules, if any, really matches our masked
> +         * search-key. */
> +        ULLONG_FOR_EACH_1(i, found_map) {
> +            struct dpcls_rule *rule;
> +
> +            CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> +                if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
> +                    mrules[i] = rule;
> +                    /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
> +                     * within one second optimization interval. */
> +                    subtable->hit_cnt++;
> +                    lookups_match += subtable_pos;
> +                    goto next;
>                 }
> -                /* None of the found rules was a match. Reset the i-th bit to
> -                 * keep searching in the next subtable. */
> -                ULLONG_SET0(map, i);  /* Did not match. */
> -            next:
> -                ;                     /* Keep Sparse happy. */
>             }
> -            maps[m] &= ~map;          /* Clear the found rules. */
> -            remains |= maps[m];
> +            /* None of the found rules was a match.  Reset the i-th bit to
> +             * keep searching this key in the next subtable. */
> +            ULLONG_SET0(found_map, i);  /* Did not match. */
> +        next:
> +            ;                     /* Keep Sparse happy. */
>         }
> -        if (!remains) {
> +        keys_map &= ~found_map;             /* Clear the found rules. */
> +        if (!keys_map) {
>             if (num_lookups_p) {
>                 *num_lookups_p = lookups_match;
>             }
> -- 
> 2.4.11
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
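
The heart of the change above is that a single 32-bit keys_map now carries one bit per packet in the burst, and each subtable pass clears the bits it resolves, so the inner per-16 loop is no longer needed. Below is a standalone sketch of that flow (illustrative only, not OVS code; MAX_BURST and table_matches() are made-up stand-ins for NETDEV_MAX_BURST and the real subtable match):

    /*
     * Standalone illustration: a 32-bit map carries one bit per packet in the
     * burst.  Each "subtable" pass only examines keys whose bit is still set
     * and clears the bits it resolves, mirroring the keys_map / found_map
     * logic in the patch.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_BURST 32    /* stand-in for NETDEV_MAX_BURST */

    static int
    table_matches(int table, uint32_t key)
    {
        return (int) (key % 4) == table;    /* toy rule: each key matches one of 4 tables */
    }

    int
    main(void)
    {
        uint32_t keys[MAX_BURST];
        int cnt = 20;                       /* a partial burst */
        uint32_t keys_map = UINT32_MAX;     /* one bit per key still unmatched */

        for (int i = 0; i < cnt; i++) {
            keys[i] = (uint32_t) i;
        }
        if (cnt != MAX_BURST) {
            keys_map >>= MAX_BURST - cnt;   /* clear bits beyond 'cnt' */
        }

        for (int table = 0; table < 4 && keys_map; table++) {
            uint32_t found_map = 0;

            for (int i = 0; i < cnt; i++) {
                if ((keys_map & (UINT32_C(1) << i)) && table_matches(table, keys[i])) {
                    found_map |= UINT32_C(1) << i;    /* key i matched this table */
                }
            }
            keys_map &= ~found_map;         /* stop searching matched keys */
            printf("after table %d: keys_map = %#010x\n", table, (unsigned) keys_map);
        }
        return 0;
    }

It prints the shrinking keys_map after each pass; once it reaches zero the outer loop stops, which is the same early exit the patched dpcls_lookup() keeps when !keys_map.
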
Fischetti, Antonio Oct. 11, 2016, 3:33 p.m. UTC | #2
Comments inline.

Thanks,
Antonio

> -----Original Message-----
> From: dev [mailto:dev-bounces@openvswitch.org] On Behalf Of Jarno
> Rajahalme
> Sent: Friday, October 7, 2016 10:08 PM
> To: Bodireddy, Bhanuprakash <bhanuprakash.bodireddy@intel.com>
> Cc: dev@openvswitch.org
> Subject: Re: [ovs-dev] [PATCH 01/12] dpcls: Use 32 packet batches for
> lookups.
>
>
> > On Oct 7, 2016, at 9:17 AM, Bhanuprakash Bodireddy
> <bhanuprakash.bodireddy@intel.com> wrote:
> >
> > This patch increases the number of packets processed in a batch during a
> > lookup from 16 to 32. Processing batches of 32 packets improves
> > performance and also one of the internal loops can be avoided here.
> >
>
> Can you provide some qualification of the performance test conditions? Do
> you believe the performance difference applies universally?
>

[Antonio F] We saw a performance improvement in EMC disabled PHY2PHY loopback testcase with 2 physical ports and few tens of active IXIA streams. With 1 subtable - flow: from port1 to port2 - we got a throughput increment of +150 kpps. With 4 subtables the increment was +80 kpps. 
Also we spent some time profiling OVS with VTune. When comparing stock ovs with the patched one, it’s been observed that for dpcls_lookup(), there is significant reduction in the overall retired instructions (reduced by 729,800,000), CPI rate improved from 0.471 to 0.427 and Front-end bound cycles reduced from 19.1% to 8.5%.


> > Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
> > Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
> > ---
> > lib/dpif-netdev.c | 109 +++++++++++++++++++++++-------------------------------
> > 1 file changed, 46 insertions(+), 63 deletions(-)
> >
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index 6e09e44..c002dd3 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -4975,23 +4975,20 @@ dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
> >              int *num_lookups_p)
> > {
> >     /* The received 'cnt' miniflows are the search-keys that will be processed
> > -     * in batches of 16 elements.  N_MAPS will contain the number of these
> > -     * 16-elements batches.  i.e. for 'cnt' = 32, N_MAPS will be 2.  The batch
> > -     * size 16 was experimentally found faster than 8 or 32. */
> > -    typedef uint16_t map_type;
> > +     * to find a matching entry into the available subtables.
> > +     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
>
> This needs a build time assertion to catch any future change in
> NETDEV_MAX_BURST.
>
> Preferably, if you can verify that the compiler is able to remove the
> unnecessary loop in this case, you should consider not removing the extra
> loop that would kick in only if NETDEV_MAX_BURST becomes larger than 32.
>

[Antonio F] Is that ok if we add the following line after the MAP_BITS define?
BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);

It seems NETDEV_MAX_BURST was reduced to 32 from 256 a while ago, and one of the reasons 
being the slow VMs and packet drops at VMs. Can we safely assume that NETDEV_MAX_BURST is
unlikely to be increased as it has wider consequence on the system stability?
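
For concreteness, a minimal sketch of where the proposed assertion would sit in dpcls_lookup() (the comment wording here is illustrative; only the BUILD_ASSERT_DECL line is Antonio's actual proposal):

        typedef uint32_t map_type;
    #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
        /* Fail the build if NETDEV_MAX_BURST ever outgrows the 32-bit map. */
        BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);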


Pravin Shelar Oct. 12, 2016, 9:09 p.m. UTC | #3
On Tue, Oct 11, 2016 at 8:33 AM, Fischetti, Antonio
<antonio.fischetti@intel.com> wrote:
> Comments inline.
>
> Thanks,
> Antonio
>
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@openvswitch.org] On Behalf Of Jarno
>> Rajahalme
>> Sent: Friday, October 7, 2016 10:08 PM
>> To: Bodireddy, Bhanuprakash <bhanuprakash.bodireddy@intel.com>
>> Cc: dev@openvswitch.org
>> Subject: Re: [ovs-dev] [PATCH 01/12] dpcls: Use 32 packet batches for
>> lookups.
>>
>>
>> > On Oct 7, 2016, at 9:17 AM, Bhanuprakash Bodireddy
>> <bhanuprakash.bodireddy@intel.com> wrote:
>> >
>> > This patch increases the number of packets processed in a batch during a
>> > lookup from 16 to 32. Processing batches of 32 packets improves
>> > performance and also one of the internal loops can be avoided here.
>> >
>>
>> Can you provide some qualification of the performance test conditions? Do
>> you believe the performance difference applies universally?
>>
> [Antonio F] We saw a performance improvement in EMC disabled PHY2PHY loopback testcase with 2 physical ports and few tens of active IXIA streams. With 1 subtable - flow: from port1 to port2 - we got a throughput increment of +150 kpps. With 4 subtables the increment was +80 kpps.
> Also we spent some time profiling OVS with VTune. When comparing stock ovs with the patched one, it’s been observed that for dpcls_lookup(), there is significant reduction in the overall retired instructions (reduced by 729,800,000), CPI rate improved from 0.471 to 0.427 and Front-end bound cycles reduced from 19.1% to 8.5%.
>
>
>> > Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
>> > Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
>> > ---
>> > lib/dpif-netdev.c | 109 +++++++++++++++++++++++-------------------------
>> ------
>> > 1 file changed, 46 insertions(+), 63 deletions(-)
>> >
>> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> > index 6e09e44..c002dd3 100644
>> > --- a/lib/dpif-netdev.c
>> > +++ b/lib/dpif-netdev.c
>> > @@ -4975,23 +4975,20 @@ dpcls_lookup(struct dpcls *cls, const struct
>> netdev_flow_key keys[],
>> >              int *num_lookups_p)
>> > {
>> >     /* The received 'cnt' miniflows are the search-keys that will be
>> processed
>> > -     * in batches of 16 elements.  N_MAPS will contain the number of
>> these
>> > -     * 16-elements batches.  i.e. for 'cnt' = 32, N_MAPS will be 2.
>> The batch
>> > -     * size 16 was experimentally found faster than 8 or 32. */
>> > -    typedef uint16_t map_type;
>> > +     * to find a matching entry into the available subtables.
>> > +     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
>>
>> This needs a build time assertion to catch any future change in
>> NETDEV_MAX_BURST.
>>
>> Preferably, if you can verify that the compiler is able to remove the
>> unnecessary loop in this case, you should consider not removing the extra
>> loop that would kick in only if NETDEV_MAX_BURST becomes larger than 32.
>>
> [Antonio F] Is that ok if we add the following line after the MAP_BITS define?
> BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
>
> It seems NETDEV_MAX_BURST was reduced to 32 from 256 a while ago, and one of the reasons
> being the slow VMs and packet drops at VMs. Can we safely assume that NETDEV_MAX_BURST is
> unlikely to be increased as it has wider consequence on the system stability?
>
>
I think it is safe to assume it for now. Looking at the code I think
it can be easily increased to 64. So it is fine to add the assertion.
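
To put Pravin's point in code terms: if NETDEV_MAX_BURST were later raised to 64, the single-map scheme could in principle be kept by widening the map type, roughly as in this speculative sketch (not part of this patch; whether the cmap batch interface's map argument is wide enough on every platform would still need to be checked):

        typedef uint64_t map_type;               /* one bit per packet in a 64-packet burst */
    #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
        BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);

        map_type keys_map = UINT64_MAX;          /* replaces the 0xffffffff literal */
        if (OVS_UNLIKELY(cnt != NETDEV_MAX_BURST)) {
            keys_map >>= NETDEV_MAX_BURST - cnt; /* clear bits for the missing packets */
        }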

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 6e09e44..c002dd3 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -4975,23 +4975,20 @@  dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
              int *num_lookups_p)
 {
     /* The received 'cnt' miniflows are the search-keys that will be processed
-     * in batches of 16 elements.  N_MAPS will contain the number of these
-     * 16-elements batches.  i.e. for 'cnt' = 32, N_MAPS will be 2.  The batch
-     * size 16 was experimentally found faster than 8 or 32. */
-    typedef uint16_t map_type;
+     * to find a matching entry into the available subtables.
+     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
+    typedef uint32_t map_type;
 #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
 
-#if !defined(__CHECKER__) && !defined(_WIN32)
-    const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
-#else
-    enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
-#endif
-    map_type maps[N_MAPS];
     struct dpcls_subtable *subtable;
 
-    memset(maps, 0xff, sizeof maps);
-    if (cnt % MAP_BITS) {
-        maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. */
+    map_type keys_map = 0xffffffff;
+    map_type found_map;
+    uint32_t hashes[MAP_BITS];
+    const struct cmap_node *nodes[MAP_BITS];
+
+    if (OVS_UNLIKELY(cnt != NETDEV_MAX_BURST)) {
+        keys_map >>= NETDEV_MAX_BURST - cnt; /* Clear extra bits. */
     }
     memset(rules, 0, cnt * sizeof *rules);
 
@@ -5007,59 +5004,45 @@  dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
     PVECTOR_FOR_EACH (subtable, &cls->subtables) {
         const struct netdev_flow_key *mkeys = keys;
         struct dpcls_rule **mrules = rules;
-        map_type remains = 0;
-        int m;
-
-        BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
-
-        /* Loops on each batch of 16 search-keys. */
-        for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += MAP_BITS) {
-            uint32_t hashes[MAP_BITS];
-            const struct cmap_node *nodes[MAP_BITS];
-            unsigned long map = maps[m];
-            int i;
-
-            if (!map) {
-                continue; /* Skip empty maps. */
-            }
-
-            /* Compute hashes for the remaining keys.  Each search-key is
-             * masked with the subtable's mask to avoid hashing the wildcarded
-             * bits. */
-            ULLONG_FOR_EACH_1(i, map) {
-                hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
-                                                         &subtable->mask);
-            }
-            /* Lookup. */
-            map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
-            /* Check results.  When the i-th bit of map is set, it means that a
-             * set of nodes with a matching hash value was found for the i-th
-             * search-key.  Due to possible hash collisions we need to check
-             * which of the found rules, if any, really matches our masked
-             * search-key. */
-            ULLONG_FOR_EACH_1(i, map) {
-                struct dpcls_rule *rule;
-
-                CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
-                    if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
-                        mrules[i] = rule;
-                        /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
-                         * within one second optimization interval  */
-                        subtable->hit_cnt++;
-                        lookups_match += subtable_pos;
-                        goto next;
-                    }
+        int i;
+        found_map = keys_map;
+
+        /* Compute hashes for the remaining keys.  Each search-key is
+         * masked with the subtable's mask to avoid hashing the wildcarded
+         * bits. */
+        ULLONG_FOR_EACH_1(i, keys_map) {
+            hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
+                                                     &subtable->mask);
+        }
+        /* Lookup. */
+        found_map = cmap_find_batch(&subtable->rules, found_map, hashes,
+                                    nodes);
+        /* Check results.  When the i-th bit of found_map is set, it means
+         * that a set of nodes with a matching hash value was found for the
+         * i-th search-key.  Due to possible hash collisions we need to check
+         * which of the found rules, if any, really matches our masked
+         * search-key. */
+        ULLONG_FOR_EACH_1(i, found_map) {
+            struct dpcls_rule *rule;
+
+            CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
+                if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
+                    mrules[i] = rule;
+                    /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
+                     * within one second optimization interval. */
+                    subtable->hit_cnt++;
+                    lookups_match += subtable_pos;
+                    goto next;
                 }
-                /* None of the found rules was a match. Reset the i-th bit to
-                 * keep searching in the next subtable. */
-                ULLONG_SET0(map, i);  /* Did not match. */
-            next:
-                ;                     /* Keep Sparse happy. */
             }
-            maps[m] &= ~map;          /* Clear the found rules. */
-            remains |= maps[m];
+            /* None of the found rules was a match.  Reset the i-th bit to
+             * keep searching this key in the next subtable. */
+            ULLONG_SET0(found_map, i);  /* Did not match. */
+        next:
+            ;                     /* Keep Sparse happy. */
         }
-        if (!remains) {
+        keys_map &= ~found_map;             /* Clear the found rules. */
+        if (!keys_map) {
             if (num_lookups_p) {
                 *num_lookups_p = lookups_match;
             }