diff mbox

[ovs-dev,4/4] tunneling: Avoid recirculation on datapath by computing the recirculate actions at translate time.

Message ID 1497631551-62258-5-git-send-email-sugesh.chandran@intel.com
State Changes Requested
Delegated to: Joe Stringer
Headers show

Commit Message

Chandran, Sugesh June 16, 2017, 4:45 p.m. UTC
This patch set removes the recirculation of encapsulated tunnel packets
if possible. It is done by computing the post tunnel actions at the time of
translation. The combined nested action set is programmed in the datapath
using the CLONE action.

Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
Signed-off-by: Zoltán Balogh <zoltan.balogh@ericsson.com>
Co-authored-by: Zoltán Balogh <zoltan.balogh@ericsson.com>
---
 lib/dpif-netdev.c                  | 18 +-------
 ofproto/ofproto-dpif-xlate-cache.c | 14 ++++++-
 ofproto/ofproto-dpif-xlate-cache.h | 15 ++++++-
 ofproto/ofproto-dpif-xlate.c       | 85 ++++++++++++++++++++++++++++++++++----
 ofproto/ofproto-dpif-xlate.h       |  1 +
 ofproto/ofproto-dpif.c             |  3 +-
 6 files changed, 107 insertions(+), 29 deletions(-)

Comments

Joe Stringer June 26, 2017, 6:05 p.m. UTC | #1
On 16 June 2017 at 09:45, Sugesh Chandran <sugesh.chandran@intel.com> wrote:
> This patch set removes the recirculation of encapsulated tunnel packets
> if possible. It is done by computing the post tunnel actions at the time of
> translation. The combined nested action set are programmed in the datapath
> using CLONE action.
>
> Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
> Signed-off-by: Zoltán Balogh <zoltan.balogh@ericsson.com>
> Co-authored-by: Zoltán Balogh <zoltan.balogh@ericsson.com>
> ---

It seems like this approach attempts to simulate the later bridge
traversal to generate the actions to be processed, forgets all of this
processing, then if it won't work, generates the recirc action.. but
if it would work, goes back through the latter bridge traversal again,
this time keeping all of the generated netlink actions; attributing
stats; and applying side effects.

So, the first traversal needs to *not* attribute stats/side effects
initially, until we later do the check and determine that it's the
right thing to do... but later on, if we determine that it's the
correct set of actions, then ideally we wouldn't have to go and do the
whole same translation again. The common case should be that the
latter bridge traversal is the actual actions we want to do.. Could we
instead, for this first traversal, clear out the
packet/resubmit_stats/etc, then provide an xlate_cache object to
collect stats/side effects, traverse the other bridge, then if we
determine that it was viable to do it this way, keep the buffer as-is,
use xlate_cache to handle the stats/side effects, then (if there is
already an xin->xc) add temporary xlate_cache entries to the end of
the xin->xc?

Modulo need for the clone action, of course... but in my eyes even if
we just needed to shift a portion of the odp buffer back/forward to
insert/remove a clone action afterwards, then doing this memmove would
be cheaper than performing the whole translation for the latter bridge
again.

Now, in the packet processing path for xlate_actions I wouldn't expect
the xlate_cache to be there today. If implementing the above, then we
would introduce xlate_cache to that path, which can be expensive due
to things like taking atomic refcounts on rules. So, I am making an
assumption that in this situation, performing one xlate_actions in the
subsequent bridge with xlate_cache being collected is cheaper than
performing the two xlate_actions with no xlate_cache. It might be
worthwhile to check out that assumption.

For what it's worth, I think Andy had some thoughts on doing cloning
in this area, but he's OOO so won't be able to provide that feedback
just yet. He had posted the patches below. There is outstanding
feedback on these so I imagine that he will want to respin them. It
might be worth taking a look to see how they might interact with your
series:
http://patchwork.ozlabs.org/patch/767655/
http://patchwork.ozlabs.org/patch/767656/

>  lib/dpif-netdev.c                  | 18 +-------
>  ofproto/ofproto-dpif-xlate-cache.c | 14 ++++++-
>  ofproto/ofproto-dpif-xlate-cache.h | 15 ++++++-
>  ofproto/ofproto-dpif-xlate.c       | 85 ++++++++++++++++++++++++++++++++++----
>  ofproto/ofproto-dpif-xlate.h       |  1 +
>  ofproto/ofproto-dpif.c             |  3 +-
>  6 files changed, 107 insertions(+), 29 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 2b65dc7..7e10f1d 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5029,24 +5029,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>
>      case OVS_ACTION_ATTR_TUNNEL_PUSH:
>          if (*depth < MAX_RECIRC_DEPTH) {
> -            struct dp_packet_batch tnl_pkt;
> -            struct dp_packet_batch *orig_packets_ = packets_;
> -            int err;
> -
> -            if (!may_steal) {
> -                dp_packet_batch_clone(&tnl_pkt, packets_);
> -                packets_ = &tnl_pkt;
> -                dp_packet_batch_reset_cutlen(orig_packets_);
> -            }
> -
>              dp_packet_batch_apply_cutlen(packets_);
> -
> -            err = push_tnl_action(pmd, a, packets_);
> -            if (!err) {
> -                (*depth)++;
> -                dp_netdev_recirculate(pmd, packets_);
> -                (*depth)--;
> -            }
> +            push_tnl_action(pmd, a, packets_);
>              return;
>          }
>          break;
> diff --git a/ofproto/ofproto-dpif-xlate-cache.c b/ofproto/ofproto-dpif-xlate-cache.c
> index 9161701..945f169 100644
> --- a/ofproto/ofproto-dpif-xlate-cache.c
> +++ b/ofproto/ofproto-dpif-xlate-cache.c
> @@ -89,7 +89,7 @@ xlate_cache_netdev(struct xc_entry *entry, const struct dpif_flow_stats *stats)
>  /* Push stats and perform side effects of flow translation. */
>  void
>  xlate_push_stats_entry(struct xc_entry *entry,
> -                       const struct dpif_flow_stats *stats)
> +                       struct dpif_flow_stats *stats)
>  {
>      struct eth_addr dmac;
>
> @@ -160,6 +160,14 @@ xlate_push_stats_entry(struct xc_entry *entry,
>              entry->controller.am = NULL; /* One time only. */
>          }
>          break;
> +    case XC_TUNNEL_HEADER:
> +        if (entry->tunnel_hdr.operation == ADD) {
> +            stats->n_bytes += stats->n_packets * entry->tunnel_hdr.hdr_size;
> +        } else {
> +            stats->n_bytes -= stats->n_packets * entry->tunnel_hdr.hdr_size;
> +        }
> +
> +        break;
>      default:
>          OVS_NOT_REACHED();
>      }
> @@ -167,7 +175,7 @@ xlate_push_stats_entry(struct xc_entry *entry,
>
>  void
>  xlate_push_stats(struct xlate_cache *xcache,
> -                 const struct dpif_flow_stats *stats)
> +                 struct dpif_flow_stats *stats)
>  {
>      if (!stats->n_packets) {
>          return;
> @@ -245,6 +253,8 @@ xlate_cache_clear_entry(struct xc_entry *entry)
>              entry->controller.am = NULL;
>          }
>          break;
> +    case XC_TUNNEL_HEADER:
> +        break;
>      default:
>          OVS_NOT_REACHED();
>      }
> diff --git a/ofproto/ofproto-dpif-xlate-cache.h b/ofproto/ofproto-dpif-xlate-cache.h
> index 13f7cbc..4ead2f6 100644
> --- a/ofproto/ofproto-dpif-xlate-cache.h
> +++ b/ofproto/ofproto-dpif-xlate-cache.h
> @@ -52,6 +52,7 @@ enum xc_type {
>      XC_GROUP,
>      XC_TNL_NEIGH,
>      XC_CONTROLLER,
> +    XC_TUNNEL_HEADER,
>  };
>
>  /* xlate_cache entries hold enough information to perform the side effects of
> @@ -119,6 +120,16 @@ struct xc_entry {
>              struct ofproto_dpif *ofproto;
>              struct ofproto_async_msg *am;
>          } controller;
> +        struct {
> +            uint64_t max_len;
> +        } truncate;

This appears to be unused.

> +        struct {
> +            enum {
> +                ADD,
> +                REMOVE,
> +            } operation;
> +            uint16_t hdr_size;
> +        } tunnel_hdr;
>      };
>  };
>
> @@ -134,8 +145,8 @@ struct xlate_cache {
>  void xlate_cache_init(struct xlate_cache *);
>  struct xlate_cache *xlate_cache_new(void);
>  struct xc_entry *xlate_cache_add_entry(struct xlate_cache *, enum xc_type);
> -void xlate_push_stats_entry(struct xc_entry *, const struct dpif_flow_stats *);
> -void xlate_push_stats(struct xlate_cache *, const struct dpif_flow_stats *);
> +void xlate_push_stats_entry(struct xc_entry *, struct dpif_flow_stats *);
> +void xlate_push_stats(struct xlate_cache *, struct dpif_flow_stats *);
>  void xlate_cache_clear_entry(struct xc_entry *);
>  void xlate_cache_clear(struct xlate_cache *);
>  void xlate_cache_uninit(struct xlate_cache *);
> diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
> index c110f2a..bf93e8b 100644
> --- a/ofproto/ofproto-dpif-xlate.c
> +++ b/ofproto/ofproto-dpif-xlate.c
> @@ -3259,8 +3259,7 @@ is_tunnel_actions_clone_ready(struct ofpbuf *actions)
>
>  static bool
>  nested_clone_valid_on_tunnel(struct xlate_ctx *ctx, const struct xport *xport,
> -                             struct xport *out_dev,
> -                             struct ovs_action_push_tnl *tnl_push_data)
> +                             struct xport *out_dev)
>  {
>      const struct dpif_flow_stats *bckup_resubmit_stats;
>      struct xlate_cache *bckup_xcache;
> @@ -3285,7 +3284,6 @@ nested_clone_valid_on_tunnel(struct xlate_ctx *ctx, const struct xport *xport,
>      bckup_xcache = ctx->xin->xcache;
>      ctx->xin->resubmit_stats =  NULL;
>      ctx->xin->xcache = NULL;
> -    odp_put_tnl_push_action(ctx->odp_actions, tnl_push_data);
>      apply_nested_clone_actions(ctx, xport, out_dev);
>      nested_act_flag = is_tunnel_actions_clone_ready(ctx->odp_actions);
>
> @@ -3318,6 +3316,17 @@ build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport,
>      int err;
>      char buf_sip6[INET6_ADDRSTRLEN];
>      char buf_dip6[INET6_ADDRSTRLEN];
> +    /* Structures to backup Ethernet and IP of base_flow. */
> +    struct flow old_base_flow;
> +    struct flow old_flow;
> +    struct dpif_flow_stats tmp_resubmit_stats;
> +    const struct dpif_flow_stats *old_resubmit_stats = NULL;
> +    struct xlate_cache *old_xcache = NULL;
> +
> +    /* Backup base_flow data. */
> +    copy_flow(&old_base_flow, &ctx->base_flow);
> +    copy_flow(&old_flow, &ctx->xin->flow);
> +    memset(&tmp_resubmit_stats, 0, sizeof tmp_resubmit_stats);
>
>      err = tnl_route_lookup_flow(flow, &d_ip6, &s_ip6, &out_dev);
>      if (err) {
> @@ -3378,12 +3387,72 @@ build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport,
>      tnl_push_data.tnl_port = odp_to_u32(tunnel_odp_port);
>      tnl_push_data.out_port = odp_to_u32(out_dev->odp_port);
>
> -    /* After tunnel header has been added, packet_type of flow and base_flow
> -     * need to be set to PT_ETH. */
> -    ctx->xin->flow.packet_type = htonl(PT_ETH);
> -    ctx->base_flow.packet_type = htonl(PT_ETH);
> +    /* After tunnel header has been added, MAC and IP data of flow and
> +     * base_flow need to be set properly, since there is not recirculation
> +     * any more when sending packet to tunnel. */
>
> +    propagate_tunnel_data_to_flow(ctx, dmac, smac, s_ip6, s_ip,
> +                                  tnl_params.is_ipv6, tnl_push_data.tnl_type);
> +
> +
> +    size_t push_action_size = 0;
> +    size_t clone_ofs = nl_msg_start_nested(ctx->odp_actions,
> +                                           OVS_ACTION_ATTR_CLONE);
>      odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
> +    push_action_size = ctx->odp_actions->size;
> +
> +    /* Try to see if its possible to apply nested clone action.
> +     * There is no way to know the nested clone is a valid on this
> +     * tunnel without performing the translate. Doing a temporary
> +     * translate for validation.
> +     */
> +    if (!nested_clone_valid_on_tunnel(ctx, xport, out_dev)) {
> +        /* Datapath is not doing the recirculation now, so lets make it
> +         * happen explicitly.
> +         */
> +        nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, 0);
> +        nl_msg_end_nested(ctx->odp_actions, clone_ofs);
> +        goto out;
> +    }
> +
> +    if (ctx->xin->resubmit_stats) {
> +        ovs_mutex_lock(&ctx->xin->resubmit_stats_mutex);
> +        old_resubmit_stats = ctx->xin->resubmit_stats;
> +        tmp_resubmit_stats = *ctx->xin->resubmit_stats;
> +        tmp_resubmit_stats.n_bytes += tnl_push_data.header_len;
> +        ctx->xin->resubmit_stats = &tmp_resubmit_stats;
> +        ovs_mutex_unlock(&ctx->xin->resubmit_stats_mutex);

What exactly does this mutex protect?

> +
> +        if (ctx->xin->xcache) {
> +            struct xc_entry *entry;
> +            entry = xlate_cache_add_entry(ctx->xin->xcache, XC_TUNNEL_HEADER);
> +            entry->tunnel_hdr.hdr_size = tnl_push_data.header_len;
> +            entry->tunnel_hdr.operation = ADD;
> +            old_xcache = ctx->xin->xcache;

Is store/restore of xcache really necessary in the current approach?

> +        }
> +    }
> +
> +    apply_nested_clone_actions(ctx, xport, out_dev);
> +    if (ctx->odp_actions->size > push_action_size) {
> +        /* Update the CLONE action only when combined */
> +        nl_msg_end_nested(ctx->odp_actions, clone_ofs);
> +    }
> +    else {
> +        /* No actions after the tunnel, no need of clone */
> +        nl_msg_cancel_nested(ctx->odp_actions, clone_ofs);
> +        odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
> +    }
> +    if (ctx->xin->resubmit_stats) {
> +        /* Revert the resubmit stats after clone actions */
> +        ctx->xin->resubmit_stats = old_resubmit_stats;
> +        if (ctx->xin->xcache) {
> +            ctx->xin->xcache = old_xcache;
> +        }
> +    }
> +out:
> +    /* Restore flow and base_flow data. */
> +    copy_flow(&ctx->base_flow, &old_base_flow);
> +    copy_flow(&ctx->xin->flow, &old_flow);
>      return 0;
>  }
>
> @@ -6133,6 +6202,8 @@ xlate_in_init(struct xlate_in *xin, struct ofproto_dpif *ofproto,
>      xin->odp_actions = odp_actions;
>      xin->in_packet_out = false;
>
> +    ovs_mutex_init_adaptive(&xin->resubmit_stats_mutex);
> +
>      /* Do recirc lookup. */
>      xin->frozen_state = NULL;
>      if (flow->recirc_id) {
> diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h
> index 68e114a..8e48665 100644
> --- a/ofproto/ofproto-dpif-xlate.h
> +++ b/ofproto/ofproto-dpif-xlate.h
> @@ -112,6 +112,7 @@ struct xlate_in {
>       * This is normally null so the client has to set it manually after
>       * calling xlate_in_init(). */
>      const struct dpif_flow_stats *resubmit_stats;
> +    struct ovs_mutex resubmit_stats_mutex;
>
>      /* Counters carried over from a pre-existing translation of a related flow.
>       * This can occur due to, e.g., the translation of an ARP packet that was
> diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> index 23ba1a3..a2c73c5 100644
> --- a/ofproto/ofproto-dpif.c
> +++ b/ofproto/ofproto-dpif.c
> @@ -4591,7 +4591,7 @@ packet_xlate_revert(struct ofproto *ofproto OVS_UNUSED,
>  static void
>  ofproto_dpif_xcache_execute(struct ofproto_dpif *ofproto,
>                              struct xlate_cache *xcache,
> -                            const struct dpif_flow_stats *stats)
> +                            struct dpif_flow_stats *stats)
>      OVS_REQUIRES(ofproto_mutex)
>  {
>      struct xc_entry *entry;
> @@ -4624,6 +4624,7 @@ ofproto_dpif_xcache_execute(struct ofproto_dpif *ofproto,
>          case XC_GROUP:
>          case XC_TNL_NEIGH:
>          case XC_CONTROLLER:
> +        case XC_TUNNEL_HEADER:
>              xlate_push_stats_entry(entry, stats);
>              break;
>          default:
> --
> 2.7.4
>
Chandran, Sugesh June 27, 2017, 4:30 p.m. UTC | #2
Regards
_Sugesh


> -----Original Message-----

> From: Joe Stringer [mailto:joe@ovn.org]

> Sent: Monday, June 26, 2017 7:06 PM

> To: Chandran, Sugesh <sugesh.chandran@intel.com>

> Cc: ovs dev <dev@openvswitch.org>; Zoltán Balogh

> <zoltan.balogh@ericsson.com>; Andy Zhou <azhou@ovn.org>

> Subject: Re: [PATCH 4/4] tunneling: Avoid recirculation on datapath by

> computing the recirculate actions at translate time.

> 

> On 16 June 2017 at 09:45, Sugesh Chandran <sugesh.chandran@intel.com>

> wrote:

> > This patch set removes the recirculation of encapsulated tunnel

> > packets if possible. It is done by computing the post tunnel actions

> > at the time of translation. The combined nested action set are

> > programmed in the datapath using CLONE action.

> >

> > Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>

> > Signed-off-by: Zoltán Balogh <zoltan.balogh@ericsson.com>

> > Co-authored-by: Zoltán Balogh <zoltan.balogh@ericsson.com>

> > ---

> 

> It seems like this approach attempts to simulate the later bridge traversal to

> generate the actions to be processed, forgets all of this processing, then if it

> won't work, generates the recirc action.. but if it would work, goes back

> through the latter bridge traversal again, this time keeping all of the

> generated netlink actions; attributing stats; and applying side effects.

[Sugesh] yes, that’s right!
> 

> So, the first traversal needs to *not* attribute stats/side effects initially, until

> we later do the check and determine that it's the right thing to do... but later

> on, if we determine that it's the correct set of actions, then ideally we

> wouldn't have to go and do the whole same translation again. The common

> case should be that the latter bridge traversal is the actual actions we want to

> do.. Could we instead, for this first traversal, clear out the

> packet/resubmit_stats/etc, then provide an xlate_cache object to collect

> stats/side effects, traverse the other bridge, then if we determine that it was

> viable to do it this way, keep the buffer as-is, use xlate_cache to handle the

> stats/side effects, then (if there is already an xin->xc) add temporary

> xlate_cache entries to the end of the xin->xc?

[Sugesh] Our first attempt was to use a single translation. The issue that we faced with that approach is
with resubmit_stats. We noticed that resubmit_stats is used during translation to update the stats
of ports, rules, etc. in the translation phase.
If we need to revert the actions, e.g. in the truncation case, the stats updates made via resubmit_stats must be
reverted, which is a bit difficult to track. Even if we keep resubmit_stats at 0, stats may still change
in cases like adding a tunnel header.


As you suggested, another option that we can try out would be as follows:

* Back up the xc and resubmit_stats before making the first traversal. Also clear xin->packet and set side-effects to false.
* Create a tmp_xc and use it while doing the traversal, but set resubmit_stats to null.
* Once the traversal completes, validate whether it is safe to combine the actions.
* If yes, perform the stats update using xlate_push_stats(tmp_xc), and append the temporary xc entries to xin->xc.


> 

> Modulo need for the clone action, of course... but in my eyes even if we just

> needed to shift a portion of the odp buffer back/forward to insert/remove a

> clone action afterwards, then doing this memmove would be cheaper than

> performing the whole translation for the latter bridge again.

[Sugesh] Maybe I am missing something here — why do we need a memmove?
We can use nl_msg_cancel_nested to remove the clone action in case we want to revert
the actions.
> 

> Now, in the packet processing path for xlate_actions I wouldn't expect the

> xlate_cache to be there today. If implementing the above, then we would

> introduce xlate_cache to that path, which can be expensive due to things like

> taking atomic refcounts on rules. So, I am making an assumption that in this

> situation, performing one xlate_actions in the subsequent bridge with

> xlate_cache being collected is cheaper than performing the two

> xlate_actions with no xlate_cache. It might be worthwhile to check out that

> assumption.

[Sugesh] Will try the above approach in v2, as it avoids the additional translation.
> 

> For what it's worth, I think Andy had some thoughts on doing cloning in this

> area, but he's OOO so won't be able to provide that feedback just yet. He

> had posted the patches below. There is outstanding feedback on these so I

> imagine that he will want to respin them. It might be worth taking a look to

> see how they might interact with your

> series:

> http://patchwork.ozlabs.org/patch/767655/

> http://patchwork.ozlabs.org/patch/767656/

[Sugesh] Will review this patch series to see how it can work with our proposal.

> 

> >  lib/dpif-netdev.c                  | 18 +-------

> >  ofproto/ofproto-dpif-xlate-cache.c | 14 ++++++-

> > ofproto/ofproto-dpif-xlate-cache.h | 15 ++++++-

> >  ofproto/ofproto-dpif-xlate.c       | 85

> ++++++++++++++++++++++++++++++++++----

> >  ofproto/ofproto-dpif-xlate.h       |  1 +

> >  ofproto/ofproto-dpif.c             |  3 +-

> >  6 files changed, 107 insertions(+), 29 deletions(-)

> >

> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index

> > 2b65dc7..7e10f1d 100644

> > --- a/lib/dpif-netdev.c

> > +++ b/lib/dpif-netdev.c

> > @@ -5029,24 +5029,8 @@ dp_execute_cb(void *aux_, struct

> > dp_packet_batch *packets_,

> >

> >      case OVS_ACTION_ATTR_TUNNEL_PUSH:

> >          if (*depth < MAX_RECIRC_DEPTH) {

> > -            struct dp_packet_batch tnl_pkt;

> > -            struct dp_packet_batch *orig_packets_ = packets_;

> > -            int err;

> > -

> > -            if (!may_steal) {

> > -                dp_packet_batch_clone(&tnl_pkt, packets_);

> > -                packets_ = &tnl_pkt;

> > -                dp_packet_batch_reset_cutlen(orig_packets_);

> > -            }

> > -

> >              dp_packet_batch_apply_cutlen(packets_);

> > -

> > -            err = push_tnl_action(pmd, a, packets_);

> > -            if (!err) {

> > -                (*depth)++;

> > -                dp_netdev_recirculate(pmd, packets_);

> > -                (*depth)--;

> > -            }

> > +            push_tnl_action(pmd, a, packets_);

> >              return;

> >          }

> >          break;

> > diff --git a/ofproto/ofproto-dpif-xlate-cache.c

> > b/ofproto/ofproto-dpif-xlate-cache.c

> > index 9161701..945f169 100644

> > --- a/ofproto/ofproto-dpif-xlate-cache.c

> > +++ b/ofproto/ofproto-dpif-xlate-cache.c

> > @@ -89,7 +89,7 @@ xlate_cache_netdev(struct xc_entry *entry, const

> > struct dpif_flow_stats *stats)

> >  /* Push stats and perform side effects of flow translation. */  void

> > xlate_push_stats_entry(struct xc_entry *entry,

> > -                       const struct dpif_flow_stats *stats)

> > +                       struct dpif_flow_stats *stats)

> >  {

> >      struct eth_addr dmac;

> >

> > @@ -160,6 +160,14 @@ xlate_push_stats_entry(struct xc_entry *entry,

> >              entry->controller.am = NULL; /* One time only. */

> >          }

> >          break;

> > +    case XC_TUNNEL_HEADER:

> > +        if (entry->tunnel_hdr.operation == ADD) {

> > +            stats->n_bytes += stats->n_packets * entry->tunnel_hdr.hdr_size;

> > +        } else {

> > +            stats->n_bytes -= stats->n_packets * entry->tunnel_hdr.hdr_size;

> > +        }

> > +

> > +        break;

> >      default:

> >          OVS_NOT_REACHED();

> >      }

> > @@ -167,7 +175,7 @@ xlate_push_stats_entry(struct xc_entry *entry,

> >

> >  void

> >  xlate_push_stats(struct xlate_cache *xcache,

> > -                 const struct dpif_flow_stats *stats)

> > +                 struct dpif_flow_stats *stats)

> >  {

> >      if (!stats->n_packets) {

> >          return;

> > @@ -245,6 +253,8 @@ xlate_cache_clear_entry(struct xc_entry *entry)

> >              entry->controller.am = NULL;

> >          }

> >          break;

> > +    case XC_TUNNEL_HEADER:

> > +        break;

> >      default:

> >          OVS_NOT_REACHED();

> >      }

> > diff --git a/ofproto/ofproto-dpif-xlate-cache.h

> > b/ofproto/ofproto-dpif-xlate-cache.h

> > index 13f7cbc..4ead2f6 100644

> > --- a/ofproto/ofproto-dpif-xlate-cache.h

> > +++ b/ofproto/ofproto-dpif-xlate-cache.h

> > @@ -52,6 +52,7 @@ enum xc_type {

> >      XC_GROUP,

> >      XC_TNL_NEIGH,

> >      XC_CONTROLLER,

> > +    XC_TUNNEL_HEADER,

> >  };

> >

> >  /* xlate_cache entries hold enough information to perform the side

> > effects of @@ -119,6 +120,16 @@ struct xc_entry {

> >              struct ofproto_dpif *ofproto;

> >              struct ofproto_async_msg *am;

> >          } controller;

> > +        struct {

> > +            uint64_t max_len;

> > +        } truncate;

> 

> This appears to be unused.

> 

> > +        struct {

> > +            enum {

> > +                ADD,

> > +                REMOVE,

> > +            } operation;

> > +            uint16_t hdr_size;

> > +        } tunnel_hdr;

> >      };

> >  };

> >

> > @@ -134,8 +145,8 @@ struct xlate_cache {  void xlate_cache_init(struct

> > xlate_cache *);  struct xlate_cache *xlate_cache_new(void);  struct

> > xc_entry *xlate_cache_add_entry(struct xlate_cache *, enum xc_type);

> > -void xlate_push_stats_entry(struct xc_entry *, const struct

> > dpif_flow_stats *); -void xlate_push_stats(struct xlate_cache *, const

> > struct dpif_flow_stats *);

> > +void xlate_push_stats_entry(struct xc_entry *, struct dpif_flow_stats

> > +*); void xlate_push_stats(struct xlate_cache *, struct

> > +dpif_flow_stats *);

> >  void xlate_cache_clear_entry(struct xc_entry *);  void

> > xlate_cache_clear(struct xlate_cache *);  void

> > xlate_cache_uninit(struct xlate_cache *); diff --git

> > a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index

> > c110f2a..bf93e8b 100644

> > --- a/ofproto/ofproto-dpif-xlate.c

> > +++ b/ofproto/ofproto-dpif-xlate.c

> > @@ -3259,8 +3259,7 @@ is_tunnel_actions_clone_ready(struct ofpbuf

> > *actions)

> >

> >  static bool

> >  nested_clone_valid_on_tunnel(struct xlate_ctx *ctx, const struct xport

> *xport,

> > -                             struct xport *out_dev,

> > -                             struct ovs_action_push_tnl *tnl_push_data)

> > +                             struct xport *out_dev)

> >  {

> >      const struct dpif_flow_stats *bckup_resubmit_stats;

> >      struct xlate_cache *bckup_xcache; @@ -3285,7 +3284,6 @@

> > nested_clone_valid_on_tunnel(struct xlate_ctx *ctx, const struct xport

> *xport,

> >      bckup_xcache = ctx->xin->xcache;

> >      ctx->xin->resubmit_stats =  NULL;

> >      ctx->xin->xcache = NULL;

> > -    odp_put_tnl_push_action(ctx->odp_actions, tnl_push_data);

> >      apply_nested_clone_actions(ctx, xport, out_dev);

> >      nested_act_flag =

> > is_tunnel_actions_clone_ready(ctx->odp_actions);

> >

> > @@ -3318,6 +3316,17 @@ build_tunnel_send(struct xlate_ctx *ctx, const

> struct xport *xport,

> >      int err;

> >      char buf_sip6[INET6_ADDRSTRLEN];

> >      char buf_dip6[INET6_ADDRSTRLEN];

> > +    /* Structures to backup Ethernet and IP of base_flow. */

> > +    struct flow old_base_flow;

> > +    struct flow old_flow;

> > +    struct dpif_flow_stats tmp_resubmit_stats;

> > +    const struct dpif_flow_stats *old_resubmit_stats = NULL;

> > +    struct xlate_cache *old_xcache = NULL;

> > +

> > +    /* Backup base_flow data. */

> > +    copy_flow(&old_base_flow, &ctx->base_flow);

> > +    copy_flow(&old_flow, &ctx->xin->flow);

> > +    memset(&tmp_resubmit_stats, 0, sizeof tmp_resubmit_stats);

> >

> >      err = tnl_route_lookup_flow(flow, &d_ip6, &s_ip6, &out_dev);

> >      if (err) {

> > @@ -3378,12 +3387,72 @@ build_tunnel_send(struct xlate_ctx *ctx, const

> struct xport *xport,

> >      tnl_push_data.tnl_port = odp_to_u32(tunnel_odp_port);

> >      tnl_push_data.out_port = odp_to_u32(out_dev->odp_port);

> >

> > -    /* After tunnel header has been added, packet_type of flow and

> base_flow

> > -     * need to be set to PT_ETH. */

> > -    ctx->xin->flow.packet_type = htonl(PT_ETH);

> > -    ctx->base_flow.packet_type = htonl(PT_ETH);

> > +    /* After tunnel header has been added, MAC and IP data of flow and

> > +     * base_flow need to be set properly, since there is not recirculation

> > +     * any more when sending packet to tunnel. */

> >

> > +    propagate_tunnel_data_to_flow(ctx, dmac, smac, s_ip6, s_ip,

> > +                                  tnl_params.is_ipv6,

> > + tnl_push_data.tnl_type);

> > +

> > +

> > +    size_t push_action_size = 0;

> > +    size_t clone_ofs = nl_msg_start_nested(ctx->odp_actions,

> > +                                           OVS_ACTION_ATTR_CLONE);

> >      odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);

> > +    push_action_size = ctx->odp_actions->size;

> > +

> > +    /* Try to see if its possible to apply nested clone action.

> > +     * There is no way to know the nested clone is a valid on this

> > +     * tunnel without performing the translate. Doing a temporary

> > +     * translate for validation.

> > +     */

> > +    if (!nested_clone_valid_on_tunnel(ctx, xport, out_dev)) {

> > +        /* Datapath is not doing the recirculation now, so lets make it

> > +         * happen explicitly.

> > +         */

> > +        nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, 0);

> > +        nl_msg_end_nested(ctx->odp_actions, clone_ofs);

> > +        goto out;

> > +    }

> > +

> > +    if (ctx->xin->resubmit_stats) {

> > +        ovs_mutex_lock(&ctx->xin->resubmit_stats_mutex);

> > +        old_resubmit_stats = ctx->xin->resubmit_stats;

> > +        tmp_resubmit_stats = *ctx->xin->resubmit_stats;

> > +        tmp_resubmit_stats.n_bytes += tnl_push_data.header_len;

> > +        ctx->xin->resubmit_stats = &tmp_resubmit_stats;

> > +        ovs_mutex_unlock(&ctx->xin->resubmit_stats_mutex);

> 

> What exactly does this mutex protect?

> 

> > +

> > +        if (ctx->xin->xcache) {

> > +            struct xc_entry *entry;

> > +            entry = xlate_cache_add_entry(ctx->xin->xcache,

> XC_TUNNEL_HEADER);

> > +            entry->tunnel_hdr.hdr_size = tnl_push_data.header_len;

> > +            entry->tunnel_hdr.operation = ADD;

> > +            old_xcache = ctx->xin->xcache;

> 

> Is store/restore of xcache really necessary in the current approach?

> 

> > +        }

> > +    }

> > +

> > +    apply_nested_clone_actions(ctx, xport, out_dev);

> > +    if (ctx->odp_actions->size > push_action_size) {

> > +        /* Update the CLONE action only when combined */

> > +        nl_msg_end_nested(ctx->odp_actions, clone_ofs);

> > +    }

> > +    else {

> > +        /* No actions after the tunnel, no need of clone */

> > +        nl_msg_cancel_nested(ctx->odp_actions, clone_ofs);

> > +        odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);

> > +    }

> > +    if (ctx->xin->resubmit_stats) {

> > +        /* Revert the resubmit stats after clone actions */

> > +        ctx->xin->resubmit_stats = old_resubmit_stats;

> > +        if (ctx->xin->xcache) {

> > +            ctx->xin->xcache = old_xcache;

> > +        }

> > +    }

> > +out:

> > +    /* Restore flow and base_flow data. */

> > +    copy_flow(&ctx->base_flow, &old_base_flow);

> > +    copy_flow(&ctx->xin->flow, &old_flow);

> >      return 0;

> >  }

> >

> > @@ -6133,6 +6202,8 @@ xlate_in_init(struct xlate_in *xin, struct

> ofproto_dpif *ofproto,

> >      xin->odp_actions = odp_actions;

> >      xin->in_packet_out = false;

> >

> > +    ovs_mutex_init_adaptive(&xin->resubmit_stats_mutex);

> > +

> >      /* Do recirc lookup. */

> >      xin->frozen_state = NULL;

> >      if (flow->recirc_id) {

> > diff --git a/ofproto/ofproto-dpif-xlate.h

> > b/ofproto/ofproto-dpif-xlate.h index 68e114a..8e48665 100644

> > --- a/ofproto/ofproto-dpif-xlate.h

> > +++ b/ofproto/ofproto-dpif-xlate.h

> > @@ -112,6 +112,7 @@ struct xlate_in {

> >       * This is normally null so the client has to set it manually after

> >       * calling xlate_in_init(). */

> >      const struct dpif_flow_stats *resubmit_stats;

> > +    struct ovs_mutex resubmit_stats_mutex;

> >

> >      /* Counters carried over from a pre-existing translation of a related flow.

> >       * This can occur due to, e.g., the translation of an ARP packet

> > that was diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c

> > index 23ba1a3..a2c73c5 100644

> > --- a/ofproto/ofproto-dpif.c

> > +++ b/ofproto/ofproto-dpif.c

> > @@ -4591,7 +4591,7 @@ packet_xlate_revert(struct ofproto *ofproto

> > OVS_UNUSED,  static void  ofproto_dpif_xcache_execute(struct

> > ofproto_dpif *ofproto,

> >                              struct xlate_cache *xcache,

> > -                            const struct dpif_flow_stats *stats)

> > +                            struct dpif_flow_stats *stats)

> >      OVS_REQUIRES(ofproto_mutex)

> >  {

> >      struct xc_entry *entry;

> > @@ -4624,6 +4624,7 @@ ofproto_dpif_xcache_execute(struct

> ofproto_dpif *ofproto,

> >          case XC_GROUP:

> >          case XC_TNL_NEIGH:

> >          case XC_CONTROLLER:

> > +        case XC_TUNNEL_HEADER:

> >              xlate_push_stats_entry(entry, stats);

> >              break;

> >          default:

> > --

> > 2.7.4

> >
Joe Stringer June 27, 2017, 5:25 p.m. UTC | #3
On 27 June 2017 at 09:30, Chandran, Sugesh <sugesh.chandran@intel.com> wrote:
>
>
> Regards
> _Sugesh
>
>
>> -----Original Message-----
>> From: Joe Stringer [mailto:joe@ovn.org]
>> Sent: Monday, June 26, 2017 7:06 PM
>> To: Chandran, Sugesh <sugesh.chandran@intel.com>
>> Cc: ovs dev <dev@openvswitch.org>; Zoltán Balogh
>> <zoltan.balogh@ericsson.com>; Andy Zhou <azhou@ovn.org>
>> Subject: Re: [PATCH 4/4] tunneling: Avoid recirculation on datapath by
>> computing the recirculate actions at translate time.
>>
>> On 16 June 2017 at 09:45, Sugesh Chandran <sugesh.chandran@intel.com>
>> wrote:
>> > This patch set removes the recirculation of encapsulated tunnel
>> > packets if possible. It is done by computing the post tunnel actions
>> > at the time of translation. The combined nested action set are
>> > programmed in the datapath using CLONE action.
>> >
>> > Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
>> > Signed-off-by: Zoltán Balogh <zoltan.balogh@ericsson.com>
>> > Co-authored-by: Zoltán Balogh <zoltan.balogh@ericsson.com>
>> > ---
>>
>> It seems like this approach attempts to simulate the later bridge traversal to
>> generate the actions to be processed, forgets all of this processing, then if it
>> won't work, generates the recirc action.. but if it would work, goes back
>> through the latter bridge traversal again, this time keeping all of the
>> generated netlink actions; attributing stats; and applying side effects.
> [Sugesh] yes, that’s right!
>>
>> So, the first traversal needs to *not* attribute stats/side effects initially, until
>> we later do the check and determine that it's the right thing to do... but later
>> on, if we determine that it's the correct set of actions, then ideally we
>> wouldn't have to go and do the whole same translation again. The common
>> case should be that the latter bridge traversal is the actual actions we want to
>> do.. Could we instead, for this first traversal, clear out the
>> packet/resubmit_stats/etc, then provide an xlate_cache object to collect
>> stats/side effects, traverse the other bridge, then if we determine that it was
>> viable to do it this way, keep the buffer as-is, use xlate_cache to handle the
>> stats/side effects, then (if there is already an xin->xc) add temporary
>> xlate_cache entries to the end of the xin->xc?
> [Sugesh] Our first attempt was to use a single translation. The issue that we faced with that approach
> is with resubmit_stats. We noticed that resubmit_stats is used during translation to update the stats
> of ports, rules, etc. in the translation phase.

Right, I don't think that resubmit stats should be passed through for
the initial 'dummy'/'trial' translation.

> If we need to revert the actions in case of truncation, the stats update made by resubmit_stats must be
> reverted, which is a bit difficult to track. Even if we keep resubmit_stats as 0, stats may change
> for cases like adding a tunnel header.

Right. With regards to stats changing, xlate_cache needs to be
properly updated to track these changes. I believe that this series is
already doing the right thing here.

> As you suggested another option that we can try out would be as
>
> * Backup the xc and resubmit_stats before making the first traversal. Also clear the  xin->packet and set side-effects to false.
> * Create a tmp_xc and use it while doing the traversal. But will set the resubmit_stats as null.
> * Once the traversal complete, validate if its safe to combine the actions
> * If yes, perform the stats update using xlate_push_stats(tmp_xc). Append temporary xc entries into xin->xc

Yup, that's what I'm suggesting.

>>
>> Modulo need for the clone action, of course... but in my eyes even if we just
>> needed to shift a portion of the odp buffer back/forward to insert/remove a
>> clone action afterwards, then doing this memmove would be cheaper than
>> performing the whole translation for the latter bridge again.
> [Sugesh] Maybe I am missing something here — why do we need a memmove?
> We can do a clone_cancel to remove the clone action in case we want to revert
> the actions.

I couldn't find a clone_cancel. This comment I made was a bit
speculative, because there are no actual clone actions involved in
this part of translation yet. It's more related to the other series by
Andy than this one.

However, for instance, if you have two bridges, and the first bridge
outputs packets to a patch port into the second bridge, then the
second bridge does some stateful processing like NAT, then finally the
first bridge's actions list does stuff after the output(patch), then
currently the second bridge will see a NATed packet. To avoid this
kind of situation, we can look at inserting clone action around the
output(patch), so only the cloned packet gets NATed. This tunneling
work could plausibly trigger a similar issue. Now, you don't
necessarily always want to insert a clone action around an
output(patch) because that could result in more processing in the
fastpath than necessary. So that's where Andy's series needs a bit
more work. Hopefully with some revision, that series can end up
resolving this kind of issue for patch ports as well as this tunneling
series.

>> Now, in the packet processing path for xlate_actions I wouldn't expect the
>> xlate_cache to be there today. If implementing the above, then we would
>> introduce xlate_cache to that path, which can be expensive due to things like
>> taking atomic refcounts on rules. So, I am making an assumption that in this
>> situation, performing one xlate_actions in the subsequent bridge with
>> xlate_cache being collected is cheaper than performing the two
>> xlate_actions with no xlate_cache. It might be worthwhile to check out that
>> assumption.
> [Sugesh] Will try the above approach in v2, as it avoids an additional translation.

OK, thanks.

>> For what it's worth, I think Andy had some thoughts on doing cloning in this
>> area, but he's OOO so won't be able to provide that feedback just yet. He
>> had posted the patches below. There is outstanding feedback on these so I
>> imagine that he will want to respin them. It might be worth taking a look to
>> see how they might interact with your
>> series:
>> http://patchwork.ozlabs.org/patch/767655/
>> http://patchwork.ozlabs.org/patch/767656/
> [Sugesh] Will review this patch series to see how it can work with our proposal

Great.
diff mbox

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2b65dc7..7e10f1d 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5029,24 +5029,8 @@  dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
 
     case OVS_ACTION_ATTR_TUNNEL_PUSH:
         if (*depth < MAX_RECIRC_DEPTH) {
-            struct dp_packet_batch tnl_pkt;
-            struct dp_packet_batch *orig_packets_ = packets_;
-            int err;
-
-            if (!may_steal) {
-                dp_packet_batch_clone(&tnl_pkt, packets_);
-                packets_ = &tnl_pkt;
-                dp_packet_batch_reset_cutlen(orig_packets_);
-            }
-
             dp_packet_batch_apply_cutlen(packets_);
-
-            err = push_tnl_action(pmd, a, packets_);
-            if (!err) {
-                (*depth)++;
-                dp_netdev_recirculate(pmd, packets_);
-                (*depth)--;
-            }
+            push_tnl_action(pmd, a, packets_);
             return;
         }
         break;
diff --git a/ofproto/ofproto-dpif-xlate-cache.c b/ofproto/ofproto-dpif-xlate-cache.c
index 9161701..945f169 100644
--- a/ofproto/ofproto-dpif-xlate-cache.c
+++ b/ofproto/ofproto-dpif-xlate-cache.c
@@ -89,7 +89,7 @@  xlate_cache_netdev(struct xc_entry *entry, const struct dpif_flow_stats *stats)
 /* Push stats and perform side effects of flow translation. */
 void
 xlate_push_stats_entry(struct xc_entry *entry,
-                       const struct dpif_flow_stats *stats)
+                       struct dpif_flow_stats *stats)
 {
     struct eth_addr dmac;
 
@@ -160,6 +160,14 @@  xlate_push_stats_entry(struct xc_entry *entry,
             entry->controller.am = NULL; /* One time only. */
         }
         break;
+    case XC_TUNNEL_HEADER:
+        if (entry->tunnel_hdr.operation == ADD) {
+            stats->n_bytes += stats->n_packets * entry->tunnel_hdr.hdr_size;
+        } else {
+            stats->n_bytes -= stats->n_packets * entry->tunnel_hdr.hdr_size;
+        }
+
+        break;
     default:
         OVS_NOT_REACHED();
     }
@@ -167,7 +175,7 @@  xlate_push_stats_entry(struct xc_entry *entry,
 
 void
 xlate_push_stats(struct xlate_cache *xcache,
-                 const struct dpif_flow_stats *stats)
+                 struct dpif_flow_stats *stats)
 {
     if (!stats->n_packets) {
         return;
@@ -245,6 +253,8 @@  xlate_cache_clear_entry(struct xc_entry *entry)
             entry->controller.am = NULL;
         }
         break;
+    case XC_TUNNEL_HEADER:
+        break;
     default:
         OVS_NOT_REACHED();
     }
diff --git a/ofproto/ofproto-dpif-xlate-cache.h b/ofproto/ofproto-dpif-xlate-cache.h
index 13f7cbc..4ead2f6 100644
--- a/ofproto/ofproto-dpif-xlate-cache.h
+++ b/ofproto/ofproto-dpif-xlate-cache.h
@@ -52,6 +52,7 @@  enum xc_type {
     XC_GROUP,
     XC_TNL_NEIGH,
     XC_CONTROLLER,
+    XC_TUNNEL_HEADER,
 };
 
 /* xlate_cache entries hold enough information to perform the side effects of
@@ -119,6 +120,16 @@  struct xc_entry {
             struct ofproto_dpif *ofproto;
             struct ofproto_async_msg *am;
         } controller;
+        struct {
+            uint64_t max_len;
+        } truncate;
+        struct {
+            enum {
+                ADD,
+                REMOVE,
+            } operation;
+            uint16_t hdr_size;
+        } tunnel_hdr;
     };
 };
 
@@ -134,8 +145,8 @@  struct xlate_cache {
 void xlate_cache_init(struct xlate_cache *);
 struct xlate_cache *xlate_cache_new(void);
 struct xc_entry *xlate_cache_add_entry(struct xlate_cache *, enum xc_type);
-void xlate_push_stats_entry(struct xc_entry *, const struct dpif_flow_stats *);
-void xlate_push_stats(struct xlate_cache *, const struct dpif_flow_stats *);
+void xlate_push_stats_entry(struct xc_entry *, struct dpif_flow_stats *);
+void xlate_push_stats(struct xlate_cache *, struct dpif_flow_stats *);
 void xlate_cache_clear_entry(struct xc_entry *);
 void xlate_cache_clear(struct xlate_cache *);
 void xlate_cache_uninit(struct xlate_cache *);
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index c110f2a..bf93e8b 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -3259,8 +3259,7 @@  is_tunnel_actions_clone_ready(struct ofpbuf *actions)
 
 static bool
 nested_clone_valid_on_tunnel(struct xlate_ctx *ctx, const struct xport *xport,
-                             struct xport *out_dev,
-                             struct ovs_action_push_tnl *tnl_push_data)
+                             struct xport *out_dev)
 {
     const struct dpif_flow_stats *bckup_resubmit_stats;
     struct xlate_cache *bckup_xcache;
@@ -3285,7 +3284,6 @@  nested_clone_valid_on_tunnel(struct xlate_ctx *ctx, const struct xport *xport,
     bckup_xcache = ctx->xin->xcache;
     ctx->xin->resubmit_stats =  NULL;
     ctx->xin->xcache = NULL;
-    odp_put_tnl_push_action(ctx->odp_actions, tnl_push_data);
     apply_nested_clone_actions(ctx, xport, out_dev);
     nested_act_flag = is_tunnel_actions_clone_ready(ctx->odp_actions);
 
@@ -3318,6 +3316,17 @@  build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport,
     int err;
     char buf_sip6[INET6_ADDRSTRLEN];
     char buf_dip6[INET6_ADDRSTRLEN];
+    /* Structures to backup Ethernet and IP of base_flow. */
+    struct flow old_base_flow;
+    struct flow old_flow;
+    struct dpif_flow_stats tmp_resubmit_stats;
+    const struct dpif_flow_stats *old_resubmit_stats = NULL;
+    struct xlate_cache *old_xcache = NULL;
+
+    /* Backup base_flow data. */
+    copy_flow(&old_base_flow, &ctx->base_flow);
+    copy_flow(&old_flow, &ctx->xin->flow);
+    memset(&tmp_resubmit_stats, 0, sizeof tmp_resubmit_stats);
 
     err = tnl_route_lookup_flow(flow, &d_ip6, &s_ip6, &out_dev);
     if (err) {
@@ -3378,12 +3387,72 @@  build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport,
     tnl_push_data.tnl_port = odp_to_u32(tunnel_odp_port);
     tnl_push_data.out_port = odp_to_u32(out_dev->odp_port);
 
-    /* After tunnel header has been added, packet_type of flow and base_flow
-     * need to be set to PT_ETH. */
-    ctx->xin->flow.packet_type = htonl(PT_ETH);
-    ctx->base_flow.packet_type = htonl(PT_ETH);
+    /* After tunnel header has been added, MAC and IP data of flow and
+     * base_flow need to be set properly, since there is not recirculation
+     * any more when sending packet to tunnel. */
 
+    propagate_tunnel_data_to_flow(ctx, dmac, smac, s_ip6, s_ip,
+                                  tnl_params.is_ipv6, tnl_push_data.tnl_type);
+
+
+    size_t push_action_size = 0;
+    size_t clone_ofs = nl_msg_start_nested(ctx->odp_actions,
+                                           OVS_ACTION_ATTR_CLONE);
     odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
+    push_action_size = ctx->odp_actions->size;
+
+    /* Try to see if its possible to apply nested clone action.
+     * There is no way to know the nested clone is a valid on this
+     * tunnel without performing the translate. Doing a temporary
+     * translate for validation.
+     */
+    if (!nested_clone_valid_on_tunnel(ctx, xport, out_dev)) {
+        /* Datapath is not doing the recirculation now, so lets make it
+         * happen explicitly.
+         */
+        nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, 0);
+        nl_msg_end_nested(ctx->odp_actions, clone_ofs);
+        goto out;
+    }
+
+    if (ctx->xin->resubmit_stats) {
+        ovs_mutex_lock(&ctx->xin->resubmit_stats_mutex);
+        old_resubmit_stats = ctx->xin->resubmit_stats;
+        tmp_resubmit_stats = *ctx->xin->resubmit_stats;
+        tmp_resubmit_stats.n_bytes += tnl_push_data.header_len;
+        ctx->xin->resubmit_stats = &tmp_resubmit_stats;
+        ovs_mutex_unlock(&ctx->xin->resubmit_stats_mutex);
+
+        if (ctx->xin->xcache) {
+            struct xc_entry *entry;
+            entry = xlate_cache_add_entry(ctx->xin->xcache, XC_TUNNEL_HEADER);
+            entry->tunnel_hdr.hdr_size = tnl_push_data.header_len;
+            entry->tunnel_hdr.operation = ADD;
+            old_xcache = ctx->xin->xcache;
+        }
+    }
+
+    apply_nested_clone_actions(ctx, xport, out_dev);
+    if (ctx->odp_actions->size > push_action_size) {
+        /* Update the CLONE action only when combined */
+        nl_msg_end_nested(ctx->odp_actions, clone_ofs);
+    }
+    else {
+        /* No actions after the tunnel, no need of clone */
+        nl_msg_cancel_nested(ctx->odp_actions, clone_ofs);
+        odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
+    }
+    if (ctx->xin->resubmit_stats) {
+        /* Revert the resubmit stats after clone actions */
+        ctx->xin->resubmit_stats = old_resubmit_stats;
+        if (ctx->xin->xcache) {
+            ctx->xin->xcache = old_xcache;
+        }
+    }
+out:
+    /* Restore flow and base_flow data. */
+    copy_flow(&ctx->base_flow, &old_base_flow);
+    copy_flow(&ctx->xin->flow, &old_flow);
     return 0;
 }
 
@@ -6133,6 +6202,8 @@  xlate_in_init(struct xlate_in *xin, struct ofproto_dpif *ofproto,
     xin->odp_actions = odp_actions;
     xin->in_packet_out = false;
 
+    ovs_mutex_init_adaptive(&xin->resubmit_stats_mutex);
+
     /* Do recirc lookup. */
     xin->frozen_state = NULL;
     if (flow->recirc_id) {
diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h
index 68e114a..8e48665 100644
--- a/ofproto/ofproto-dpif-xlate.h
+++ b/ofproto/ofproto-dpif-xlate.h
@@ -112,6 +112,7 @@  struct xlate_in {
      * This is normally null so the client has to set it manually after
      * calling xlate_in_init(). */
     const struct dpif_flow_stats *resubmit_stats;
+    struct ovs_mutex resubmit_stats_mutex;
 
     /* Counters carried over from a pre-existing translation of a related flow.
      * This can occur due to, e.g., the translation of an ARP packet that was
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 23ba1a3..a2c73c5 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -4591,7 +4591,7 @@  packet_xlate_revert(struct ofproto *ofproto OVS_UNUSED,
 static void
 ofproto_dpif_xcache_execute(struct ofproto_dpif *ofproto,
                             struct xlate_cache *xcache,
-                            const struct dpif_flow_stats *stats)
+                            struct dpif_flow_stats *stats)
     OVS_REQUIRES(ofproto_mutex)
 {
     struct xc_entry *entry;
@@ -4624,6 +4624,7 @@  ofproto_dpif_xcache_execute(struct ofproto_dpif *ofproto,
         case XC_GROUP:
         case XC_TNL_NEIGH:
         case XC_CONTROLLER:
+        case XC_TUNNEL_HEADER:
             xlate_push_stats_entry(entry, stats);
             break;
         default: