[v4,bpf-next,6/9] bpf: cpumap: implement XDP_REDIRECT for eBPF programs attached to map entries

Message ID: ef1a456ba3b76a61b7dc6302974f248a21d906dd.1593012598.git.lorenzo@kernel.org
State: Changes Requested
Delegated to: BPF Maintainers
Series: introduce support for XDP programs in CPUMAP

Commit Message

Lorenzo Bianconi June 24, 2020, 3:33 p.m. UTC
Introduce XDP_REDIRECT support for eBPF programs attached to cpumap
entries.
This patch has been tested on a Marvell ESPRESSObin using a modified
version of the xdp_redirect_cpu sample in order to attach an XDP program
to CPUMAP entries and perform a redirect on the mvneta interface.
In particular, the following scenario has been tested:

rq (cpu0) --> mvneta - XDP_REDIRECT (cpu0) --> CPUMAP - XDP_REDIRECT (cpu1) --> mvneta

$ ./xdp_redirect_cpu -p xdp_cpu_map0 -d eth0 -c 1 -e xdp_redirect \
	-f xdp_redirect_kern.o -m tx_port -r eth0

tx: 285.2 Kpps rx: 285.2 Kpps

Attaching a simple XDP program on eth0 to perform XDP_TX gives
comparable results:

tx: 288.4 Kpps rx: 288.4 Kpps
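
A CPUMAP entry with an attached program is created from user-space by
writing the program fd into the map value (introduced earlier in this
series); minimal, untested sketch:

	__u32 cpu = 1;
	struct bpf_cpumap_val val = {
		.qsize = 192,           /* entry queue size */
		.bpf_prog.fd = prog_fd, /* XDP prog to run on the remote CPU */
	};

	/* cpu_map_fd: fd of a BPF_MAP_TYPE_CPUMAP map */
	bpf_map_update_elem(cpu_map_fd, &cpu, &val, 0);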

Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Co-developed-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 include/net/xdp.h          |  1 +
 include/trace/events/xdp.h |  6 ++++--
 kernel/bpf/cpumap.c        | 17 +++++++++++++++--
 3 files changed, 20 insertions(+), 4 deletions(-)

Comments

Daniel Borkmann June 25, 2020, 9:28 p.m. UTC | #1
On 6/24/20 5:33 PM, Lorenzo Bianconi wrote:
> Introduce XDP_REDIRECT support for eBPF programs attached to cpumap
> entries.
> This patch has been tested on a Marvell ESPRESSObin using a modified
> version of the xdp_redirect_cpu sample in order to attach an XDP program
> to CPUMAP entries and perform a redirect on the mvneta interface.
> In particular, the following scenario has been tested:
> 
> rq (cpu0) --> mvneta - XDP_REDIRECT (cpu0) --> CPUMAP - XDP_REDIRECT (cpu1) --> mvneta
> 
> $ ./xdp_redirect_cpu -p xdp_cpu_map0 -d eth0 -c 1 -e xdp_redirect \
> 	-f xdp_redirect_kern.o -m tx_port -r eth0
> 
> tx: 285.2 Kpps rx: 285.2 Kpps
> 
> Attaching a simple XDP program on eth0 to perform XDP_TX gives
> comparable results:
> 
> tx: 288.4 Kpps rx: 288.4 Kpps
> 
> Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Co-developed-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>   include/net/xdp.h          |  1 +
>   include/trace/events/xdp.h |  6 ++++--
>   kernel/bpf/cpumap.c        | 17 +++++++++++++++--
>   3 files changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 83b9e0142b52..5be0d4d65b94 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -99,6 +99,7 @@ struct xdp_frame {
>   };
>   
>   struct xdp_cpumap_stats {
> +	unsigned int redirect;
>   	unsigned int pass;
>   	unsigned int drop;
>   };
> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> index e2c99f5bee39..cd24e8a59529 100644
> --- a/include/trace/events/xdp.h
> +++ b/include/trace/events/xdp.h
> @@ -190,6 +190,7 @@ TRACE_EVENT(xdp_cpumap_kthread,
>   		__field(int, sched)
>   		__field(unsigned int, xdp_pass)
>   		__field(unsigned int, xdp_drop)
> +		__field(unsigned int, xdp_redirect)
>   	),
>   
>   	TP_fast_assign(
> @@ -201,18 +202,19 @@ TRACE_EVENT(xdp_cpumap_kthread,
>   		__entry->sched	= sched;
>   		__entry->xdp_pass	= xdp_stats->pass;
>   		__entry->xdp_drop	= xdp_stats->drop;
> +		__entry->xdp_redirect	= xdp_stats->redirect;
>   	),
>   
>   	TP_printk("kthread"
>   		  " cpu=%d map_id=%d action=%s"
>   		  " processed=%u drops=%u"
>   		  " sched=%d"
> -		  " xdp_pass=%u xdp_drop=%u",
> +		  " xdp_pass=%u xdp_drop=%u xdp_redirect=%u",
>   		  __entry->cpu, __entry->map_id,
>   		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
>   		  __entry->processed, __entry->drops,
>   		  __entry->sched,
> -		  __entry->xdp_pass, __entry->xdp_drop)
> +		  __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect)
>   );
>   
>   TRACE_EVENT(xdp_cpumap_enqueue,
> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> index 4e4cd240f07b..c0b2f265ccb2 100644
> --- a/kernel/bpf/cpumap.c
> +++ b/kernel/bpf/cpumap.c
> @@ -240,7 +240,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
>   	xdp_set_return_frame_no_direct();
>   	xdp.rxq = &rxq;
>   
> -	rcu_read_lock();
> +	rcu_read_lock_bh();
>   
>   	prog = READ_ONCE(rcpu->prog);
>   	for (i = 0; i < n; i++) {
> @@ -266,6 +266,16 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
>   				stats->pass++;
>   			}
>   			break;
> +		case XDP_REDIRECT:
> +			err = xdp_do_redirect(xdpf->dev_rx, &xdp,
> +					      prog);
> +			if (unlikely(err)) {
> +				xdp_return_frame(xdpf);
> +				stats->drop++;
> +			} else {
> +				stats->redirect++;
> +			}

Could we do better with all the accounting and do this from /inside/ a BPF tracing
prog instead? Otherwise it's a shame we need to have it here even when the
tracepoint is disabled.

> +			break;
>   		default:
>   			bpf_warn_invalid_xdp_action(act);
>   			/* fallthrough */
> @@ -276,7 +286,10 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
>   		}
>   	}
>   
> -	rcu_read_unlock();
> +	if (stats->redirect)
> +		xdp_do_flush_map();
> +
> +	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
>   	xdp_clear_return_frame_no_direct();

Hm, this looks incorrect. Why do you call xdp_clear_return_frame_no_direct() /after/
the point where a reschedule into softirq can happen?

>   	return nframes;
>
Jesper Dangaard Brouer June 26, 2020, 7:49 a.m. UTC | #2
On Thu, 25 Jun 2020 23:28:59 +0200
Daniel Borkmann <daniel@iogearbox.net> wrote:

> > @@ -276,7 +286,10 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
> >   		}
> >   	}
> >   
> > -	rcu_read_unlock();
> > +	if (stats->redirect)
> > +		xdp_do_flush_map();
> > +
> > +	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
> >   	xdp_clear_return_frame_no_direct();  
> 
> Hm, this looks incorrect. Why do you call xdp_clear_return_frame_no_direct() /after/
> the point where a reschedule into softirq can happen?

Good that you caught this! This needs to be fixed.
Lorenzo Bianconi June 26, 2020, 7:59 a.m. UTC | #3
> On 6/24/20 5:33 PM, Lorenzo Bianconi wrote:

[...]

> > +	if (stats->redirect)
> > +		xdp_do_flush_map();
> > +
> > +	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
> >   	xdp_clear_return_frame_no_direct();
> 
> Hm, this looks incorrect. Why do you call xdp_clear_return_frame_no_direct() /after/
> the point where a reschedule into softirq can happen?

Hi Daniel,

Yes, right. Thanks for spotting this, I will fix it in v5.
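
Something along these lines (untested; the no-direct clear moved before
the unlock, so it still runs under the flag's intended scope):

	if (stats->redirect)
		xdp_do_flush_map();

	xdp_clear_return_frame_no_direct();

	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */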

Regards,
Lorenzo

> 
> >   	return nframes;
> > 
>
Jesper Dangaard Brouer June 26, 2020, 9:18 a.m. UTC | #4
On Thu, 25 Jun 2020 23:28:59 +0200
Daniel Borkmann <daniel@iogearbox.net> wrote:

> > diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> > index 4e4cd240f07b..c0b2f265ccb2 100644
> > --- a/kernel/bpf/cpumap.c
> > +++ b/kernel/bpf/cpumap.c
> > @@ -240,7 +240,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
> >   	xdp_set_return_frame_no_direct();
> >   	xdp.rxq = &rxq;
> >   
> > -	rcu_read_lock();
> > +	rcu_read_lock_bh();
> >   
> >   	prog = READ_ONCE(rcpu->prog);
> >   	for (i = 0; i < n; i++) {
> > @@ -266,6 +266,16 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
> >   				stats->pass++;
> >   			}
> >   			break;
> > +		case XDP_REDIRECT:
> > +			err = xdp_do_redirect(xdpf->dev_rx, &xdp,
> > +					      prog);
> > +			if (unlikely(err)) {
> > +				xdp_return_frame(xdpf);
> > +				stats->drop++;

I'm considering whether this should be a separate redir_err counter.

> > +			} else {
> > +				stats->redirect++;
> > +			}  
> 
> Could we do better with all the accounting and do this from /inside/ a BPF tracing
> prog instead? Otherwise it's a shame we need to have it here even when the
> tracepoint is disabled.

I'm on the fence with this one...

First of all, the BPF prog cannot see the return code of xdp_do_redirect(),
so it cannot give the correct/needed stats without this counter; it would
basically report the redirects as successful. (This is actually a recurring
support issue: end-users misconfigure the xdp_redirect sample and think
they get good performance, even though packets are dropped.)
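
From the prog's point of view the redirect always looks like it succeeded;
rough, untested sketch along the lines of the xdp_redirect sample (tx_port
being its devmap):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_DEVMAP);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, int);
	} tx_port SEC(".maps");

	SEC("xdp")
	int xdp_redirect_map_prog(struct xdp_md *ctx)
	{
		/* Only the verdict is returned here; the actual
		 * xdp_do_redirect() call, and any error from it,
		 * happens later in the kernel, invisible to the prog.
		 */
		return bpf_redirect_map(&tx_port, 0, 0);
	}

	char _license[] SEC("license") = "GPL";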

Specifically for XDP_REDIRECT we need to update some state anyhow, such
that we know to call xdp_do_flush_map(). Thus, removing the counter
would not gain much performance-wise.
Jesper Dangaard Brouer June 26, 2020, 10:06 a.m. UTC | #5
On Wed, 24 Jun 2020 17:33:55 +0200
Lorenzo Bianconi <lorenzo@kernel.org> wrote:

> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 83b9e0142b52..5be0d4d65b94 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -99,6 +99,7 @@ struct xdp_frame {
>  };
>  
>  struct xdp_cpumap_stats {
> +	unsigned int redirect;
>  	unsigned int pass;
>  	unsigned int drop;
>  };
> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> index e2c99f5bee39..cd24e8a59529 100644
> --- a/include/trace/events/xdp.h
> +++ b/include/trace/events/xdp.h
> @@ -190,6 +190,7 @@ TRACE_EVENT(xdp_cpumap_kthread,
>  		__field(int, sched)
>  		__field(unsigned int, xdp_pass)
>  		__field(unsigned int, xdp_drop)
> +		__field(unsigned int, xdp_redirect)
>  	),
>  
>  	TP_fast_assign(
> @@ -201,18 +202,19 @@ TRACE_EVENT(xdp_cpumap_kthread,
>  		__entry->sched	= sched;
>  		__entry->xdp_pass	= xdp_stats->pass;
>  		__entry->xdp_drop	= xdp_stats->drop;
> +		__entry->xdp_redirect	= xdp_stats->redirect;
>  	),

Let me stress that I think we can do this in a follow-up patch (but
before a release).

I'm considering whether we should store/give a pointer to xdp_stats
(struct xdp_cpumap_stats) and let the BPF tracing program do the
"decoding"/struct access to get these values. (We would go from storing
12 bytes to 8 bytes (on 64-bit), so I don't expect much gain.)
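
With a BTF raw tracepoint (tp_btf) the prog already gets the xdp_stats
pointer straight from TP_PROTO and can do the struct access itself;
rough, untested sketch:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	__u64 redir_total; /* read out from user-space */

	SEC("tp_btf/xdp_cpumap_kthread")
	int BPF_PROG(tp_cpumap_kthread, int map_id, unsigned int processed,
		     unsigned int drops, int sched,
		     struct xdp_cpumap_stats *xdp_stats)
	{
		/* BTF-typed pointer: direct struct access, no probe-read */
		__sync_fetch_and_add(&redir_total, xdp_stats->redirect);
		return 0;
	}

	char _license[] SEC("license") = "GPL";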


>  	TP_printk("kthread"
>  		  " cpu=%d map_id=%d action=%s"
>  		  " processed=%u drops=%u"
>  		  " sched=%d"
> -		  " xdp_pass=%u xdp_drop=%u",
> +		  " xdp_pass=%u xdp_drop=%u xdp_redirect=%u",
>  		  __entry->cpu, __entry->map_id,
>  		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
>  		  __entry->processed, __entry->drops,
>  		  __entry->sched,
> -		  __entry->xdp_pass, __entry->xdp_drop)
> +		  __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect)
>  );
>

Patch

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 83b9e0142b52..5be0d4d65b94 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -99,6 +99,7 @@ struct xdp_frame {
 };
 
 struct xdp_cpumap_stats {
+	unsigned int redirect;
 	unsigned int pass;
 	unsigned int drop;
 };
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index e2c99f5bee39..cd24e8a59529 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -190,6 +190,7 @@ TRACE_EVENT(xdp_cpumap_kthread,
 		__field(int, sched)
 		__field(unsigned int, xdp_pass)
 		__field(unsigned int, xdp_drop)
+		__field(unsigned int, xdp_redirect)
 	),
 
 	TP_fast_assign(
@@ -201,18 +202,19 @@ TRACE_EVENT(xdp_cpumap_kthread,
 		__entry->sched	= sched;
 		__entry->xdp_pass	= xdp_stats->pass;
 		__entry->xdp_drop	= xdp_stats->drop;
+		__entry->xdp_redirect	= xdp_stats->redirect;
 	),
 
 	TP_printk("kthread"
 		  " cpu=%d map_id=%d action=%s"
 		  " processed=%u drops=%u"
 		  " sched=%d"
-		  " xdp_pass=%u xdp_drop=%u",
+		  " xdp_pass=%u xdp_drop=%u xdp_redirect=%u",
 		  __entry->cpu, __entry->map_id,
 		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
 		  __entry->processed, __entry->drops,
 		  __entry->sched,
-		  __entry->xdp_pass, __entry->xdp_drop)
+		  __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect)
 );
 
 TRACE_EVENT(xdp_cpumap_enqueue,
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 4e4cd240f07b..c0b2f265ccb2 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -240,7 +240,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 	xdp_set_return_frame_no_direct();
 	xdp.rxq = &rxq;
 
-	rcu_read_lock();
+	rcu_read_lock_bh();
 
 	prog = READ_ONCE(rcpu->prog);
 	for (i = 0; i < n; i++) {
@@ -266,6 +266,16 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 				stats->pass++;
 			}
 			break;
+		case XDP_REDIRECT:
+			err = xdp_do_redirect(xdpf->dev_rx, &xdp,
+					      prog);
+			if (unlikely(err)) {
+				xdp_return_frame(xdpf);
+				stats->drop++;
+			} else {
+				stats->redirect++;
+			}
+			break;
 		default:
 			bpf_warn_invalid_xdp_action(act);
 			/* fallthrough */
@@ -276,7 +286,10 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 		}
 	}
 
-	rcu_read_unlock();
+	if (stats->redirect)
+		xdp_do_flush_map();
+
+	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
 	xdp_clear_return_frame_no_direct();
 
 	return nframes;