diff mbox series

[bpf-next,2/7] lib: reciprocal_div: implement the improved algorithm on the paper mentioned

Message ID 20180625035421.2991-3-jakub.kicinski@netronome.com
State Changes Requested, archived
Delegated to: BPF Maintainers
Headers show
Series nfp: bpf: add multiplication, divide and memcpy from maps | expand

Commit Message

Jakub Kicinski June 25, 2018, 3:54 a.m. UTC
From: Jiong Wang <jiong.wang@netronome.com>

The newly added "reciprocal_value_adv" implements the advanced version of the
algorithm described in Figure 4.2 of the paper, except when the divisor has its
MSB set, which would require a u128 divide on the host and can easily be
handled before calling the new "reciprocal_value_adv".

The advanced version requires more complex calculation to get the
reciprocal multiplier and other control variables, but then could reduce
the required emulation operations.

It makes no sense to use this advanced version for host divide emulation:
the extra complexity of calculating the multiplier etc. could completely
outweigh the savings on emulation operations.

However, it makes sense to use it for JIT divide code generation (for
example eBPF JIT backends) for which we are willing to trade performance of
JITed code with that of host. As shown by the following pseudo code, the
required emulation operations could go down from 6 (the basic version) to 3
or 4.

To use the result of "reciprocal_value_adv", suppose we want to calculate
n/d, the C-style pseudo code will be the following, it could be easily
changed to real code generation for other JIT targets.

  struct reciprocal_value_adv rvalue;
  u8 pre_shift, exp;

  if (d >= (1u << 31)) {
    result = n >= d;
    return;
  }
  rvalue = reciprocal_value_adv(d, 32);
  exp = rvalue.exp;
  if (rvalue.is_wide_m && !(d & 1)) {
    pre_shift = fls(d & -d) - 1;
    rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
  } else {
    pre_shift = 0;
  }

  // code generation starts.
  if (d == 1 << exp) {
    result = n >> exp;
  } else if (rvalue.is_wide_m) {
    // pre_shift must be zero when reached here.
    t = (n * rvalue.m) >> 32;
    result = n - t;
    result >>= 1;
    result += t;
    result >>= rvalue.sh - 1;
  } else {
    if (pre_shift)
      result = n >> pre_shift;
    result = ((u64)result * rvalue.m) >> 32;
    result >>= rvalue.sh;
  }

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/linux/reciprocal_div.h | 65 ++++++++++++++++++++++++++++++++++
 lib/reciprocal_div.c           | 37 +++++++++++++++++++
 2 files changed, 102 insertions(+)

Comments

Song Liu June 26, 2018, 6:21 a.m. UTC | #1
On Sun, Jun 24, 2018 at 8:54 PM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> From: Jiong Wang <jiong.wang@netronome.com>
>
> The new added "reciprocal_value_adv" implements the advanced version of the
> algorithm described in Figure 4.2 of the paper except when dividend has MSB
> set which would require u128 divide on host and actually could be easily
> handled before calling the new "reciprocal_value_adv".
>
> The advanced version requires more complex calculation to get the
> reciprocal multiplier and other control variables, but then could reduce
> the required emulation operations.
>
> It makes no sense to use this advanced version for host divide emulation,
> those extra complexities for calculating multiplier etc could completely
> waive our saving on emulation operations.
>
> However, it makes sense to use it for JIT divide code generation (for
> example eBPF JIT backends) for which we are willing to trade performance of
> JITed code with that of host. As shown by the following pseudo code, the
> required emulation operations could go down from 6 (the basic version) to 3
> or 4.
>
> To use the result of "reciprocal_value_adv", suppose we want to calculate
> n/d, the C-style pseudo code will be the following, it could be easily
> changed to real code generation for other JIT targets.
>
>   struct reciprocal_value_adv rvalue;
>   u8 pre_shift, exp;
>
>   if (d >= (1u << 31)) {
>     result = n >= d;
>     return;
>   }
>   rvalue = reciprocal_value_adv(d, 32)
>   exp = rvalue.exp;
>   if (rvalue.is_wide_m && !(d & 1)) {
>     pre_shift = fls(d & -d) - 1;
>     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
>   } else {
>     pre_shift = 0;
>   }
>
>   // code generation starts.
>   if (imm == 1 << exp) {
>     result = n >> exp;
>   } else if (rvalue.is_wide_m) {
>     // pre_shift must be zero when reached here.
>     t = (n * rvalue.m) >> 32;
>     result = n - t;
>     result >>= 1;
>     result += t;
>     result >>= rvalue.sh - 1;
>   } else {
>     if (pre_shift)
>       result = n >> pre_shift;
>     result = ((u64)result * rvalue.m) >> 32;
>     result >>= rvalue.sh;
>   }
>
> Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
> Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> ---
>  include/linux/reciprocal_div.h | 65 ++++++++++++++++++++++++++++++++++
>  lib/reciprocal_div.c           | 37 +++++++++++++++++++
>  2 files changed, 102 insertions(+)
>
> diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h
> index e031e9f2f9d8..5a695e4697d3 100644
> --- a/include/linux/reciprocal_div.h
> +++ b/include/linux/reciprocal_div.h
> @@ -25,6 +25,9 @@ struct reciprocal_value {
>         u8 sh1, sh2;
>  };
>
> +/* "reciprocal_value" and "reciprocal_divide" together implement the basic
> + * version of the algorithm described in Figure 4.1 of the paper.
> + */
>  struct reciprocal_value reciprocal_value(u32 d);
>
>  static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
> @@ -33,4 +36,66 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
>         return (t + ((a - t) >> R.sh1)) >> R.sh2;
>  }
>
> +struct reciprocal_value_adv {
> +       u32 m;
> +       u8 sh, exp;
> +       bool is_wide_m;
> +};
> +
> +/* "reciprocal_value_adv" implements the advanced version of the algorithm
> + * described in Figure 4.2 of the paper except when dividend has MSB set which
> + * would require u128 divide on host and actually could be easily handled before
> + * calling "reciprocal_value_adv".
> + *
> + * The advanced version requires more complex calculation to get the reciprocal
> + * multiplier and other control variables, but then could reduce the required
> + * emulation operations.
> + *
> + * It makes no sense to use this advanced version for host divide emulation,
> + * those extra complexities for calculating multiplier etc could completely
> + * waive our saving on emulation operations.
> + *
> + * However, it makes sense to use it for JIT divide code generation for which
> + * we are willing to trade performance of JITed code with that of host. As shown
> + * by the following pseudo code, the required emulation operations could go down
> + * from 6 (the basic version) to 3 or 4.
> + *
> + * To use the result of "reciprocal_value_adv", suppose we want to calculate
> + * n/d:
> + *
> + *   struct reciprocal_value_adv rvalue;
> + *   u8 pre_shift, exp;
> + *
> + *   if (d >= (1u << 31)) {
> + *     result = n >= d;
> + *     return;
> + *   }
> + *   rvalue = reciprocal_value_adv(d, 32)
> + *   exp = rvalue.exp;
> + *   if (rvalue.is_wide_m && !(d & 1)) {
> + *     pre_shift = fls(d & -d) - 1;
> + *     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
> + *   } else {
> + *     pre_shift = 0;
> + *   }
> + *
> + *   // code generation starts.
> + *   if (imm == 1 << exp) {
> + *     result = n >> exp;
> + *   } else if (rvalue.is_wide_m) {
> + *     // pre_shift must be zero when reached here.
> + *     t = (n * rvalue.m) >> 32;
> + *     result = n - t;
> + *     result >>= 1;
> + *     result += t;
> + *     result >>= rvalue.sh - 1;
> + *   } else {
> + *     if (pre_shift)
> + *       result = n >> pre_shift;
> + *     result = ((u64)result * rvalue.m) >> 32;
> + *     result >>= rvalue.sh;
> + *   }
> + */
> +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
> +
>  #endif /* _LINUX_RECIPROCAL_DIV_H */
> diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
> index fcb4ce682c6f..a41501ebad7c 100644
> --- a/lib/reciprocal_div.c
> +++ b/lib/reciprocal_div.c
> @@ -26,3 +26,40 @@ struct reciprocal_value reciprocal_value(u32 d)
>         return R;
>  }
>  EXPORT_SYMBOL(reciprocal_value);
> +
> +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
> +{
> +       struct reciprocal_value_adv R;
> +       u32 l, post_shift;
> +       u64 mhigh, mlow;
> +
> +       l = fls(d - 1);
> +       post_shift = l;
> +       /* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has
> +        * MSB set. This case needs to be handled before calling
> +        * "reciprocal_value_adv", please see the comment at
> +        * include/linux/reciprocal_div.h.
> +        */

Shall we handle l == 32 case better? I guess the concern here is extra
handling may
slow down the fast path? If that's the case, we should at least add a
WARNING on the
slow path.

Thanks,
Song


> +       mlow = 1ULL << (32 + l);
> +       do_div(mlow, d);
> +       mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
> +       do_div(mhigh, d);
> +
> +       for (; post_shift > 0; post_shift--) {
> +               u64 lo = mlow >> 1, hi = mhigh >> 1;
> +
> +               if (lo >= hi)
> +                       break;
> +
> +               mlow = lo;
> +               mhigh = hi;
> +       }
> +
> +       R.m = (u32)mhigh;
> +       R.sh = post_shift;
> +       R.exp = l;
> +       R.is_wide_m = mhigh > U32_MAX;
> +
> +       return R;
> +}
> +EXPORT_SYMBOL(reciprocal_value_adv);
> --
> 2.17.1
>
Jakub Kicinski June 26, 2018, 8:52 p.m. UTC | #2
On Mon, 25 Jun 2018 23:21:10 -0700, Song Liu wrote:
> > +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
> > +{
> > +       struct reciprocal_value_adv R;
> > +       u32 l, post_shift;
> > +       u64 mhigh, mlow;
> > +
> > +       l = fls(d - 1);
> > +       post_shift = l;
> > +       /* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has
> > +        * MSB set. This case needs to be handled before calling
> > +        * "reciprocal_value_adv", please see the comment at
> > +        * include/linux/reciprocal_div.h.
> > +        */  
> 
> Shall we handle l == 32 case better? I guess the concern here is extra
> handling may slow down the fast path? If that's the case, we should
> at least add a WARNING on the slow path.

Agreed, I think Jiong is travelling, hence no response.  We'll respin.
Daniel Borkmann June 27, 2018, 8:54 a.m. UTC | #3
On 06/26/2018 10:52 PM, Jakub Kicinski wrote:
> On Mon, 25 Jun 2018 23:21:10 -0700, Song Liu wrote:
>>> +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
>>> +{
>>> +       struct reciprocal_value_adv R;
>>> +       u32 l, post_shift;
>>> +       u64 mhigh, mlow;
>>> +
>>> +       l = fls(d - 1);
>>> +       post_shift = l;
>>> +       /* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has
>>> +        * MSB set. This case needs to be handled before calling
>>> +        * "reciprocal_value_adv", please see the comment at
>>> +        * include/linux/reciprocal_div.h.
>>> +        */  
>>
>> Shall we handle l == 32 case better? I guess the concern here is extra
>> handling may slow down the fast path? If that's the case, we should
>> at least add a WARNING on the slow path.
> 
> Agreed, I think Jiong is travelling, hence no response.  We'll respin.

Ok, since there's going to be a respin, I've tossed the current series from
patchwork in that case.

Thanks,
Daniel
diff mbox series

Patch

diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h
index e031e9f2f9d8..5a695e4697d3 100644
--- a/include/linux/reciprocal_div.h
+++ b/include/linux/reciprocal_div.h
@@ -25,6 +25,9 @@  struct reciprocal_value {
 	u8 sh1, sh2;
 };
 
+/* "reciprocal_value" and "reciprocal_divide" together implement the basic
+ * version of the algorithm described in Figure 4.1 of the paper.
+ */
 struct reciprocal_value reciprocal_value(u32 d);
 
 static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
@@ -33,4 +36,66 @@  static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
 	return (t + ((a - t) >> R.sh1)) >> R.sh2;
 }
 
+struct reciprocal_value_adv {
+	u32 m;
+	u8 sh, exp;
+	bool is_wide_m;
+};
+
+/* "reciprocal_value_adv" implements the advanced version of the algorithm
+ * described in Figure 4.2 of the paper except when the divisor has MSB set
+ * which would require u128 divide on host and actually could be easily
+ * handled before calling "reciprocal_value_adv".
+ *
+ * The advanced version requires more complex calculation to get the reciprocal
+ * multiplier and other control variables, but then could reduce the required
+ * emulation operations.
+ *
+ * It makes no sense to use this advanced version for host divide emulation:
+ * the extra complexity of calculating the multiplier etc. could completely
+ * outweigh the savings on emulation operations.
+ *
+ * However, it makes sense to use it for JIT divide code generation for which
+ * we are willing to trade performance of JITed code with that of host. As shown
+ * by the following pseudo code, the required emulation operations could go down
+ * from 6 (the basic version) to 3 or 4.
+ *
+ * To use the result of "reciprocal_value_adv", suppose we want to calculate
+ * n/d:
+ *
+ *   struct reciprocal_value_adv rvalue;
+ *   u8 pre_shift, exp;
+ *
+ *   if (d >= (1u << 31)) {
+ *     result = n >= d;
+ *     return;
+ *   }
+ *   rvalue = reciprocal_value_adv(d, 32);
+ *   exp = rvalue.exp;
+ *   if (rvalue.is_wide_m && !(d & 1)) {
+ *     pre_shift = fls(d & -d) - 1;
+ *     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
+ *   } else {
+ *     pre_shift = 0;
+ *   }
+ *
+ *   // code generation starts.
+ *   if (d == 1 << exp) {
+ *     result = n >> exp;
+ *   } else if (rvalue.is_wide_m) {
+ *     // pre_shift must be zero when reached here.
+ *     t = (n * rvalue.m) >> 32;
+ *     result = n - t;
+ *     result >>= 1;
+ *     result += t;
+ *     result >>= rvalue.sh - 1;
+ *   } else {
+ *     if (pre_shift)
+ *       result = n >> pre_shift;
+ *     result = ((u64)result * rvalue.m) >> 32;
+ *     result >>= rvalue.sh;
+ *   }
+ */
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
+
 #endif /* _LINUX_RECIPROCAL_DIV_H */
diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
index fcb4ce682c6f..a41501ebad7c 100644
--- a/lib/reciprocal_div.c
+++ b/lib/reciprocal_div.c
@@ -26,3 +26,40 @@  struct reciprocal_value reciprocal_value(u32 d)
 	return R;
 }
 EXPORT_SYMBOL(reciprocal_value);
+
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
+{
+	struct reciprocal_value_adv R;
+	u32 l, post_shift;
+	u64 mhigh, mlow;
+
+	l = fls(d - 1);
+	post_shift = l;
+	/* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has
+	 * MSB set. This case needs to be handled before calling
+	 * "reciprocal_value_adv", please see the comment at
+	 * include/linux/reciprocal_div.h.
+	 */
+	mlow = 1ULL << (32 + l);
+	do_div(mlow, d);
+	mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
+	do_div(mhigh, d);
+
+	for (; post_shift > 0; post_shift--) {
+		u64 lo = mlow >> 1, hi = mhigh >> 1;
+
+		if (lo >= hi)
+			break;
+
+		mlow = lo;
+		mhigh = hi;
+	}
+
+	R.m = (u32)mhigh;
+	R.sh = post_shift;
+	R.exp = l;
+	R.is_wide_m = mhigh > U32_MAX;
+
+	return R;
+}
+EXPORT_SYMBOL(reciprocal_value_adv);