Message ID | 20180625035421.2991-3-jakub.kicinski@netronome.com |
---|---|
State | Changes Requested, archived |
Delegated to: | BPF Maintainers |
Headers | show |
Series | nfp: bpf: add multiplication, divide and memcpy from maps | expand |
On Sun, Jun 24, 2018 at 8:54 PM, Jakub Kicinski <jakub.kicinski@netronome.com> wrote: > From: Jiong Wang <jiong.wang@netronome.com> > > The new added "reciprocal_value_adv" implements the advanced version of the > algorithm described in Figure 4.2 of the paper except when dividend has MSB > set which would require u128 divide on host and actually could be easily > handled before calling the new "reciprocal_value_adv". > > The advanced version requires more complex calculation to get the > reciprocal multiplier and other control variables, but then could reduce > the required emulation operations. > > It makes no sense to use this advanced version for host divide emulation, > those extra complexities for calculating multiplier etc could completely > waive our saving on emulation operations. > > However, it makes sense to use it for JIT divide code generation (for > example eBPF JIT backends) for which we are willing to trade performance of > JITed code with that of host. As shown by the following pseudo code, the > required emulation operations could go down from 6 (the basic version) to 3 > or 4. > > To use the result of "reciprocal_value_adv", suppose we want to calculate > n/d, the C-style pseudo code will be the following, it could be easily > changed to real code generation for other JIT targets. > > struct reciprocal_value_adv rvalue; > u8 pre_shift, exp; > > if (d >= (1u << 31)) { > result = n >= d; > return; > } > rvalue = reciprocal_value_adv(d, 32) > exp = rvalue.exp; > if (rvalue.is_wide_m && !(d & 1)) { > pre_shift = fls(d & -d) - 1; > rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); > } else { > pre_shift = 0; > } > > // code generation starts. > if (imm == 1 << exp) { > result = n >> exp; > } else if (rvalue.is_wide_m) { > // pre_shift must be zero when reached here. 
> t = (n * rvalue.m) >> 32; > result = n - t; > result >>= 1; > result += t; > result >>= rvalue.sh - 1; > } else { > if (pre_shift) > result = n >> pre_shift; > result = ((u64)result * rvalue.m) >> 32; > result >>= rvalue.sh; > } > > Signed-off-by: Jiong Wang <jiong.wang@netronome.com> > Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com> > --- > include/linux/reciprocal_div.h | 65 ++++++++++++++++++++++++++++++++++ > lib/reciprocal_div.c | 37 +++++++++++++++++++ > 2 files changed, 102 insertions(+) > > diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h > index e031e9f2f9d8..5a695e4697d3 100644 > --- a/include/linux/reciprocal_div.h > +++ b/include/linux/reciprocal_div.h > @@ -25,6 +25,9 @@ struct reciprocal_value { > u8 sh1, sh2; > }; > > +/* "reciprocal_value" and "reciprocal_divide" together implement the basic > + * version of the algorithm described in Figure 4.1 of the paper. > + */ > struct reciprocal_value reciprocal_value(u32 d); > > static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) > @@ -33,4 +36,66 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) > return (t + ((a - t) >> R.sh1)) >> R.sh2; > } > > +struct reciprocal_value_adv { > + u32 m; > + u8 sh, exp; > + bool is_wide_m; > +}; > + > +/* "reciprocal_value_adv" implements the advanced version of the algorithm > + * described in Figure 4.2 of the paper except when divisor has MSB set which > + * would require u128 divide on host and actually could be easily handled before > + * calling "reciprocal_value_adv". > + * > + * The advanced version requires more complex calculation to get the reciprocal > + * multiplier and other control variables, but then could reduce the required > + * emulation operations. > + * > + * It makes no sense to use this advanced version for host divide emulation, > + * those extra complexities for calculating multiplier etc could completely > + * waive our saving on emulation operations. 
> + * > + * However, it makes sense to use it for JIT divide code generation for which > + * we are willing to trade performance of JITed code with that of host. As shown > + * by the following pseudo code, the required emulation operations could go down > + * from 6 (the basic version) to 3 or 4. > + * > + * To use the result of "reciprocal_value_adv", suppose we want to calculate > + * n/d: > + * > + * struct reciprocal_value_adv rvalue; > + * u8 pre_shift, exp; > + * > + * if (d >= (1u << 31)) { > + * result = n >= d; > + * return; > + * } > + * rvalue = reciprocal_value_adv(d, 32) > + * exp = rvalue.exp; > + * if (rvalue.is_wide_m && !(d & 1)) { > + * pre_shift = fls(d & -d) - 1; > + * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); > + * } else { > + * pre_shift = 0; > + * } > + * > + * // code generation starts. > + * if (imm == 1 << exp) { > + * result = n >> exp; > + * } else if (rvalue.is_wide_m) { > + * // pre_shift must be zero when reached here. > + * t = (n * rvalue.m) >> 32; > + * result = n - t; > + * result >>= 1; > + * result += t; > + * result >>= rvalue.sh - 1; > + * } else { > + * if (pre_shift) > + * result = n >> pre_shift; > + * result = ((u64)result * rvalue.m) >> 32; > + * result >>= rvalue.sh; > + * } > + */ > +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec); > + > #endif /* _LINUX_RECIPROCAL_DIV_H */ > diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c > index fcb4ce682c6f..a41501ebad7c 100644 > --- a/lib/reciprocal_div.c > +++ b/lib/reciprocal_div.c > @@ -26,3 +26,40 @@ struct reciprocal_value reciprocal_value(u32 d) > return R; > } > EXPORT_SYMBOL(reciprocal_value); > + > +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec) > +{ > + struct reciprocal_value_adv R; > + u32 l, post_shift; > + u64 mhigh, mlow; > + > + l = fls(d - 1); > + post_shift = l; > + /* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has > + * MSB set. 
This case needs to be handled before calling > + * "reciprocal_value_adv", please see the comment at > + * include/linux/reciprocal_div.h. > + */ Shall we handle l == 32 case better? I guess the concern here is extra handling may slow down the fast path? If that's the case, we should at least add a WARNING on the slow path. Thanks, Song > + mlow = 1ULL << (32 + l); > + do_div(mlow, d); > + mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec)); > + do_div(mhigh, d); > + > + for (; post_shift > 0; post_shift--) { > + u64 lo = mlow >> 1, hi = mhigh >> 1; > + > + if (lo >= hi) > + break; > + > + mlow = lo; > + mhigh = hi; > + } > + > + R.m = (u32)mhigh; > + R.sh = post_shift; > + R.exp = l; > + R.is_wide_m = mhigh > U32_MAX; > + > + return R; > +} > +EXPORT_SYMBOL(reciprocal_value_adv); > -- > 2.17.1 >
On Mon, 25 Jun 2018 23:21:10 -0700, Song Liu wrote: > > +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec) > > +{ > > + struct reciprocal_value_adv R; > > + u32 l, post_shift; > > + u64 mhigh, mlow; > > + > > + l = fls(d - 1); > > + post_shift = l; > > + /* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has > > + * MSB set. This case needs to be handled before calling > > + * "reciprocal_value_adv", please see the comment at > > + * include/linux/reciprocal_div.h. > > + */ > > Shall we handle l == 32 case better? I guess the concern here is extra > handling may slow down the fast path? If that's the case, we should > at least add a WARNING on the slow path. Agreed, I think Jiong is travelling, hence no response. We'll respin.
On 06/26/2018 10:52 PM, Jakub Kicinski wrote: > On Mon, 25 Jun 2018 23:21:10 -0700, Song Liu wrote: >>> +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec) >>> +{ >>> + struct reciprocal_value_adv R; >>> + u32 l, post_shift; >>> + u64 mhigh, mlow; >>> + >>> + l = fls(d - 1); >>> + post_shift = l; >>> + /* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has >>> + * MSB set. This case needs to be handled before calling >>> + * "reciprocal_value_adv", please see the comment at >>> + * include/linux/reciprocal_div.h. >>> + */ >> >> Shall we handle l == 32 case better? I guess the concern here is extra >> handling may slow down the fast path? If that's the case, we should >> at least add a WARNING on the slow path. > > Agreed, I think Jiong is travelling, hence no response. We'll respin. Ok, since there's going to be a respin, I've tossed the current series from patchwork in that case. Thanks, Daniel
diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h index e031e9f2f9d8..5a695e4697d3 100644 --- a/include/linux/reciprocal_div.h +++ b/include/linux/reciprocal_div.h @@ -25,6 +25,9 @@ struct reciprocal_value { u8 sh1, sh2; }; +/* "reciprocal_value" and "reciprocal_divide" together implement the basic + * version of the algorithm described in Figure 4.1 of the paper. + */ struct reciprocal_value reciprocal_value(u32 d); static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) @@ -33,4 +36,66 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) return (t + ((a - t) >> R.sh1)) >> R.sh2; } +struct reciprocal_value_adv { + u32 m; + u8 sh, exp; + bool is_wide_m; +}; + +/* "reciprocal_value_adv" implements the advanced version of the algorithm + * described in Figure 4.2 of the paper except when divisor has MSB set which + * would require u128 divide on host and actually could be easily handled before + * calling "reciprocal_value_adv". + * + * The advanced version requires more complex calculation to get the reciprocal + * multiplier and other control variables, but then could reduce the required + * emulation operations. + * + * It makes no sense to use this advanced version for host divide emulation, + * those extra complexities for calculating multiplier etc could completely + * waive our saving on emulation operations. + * + * However, it makes sense to use it for JIT divide code generation for which + * we are willing to trade performance of JITed code with that of host. As shown + * by the following pseudo code, the required emulation operations could go down + * from 6 (the basic version) to 3 or 4. 
+ * + * To use the result of "reciprocal_value_adv", suppose we want to calculate + * n/d: + * + * struct reciprocal_value_adv rvalue; + * u8 pre_shift, exp; + * + * if (d >= (1u << 31)) { + * result = n >= d; + * return; + * } + * rvalue = reciprocal_value_adv(d, 32); + * exp = rvalue.exp; + * if (rvalue.is_wide_m && !(d & 1)) { + * pre_shift = fls(d & -d) - 1; + * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); + * } else { + * pre_shift = 0; + * } + * + * // code generation starts. + * if (d == 1 << exp) { + * result = n >> exp; + * } else if (rvalue.is_wide_m) { + * // pre_shift must be zero when reached here. + * t = (n * rvalue.m) >> 32; + * result = n - t; + * result >>= 1; + * result += t; + * result >>= rvalue.sh - 1; + * } else { + * if (pre_shift) + * result = n >> pre_shift; + * result = ((u64)result * rvalue.m) >> 32; + * result >>= rvalue.sh; + * } + */ +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec); + #endif /* _LINUX_RECIPROCAL_DIV_H */ diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c index fcb4ce682c6f..a41501ebad7c 100644 --- a/lib/reciprocal_div.c +++ b/lib/reciprocal_div.c @@ -26,3 +26,40 @@ struct reciprocal_value reciprocal_value(u32 d) return R; } EXPORT_SYMBOL(reciprocal_value); + +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec) +{ + struct reciprocal_value_adv R; + u32 l, post_shift; + u64 mhigh, mlow; + + l = fls(d - 1); + post_shift = l; + /* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has + * MSB set. This case needs to be handled before calling + * "reciprocal_value_adv", please see the comment at + * include/linux/reciprocal_div.h. 
+ */ + mlow = 1ULL << (32 + l); + do_div(mlow, d); + mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec)); + do_div(mhigh, d); + + for (; post_shift > 0; post_shift--) { + u64 lo = mlow >> 1, hi = mhigh >> 1; + + if (lo >= hi) + break; + + mlow = lo; + mhigh = hi; + } + + R.m = (u32)mhigh; + R.sh = post_shift; + R.exp = l; + R.is_wide_m = mhigh > U32_MAX; + + return R; +} +EXPORT_SYMBOL(reciprocal_value_adv);