lib/bch: Remove VLA usage

Message ID 20180529224207.GA13354@beast
State New
Delegated to: Boris Brezillon
Headers show
Series
  • lib/bch: Remove VLA usage
Related show

Commit Message

Kees Cook May 29, 2018, 10:42 p.m.
In the quest to remove all stack VLA usage from the kernel[1], this removes
the on-stack working buffers in favor of pre-allocated working buffers
(which were already used in other places). Since these routines must
already be serialized (since they work on bch->ecc_buf), adding the usage
of bch->ecc_work would be similarly safe. Additionally, since "max m" is
only 15, this was adjusted to just use a fixed size array in those cases.

[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com

Signed-off-by: Kees Cook <keescook@chromium.org>
---
This is directed at linux-mtd because it's the only user of this library
and it's how it originally entered the kernel tree...
---
 include/linux/bch.h |  4 ++--
 lib/bch.c           | 27 +++++++++++++++------------
 2 files changed, 17 insertions(+), 14 deletions(-)

Comments

Ivan Djelic May 30, 2018, 1:46 p.m. | #1
On Tue, May 29, 2018 at 03:42:07PM -0700, Kees Cook wrote:
> In the quest to remove all stack VLA usage from the kernel[1], this removes
> the on-stack working buffers in favor of pre-allocated working buffers
> (which were already used in other places). Since these routines must
> already be serialized (since they work on bch->ecc_buf), adding the usage
> of bch->ecc_work would be similarly safe. Additionally, since "max m" is
> only 15, this was adjusted to just use a fixed size array in those cases.

Hi Kees,

Using an on-stack buffer instead of a pre-allocated buffer was done initially
for performance reasons.  For "usual" (m,t) values (for instance m=13, t=4),
there is a huge performance difference between the on-stack buffer version and
the kmalloc version. I didn't investigate the reason for this, but I ran a
quick benchmark on my PC:

little-endian, type sizes: int=4 long=8 longlong=8
cpu: Intel(R) Core(TM) i5 CPU         650  @ 3.20GHz
calibration: iter=4.9143µs niter=2034 nsamples=200 m=13 t=4

  Buffer allocation |  Encoding throughput (Mbit/s)
---------------------------------------------------
 on-stack, VLA      |   3988
 on-stack, fixed    |   4494
 kmalloc            |   1967

The first line shows the performance of the current code, using a VLA.
The second line shows the performance when r[] is allocated on the stack with
a fixed, constant size (the maximum allowed value).
The third line shows the performance when r is a pre-allocated working buffer.

In fact, when using a pre-allocated buffer there is no need to introduce 'ecc_work':
you can directly point 'r' to bch->ecc_buf and remove memcpy() surrounding the
'while (mlen--)' loop. Everything happens inside the 'bch->ecc_buf' buffer.
This comes with a big performance penalty, however. It looks like declaring a temporary
buffer on the stack to store ECC values allows GCC to do a better job at optimizing the loop.

So rather than introducing 'ecc_work', I suggest we compute the maximum allowed
size for r[] and use that:

sizeof(r) = sizeof(uint32_t)*(l+1)
l+1 = BCH_ECC_WORDS(bch) = DIV_ROUND_UP(m*t, 32)

We also know that:

m*t < 2^m - 1 (ECC maximum size)

therefore:

l+1 <= DIV_ROUND_UP(2^m - 1, 32) = 2^(m-5)    (for m >= 5)

So instead of 'uint32_t r[l+1]' we could declare 'uint32_t r[1 << (BCH_MAX_M-5)]'.
And replace 'sizeof(r)' with 'sizeof(*bch->ecc_buf)*(l+1)' in memset/memcpy calls.
In practice the actual maximum size of r[] is (1 << (15-5))*sizeof(uint32_t) = 4096 bytes.

What do you think ?
--
Ivan
 




> [1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com
> 
> Signed-off-by: Kees Cook <keescook@chromium.org>
> ---
> This is directed at linux-mtd because it's the only user of this library
> and it's how it originally entered the kernel tree...
> ---
>  include/linux/bch.h |  4 ++--
>  lib/bch.c           | 27 +++++++++++++++------------
>  2 files changed, 17 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/bch.h b/include/linux/bch.h
> index 295b4ef153bb..4d46e6a73319 100644
> --- a/include/linux/bch.h
> +++ b/include/linux/bch.h
> @@ -39,7 +39,7 @@
>   * @a_log_tab:  Galois field GF(2^m) log lookup table
>   * @mod8_tab:   remainder generator polynomial lookup tables
>   * @ecc_buf:    ecc parity words buffer
> - * @ecc_buf2:   ecc parity words buffer
> + * @ecc_work:   ecc parity words working buffer
>   * @xi_tab:     GF(2^m) base for solving degree 2 polynomial roots
>   * @syn:        syndrome buffer
>   * @cache:      log-based polynomial representation buffer
> @@ -57,7 +57,7 @@ struct bch_control {
>  	uint16_t       *a_log_tab;
>  	uint32_t       *mod8_tab;
>  	uint32_t       *ecc_buf;
> -	uint32_t       *ecc_buf2;
> +	uint32_t       *ecc_work;
>  	unsigned int   *xi_tab;
>  	unsigned int   *syn;
>  	int            *cache;
> diff --git a/lib/bch.c b/lib/bch.c
> index bc89dfe4d1b3..f14eac93ecc4 100644
> --- a/lib/bch.c
> +++ b/lib/bch.c
> @@ -78,10 +78,12 @@
>  #define GF_M(_p)               (CONFIG_BCH_CONST_M)
>  #define GF_T(_p)               (CONFIG_BCH_CONST_T)
>  #define GF_N(_p)               ((1 << (CONFIG_BCH_CONST_M))-1)
> +#define BCH_MAX_M              (CONFIG_BCH_CONST_M)
>  #else
>  #define GF_M(_p)               ((_p)->m)
>  #define GF_T(_p)               ((_p)->t)
>  #define GF_N(_p)               ((_p)->n)
> +#define BCH_MAX_M              15
>  #endif
>  
>  #define BCH_ECC_WORDS(_p)      DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 32)
> @@ -187,7 +189,7 @@ void encode_bch(struct bch_control *bch, const uint8_t *data,
>  	const unsigned int l = BCH_ECC_WORDS(bch)-1;
>  	unsigned int i, mlen;
>  	unsigned long m;
> -	uint32_t w, r[l+1];
> +	uint32_t w;
>  	const uint32_t * const tab0 = bch->mod8_tab;
>  	const uint32_t * const tab1 = tab0 + 256*(l+1);
>  	const uint32_t * const tab2 = tab1 + 256*(l+1);
> @@ -198,7 +200,7 @@ void encode_bch(struct bch_control *bch, const uint8_t *data,
>  		/* load ecc parity bytes into internal 32-bit buffer */
>  		load_ecc8(bch, bch->ecc_buf, ecc);
>  	} else {
> -		memset(bch->ecc_buf, 0, sizeof(r));
> +		memset(bch->ecc_work, 0, bch->ecc_bytes);
>  	}
>  
>  	/* process first unaligned data bytes */
> @@ -215,7 +217,7 @@ void encode_bch(struct bch_control *bch, const uint8_t *data,
>  	mlen  = len/4;
>  	data += 4*mlen;
>  	len  -= 4*mlen;
> -	memcpy(r, bch->ecc_buf, sizeof(r));
> +	memcpy(bch->ecc_work, bch->ecc_buf, bch->ecc_bytes);
>  
>  	/*
>  	 * split each 32-bit word into 4 polynomials of weight 8 as follows:
> @@ -229,6 +231,8 @@ void encode_bch(struct bch_control *bch, const uint8_t *data,
>  	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt  mod g = r0^r1^r2^r3
>  	 */
>  	while (mlen--) {
> +		uint32_t *r = bch->ecc_work;
> +
>  		/* input data is read in big-endian format */
>  		w = r[0]^cpu_to_be32(*pdata++);
>  		p0 = tab0 + (l+1)*((w >>  0) & 0xff);
> @@ -241,7 +245,7 @@ void encode_bch(struct bch_control *bch, const uint8_t *data,
>  
>  		r[l] = p0[l]^p1[l]^p2[l]^p3[l];
>  	}
> -	memcpy(bch->ecc_buf, r, sizeof(r));
> +	memcpy(bch->ecc_buf, bch->ecc_work, bch->ecc_bytes);
>  
>  	/* process last unaligned bytes */
>  	if (len)
> @@ -434,7 +438,7 @@ static int solve_linear_system(struct bch_control *bch, unsigned int *rows,
>  {
>  	const int m = GF_M(bch);
>  	unsigned int tmp, mask;
> -	int rem, c, r, p, k, param[m];
> +	int rem, c, r, p, k, param[BCH_MAX_M];
>  
>  	k = 0;
>  	mask = 1 << m;
> @@ -1009,10 +1013,10 @@ int decode_bch(struct bch_control *bch, const uint8_t *data, unsigned int len,
>  		}
>  		/* load received ecc or assume it was XORed in calc_ecc */
>  		if (recv_ecc) {
> -			load_ecc8(bch, bch->ecc_buf2, recv_ecc);
> +			load_ecc8(bch, bch->ecc_work, recv_ecc);
>  			/* XOR received and calculated ecc */
>  			for (i = 0, sum = 0; i < (int)ecc_words; i++) {
> -				bch->ecc_buf[i] ^= bch->ecc_buf2[i];
> +				bch->ecc_buf[i] ^= bch->ecc_work[i];
>  				sum |= bch->ecc_buf[i];
>  			}
>  			if (!sum)
> @@ -1114,7 +1118,7 @@ static int build_deg2_base(struct bch_control *bch)
>  {
>  	const int m = GF_M(bch);
>  	int i, j, r;
> -	unsigned int sum, x, y, remaining, ak = 0, xi[m];
> +	unsigned int sum, x, y, remaining, ak = 0, xi[BCH_MAX_M];
>  
>  	/* find k s.t. Tr(a^k) = 1 and 0 <= k < m */
>  	for (i = 0; i < m; i++) {
> @@ -1254,7 +1258,6 @@ struct bch_control *init_bch(int m, int t, unsigned int prim_poly)
>  	struct bch_control *bch = NULL;
>  
>  	const int min_m = 5;
> -	const int max_m = 15;
>  
>  	/* default primitive polynomials */
>  	static const unsigned int prim_poly_tab[] = {
> @@ -1270,7 +1273,7 @@ struct bch_control *init_bch(int m, int t, unsigned int prim_poly)
>  		goto fail;
>  	}
>  #endif
> -	if ((m < min_m) || (m > max_m))
> +	if ((m < min_m) || (m > BCH_MAX_M))
>  		/*
>  		 * values of m greater than 15 are not currently supported;
>  		 * supporting m > 15 would require changing table base type
> @@ -1300,7 +1303,7 @@ struct bch_control *init_bch(int m, int t, unsigned int prim_poly)
>  	bch->a_log_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_log_tab), &err);
>  	bch->mod8_tab  = bch_alloc(words*1024*sizeof(*bch->mod8_tab), &err);
>  	bch->ecc_buf   = bch_alloc(words*sizeof(*bch->ecc_buf), &err);
> -	bch->ecc_buf2  = bch_alloc(words*sizeof(*bch->ecc_buf2), &err);
> +	bch->ecc_work  = bch_alloc(words*sizeof(*bch->ecc_work), &err);
>  	bch->xi_tab    = bch_alloc(m*sizeof(*bch->xi_tab), &err);
>  	bch->syn       = bch_alloc(2*t*sizeof(*bch->syn), &err);
>  	bch->cache     = bch_alloc(2*t*sizeof(*bch->cache), &err);
> @@ -1349,7 +1352,7 @@ void free_bch(struct bch_control *bch)
>  		kfree(bch->a_log_tab);
>  		kfree(bch->mod8_tab);
>  		kfree(bch->ecc_buf);
> -		kfree(bch->ecc_buf2);
> +		kfree(bch->ecc_work);
>  		kfree(bch->xi_tab);
>  		kfree(bch->syn);
>  		kfree(bch->cache);
> -- 
> 2.17.0
> 
> 
> -- 
> Kees Cook
> Pixel Security
Kees Cook May 30, 2018, 9:12 p.m. | #2
On Wed, May 30, 2018 at 6:46 AM, Ivan Djelic <ivan.djelic@parrot.com> wrote:
> On Tue, May 29, 2018 at 03:42:07PM -0700, Kees Cook wrote:
>> In the quest to remove all stack VLA usage from the kernel[1], this removes
>> the on-stack working buffers in favor of pre-allocated working buffers
>> (which were already used in other places). Since these routines must
>> already be serialized (since they work on bch->ecc_buf), adding the usage
>> of bch->ecc_work would be similarly safe. Additionally, since "max m" is
>> only 15, this was adjusted to just use a fixed size array in those cases.
>
> Hi Kees,
>
> Using an on-stack buffer instead of a pre-allocated buffer was done initially
> for performance reasons.  For "usual" (m,t) values (for instance m=13, t=4),
> there is a huge performance difference between the on-stack buffer version and
> the kmalloc version. I didn't investigate the reason for this, but I ran a
> quick benchmark on my PC:
>
> little-endian, type sizes: int=4 long=8 longlong=8
> cpu: Intel(R) Core(TM) i5 CPU         650  @ 3.20GHz
> calibration: iter=4.9143µs niter=2034 nsamples=200 m=13 t=4
>
>   Buffer allocation |  Encoding throughput (Mbit/s)
> ---------------------------------------------------
>  on-stack, VLA      |   3988
>  on-stack, fixed    |   4494
>  kmalloc            |   1967
>
> The first line shows the performance of the current code, using a VLA.
> The second line shows the performance when r[] is allocated on the stack with
> a fixed, constant size (the maximum allowed value).
> The third line shows the performance when r is a pre-allocated working buffer.
>
> In fact, when using a pre-allocated buffer there is no need to introduce 'ecc_work':
> you can directly point 'r' to bch->ecc_buf and remove memcpy() surrounding the
> 'while (mlen--)' loop. Everything happens inside the 'bch->ecc_buf' buffer.
> But with a big performance penalty. Looks like declaring a temporary buffer on the
> stack to store ECC values allows GCC to do a better job at optimizing the loop.
>
> So rather than introducing 'ecc_work', I suggest we compute the maximum allowed
> size for r[] and use that:
>
> sizeof(r) = sizeof(uint32_t)*(l+1)
> l+1 = BCH_ECC_WORDS(bch) = DIV_ROUND_UP(m*t, 32)
>
> We also know that:
>
> m*t < 2^m - 1 (ECC maximum size)
>
> therefore:
>
> l+1 < DIV_ROUND_UP(2^m - 1, 32) < 2^(m-5)
>
> So instead of 'uint32_t r[l+1]' we could declare 'uint32_t r[1 << (BCH_MAX_M-5)]'.
> And replace 'sizeof(r)' with 'sizeof(*bch->ecc_buf)*(l+1)' in memset/memcpy calls.
> In practice the actual maximum size of r[] is (1 << (15-5))*sizeof(uint32_t) = 4096 bytes.
>
> What do you think ?

I actually did that implementation first since I didn't realize how
large that allocation could get. 4096 is a HUGE stack allocation. The
kernel build warns at 2048. The defaults seen during allmodconfig are:

CONFIG_BCH_CONST_M=14
CONFIG_BCH_CONST_T=4

So those builds are already seeing a large stack allocation, but it
was hidden from the checking tools before because it was a dynamic
stack allocation:

lib/bch.c: In function ‘encode_bch’:
lib/bch.c:261:1: warning: the frame size of 2288 bytes is larger than
2048 bytes [-Wframe-larger-than=]

This could be masked in the Makefile, though, since this is already
the situation the code runs under. I'll send that patch...

-Kees
kbuild test robot May 31, 2018, 11:49 a.m. | #3
Hi Kees,

I love your patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v4.17-rc7 next-20180530]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Kees-Cook/lib-bch-Remove-VLA-usage/20180531-055540
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

>> lib/bch.c:237:26: sparse: restricted __be32 degrades to integer

vim +237 lib/bch.c

437aa565 Ivan Djelic 2011-03-11  171  
437aa565 Ivan Djelic 2011-03-11  172  /**
437aa565 Ivan Djelic 2011-03-11  173   * encode_bch - calculate BCH ecc parity of data
437aa565 Ivan Djelic 2011-03-11  174   * @bch:   BCH control structure
437aa565 Ivan Djelic 2011-03-11  175   * @data:  data to encode
437aa565 Ivan Djelic 2011-03-11  176   * @len:   data length in bytes
437aa565 Ivan Djelic 2011-03-11  177   * @ecc:   ecc parity data, must be initialized by caller
437aa565 Ivan Djelic 2011-03-11  178   *
437aa565 Ivan Djelic 2011-03-11  179   * The @ecc parity array is used both as input and output parameter, in order to
437aa565 Ivan Djelic 2011-03-11  180   * allow incremental computations. It should be of the size indicated by member
437aa565 Ivan Djelic 2011-03-11  181   * @ecc_bytes of @bch, and should be initialized to 0 before the first call.
437aa565 Ivan Djelic 2011-03-11  182   *
437aa565 Ivan Djelic 2011-03-11  183   * The exact number of computed ecc parity bits is given by member @ecc_bits of
437aa565 Ivan Djelic 2011-03-11  184   * @bch; it may be less than m*t for large values of t.
437aa565 Ivan Djelic 2011-03-11  185   */
437aa565 Ivan Djelic 2011-03-11  186  void encode_bch(struct bch_control *bch, const uint8_t *data,
437aa565 Ivan Djelic 2011-03-11  187  		unsigned int len, uint8_t *ecc)
437aa565 Ivan Djelic 2011-03-11  188  {
437aa565 Ivan Djelic 2011-03-11  189  	const unsigned int l = BCH_ECC_WORDS(bch)-1;
437aa565 Ivan Djelic 2011-03-11  190  	unsigned int i, mlen;
437aa565 Ivan Djelic 2011-03-11  191  	unsigned long m;
da5dc7be Kees Cook   2018-05-29  192  	uint32_t w;
437aa565 Ivan Djelic 2011-03-11  193  	const uint32_t * const tab0 = bch->mod8_tab;
437aa565 Ivan Djelic 2011-03-11  194  	const uint32_t * const tab1 = tab0 + 256*(l+1);
437aa565 Ivan Djelic 2011-03-11  195  	const uint32_t * const tab2 = tab1 + 256*(l+1);
437aa565 Ivan Djelic 2011-03-11  196  	const uint32_t * const tab3 = tab2 + 256*(l+1);
437aa565 Ivan Djelic 2011-03-11  197  	const uint32_t *pdata, *p0, *p1, *p2, *p3;
437aa565 Ivan Djelic 2011-03-11  198  
437aa565 Ivan Djelic 2011-03-11  199  	if (ecc) {
437aa565 Ivan Djelic 2011-03-11  200  		/* load ecc parity bytes into internal 32-bit buffer */
437aa565 Ivan Djelic 2011-03-11  201  		load_ecc8(bch, bch->ecc_buf, ecc);
437aa565 Ivan Djelic 2011-03-11  202  	} else {
da5dc7be Kees Cook   2018-05-29  203  		memset(bch->ecc_work, 0, bch->ecc_bytes);
437aa565 Ivan Djelic 2011-03-11  204  	}
437aa565 Ivan Djelic 2011-03-11  205  
437aa565 Ivan Djelic 2011-03-11  206  	/* process first unaligned data bytes */
437aa565 Ivan Djelic 2011-03-11  207  	m = ((unsigned long)data) & 3;
437aa565 Ivan Djelic 2011-03-11  208  	if (m) {
437aa565 Ivan Djelic 2011-03-11  209  		mlen = (len < (4-m)) ? len : 4-m;
437aa565 Ivan Djelic 2011-03-11  210  		encode_bch_unaligned(bch, data, mlen, bch->ecc_buf);
437aa565 Ivan Djelic 2011-03-11  211  		data += mlen;
437aa565 Ivan Djelic 2011-03-11  212  		len  -= mlen;
437aa565 Ivan Djelic 2011-03-11  213  	}
437aa565 Ivan Djelic 2011-03-11  214  
437aa565 Ivan Djelic 2011-03-11  215  	/* process 32-bit aligned data words */
437aa565 Ivan Djelic 2011-03-11  216  	pdata = (uint32_t *)data;
437aa565 Ivan Djelic 2011-03-11  217  	mlen  = len/4;
437aa565 Ivan Djelic 2011-03-11  218  	data += 4*mlen;
437aa565 Ivan Djelic 2011-03-11  219  	len  -= 4*mlen;
da5dc7be Kees Cook   2018-05-29  220  	memcpy(bch->ecc_work, bch->ecc_buf, bch->ecc_bytes);
437aa565 Ivan Djelic 2011-03-11  221  
437aa565 Ivan Djelic 2011-03-11  222  	/*
437aa565 Ivan Djelic 2011-03-11  223  	 * split each 32-bit word into 4 polynomials of weight 8 as follows:
437aa565 Ivan Djelic 2011-03-11  224  	 *
437aa565 Ivan Djelic 2011-03-11  225  	 * 31 ...24  23 ...16  15 ... 8  7 ... 0
437aa565 Ivan Djelic 2011-03-11  226  	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt
437aa565 Ivan Djelic 2011-03-11  227  	 *                               tttttttt  mod g = r0 (precomputed)
437aa565 Ivan Djelic 2011-03-11  228  	 *                     zzzzzzzz  00000000  mod g = r1 (precomputed)
437aa565 Ivan Djelic 2011-03-11  229  	 *           yyyyyyyy  00000000  00000000  mod g = r2 (precomputed)
437aa565 Ivan Djelic 2011-03-11  230  	 * xxxxxxxx  00000000  00000000  00000000  mod g = r3 (precomputed)
437aa565 Ivan Djelic 2011-03-11  231  	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt  mod g = r0^r1^r2^r3
437aa565 Ivan Djelic 2011-03-11  232  	 */
437aa565 Ivan Djelic 2011-03-11  233  	while (mlen--) {
da5dc7be Kees Cook   2018-05-29  234  		uint32_t *r = bch->ecc_work;
da5dc7be Kees Cook   2018-05-29  235  
437aa565 Ivan Djelic 2011-03-11  236  		/* input data is read in big-endian format */
437aa565 Ivan Djelic 2011-03-11 @237  		w = r[0]^cpu_to_be32(*pdata++);
437aa565 Ivan Djelic 2011-03-11  238  		p0 = tab0 + (l+1)*((w >>  0) & 0xff);
437aa565 Ivan Djelic 2011-03-11  239  		p1 = tab1 + (l+1)*((w >>  8) & 0xff);
437aa565 Ivan Djelic 2011-03-11  240  		p2 = tab2 + (l+1)*((w >> 16) & 0xff);
437aa565 Ivan Djelic 2011-03-11  241  		p3 = tab3 + (l+1)*((w >> 24) & 0xff);
437aa565 Ivan Djelic 2011-03-11  242  
437aa565 Ivan Djelic 2011-03-11  243  		for (i = 0; i < l; i++)
437aa565 Ivan Djelic 2011-03-11  244  			r[i] = r[i+1]^p0[i]^p1[i]^p2[i]^p3[i];
437aa565 Ivan Djelic 2011-03-11  245  
437aa565 Ivan Djelic 2011-03-11  246  		r[l] = p0[l]^p1[l]^p2[l]^p3[l];
437aa565 Ivan Djelic 2011-03-11  247  	}
da5dc7be Kees Cook   2018-05-29  248  	memcpy(bch->ecc_buf, bch->ecc_work, bch->ecc_bytes);
437aa565 Ivan Djelic 2011-03-11  249  
437aa565 Ivan Djelic 2011-03-11  250  	/* process last unaligned bytes */
437aa565 Ivan Djelic 2011-03-11  251  	if (len)
437aa565 Ivan Djelic 2011-03-11  252  		encode_bch_unaligned(bch, data, len, bch->ecc_buf);
437aa565 Ivan Djelic 2011-03-11  253  
437aa565 Ivan Djelic 2011-03-11  254  	/* store ecc parity bytes into original parity buffer */
437aa565 Ivan Djelic 2011-03-11  255  	if (ecc)
437aa565 Ivan Djelic 2011-03-11  256  		store_ecc8(bch, ecc, bch->ecc_buf);
437aa565 Ivan Djelic 2011-03-11  257  }
437aa565 Ivan Djelic 2011-03-11  258  EXPORT_SYMBOL_GPL(encode_bch);
437aa565 Ivan Djelic 2011-03-11  259  

:::::: The code at line 237 was first introduced by commit
:::::: 437aa565e2656776a7104aaacd792fe789ea8b2d lib: add shared BCH ECC library

:::::: TO: Ivan Djelic <ivan.djelic@parrot.com>
:::::: CC: David Woodhouse <David.Woodhouse@intel.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

Patch

diff --git a/include/linux/bch.h b/include/linux/bch.h
index 295b4ef153bb..4d46e6a73319 100644
--- a/include/linux/bch.h
+++ b/include/linux/bch.h
@@ -39,7 +39,7 @@ 
  * @a_log_tab:  Galois field GF(2^m) log lookup table
  * @mod8_tab:   remainder generator polynomial lookup tables
  * @ecc_buf:    ecc parity words buffer
- * @ecc_buf2:   ecc parity words buffer
+ * @ecc_work:   ecc parity words working buffer
  * @xi_tab:     GF(2^m) base for solving degree 2 polynomial roots
  * @syn:        syndrome buffer
  * @cache:      log-based polynomial representation buffer
@@ -57,7 +57,7 @@  struct bch_control {
 	uint16_t       *a_log_tab;
 	uint32_t       *mod8_tab;
 	uint32_t       *ecc_buf;
-	uint32_t       *ecc_buf2;
+	uint32_t       *ecc_work;
 	unsigned int   *xi_tab;
 	unsigned int   *syn;
 	int            *cache;
diff --git a/lib/bch.c b/lib/bch.c
index bc89dfe4d1b3..f14eac93ecc4 100644
--- a/lib/bch.c
+++ b/lib/bch.c
@@ -78,10 +78,12 @@ 
 #define GF_M(_p)               (CONFIG_BCH_CONST_M)
 #define GF_T(_p)               (CONFIG_BCH_CONST_T)
 #define GF_N(_p)               ((1 << (CONFIG_BCH_CONST_M))-1)
+#define BCH_MAX_M              (CONFIG_BCH_CONST_M)
 #else
 #define GF_M(_p)               ((_p)->m)
 #define GF_T(_p)               ((_p)->t)
 #define GF_N(_p)               ((_p)->n)
+#define BCH_MAX_M              15
 #endif
 
 #define BCH_ECC_WORDS(_p)      DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 32)
@@ -187,7 +189,7 @@  void encode_bch(struct bch_control *bch, const uint8_t *data,
 	const unsigned int l = BCH_ECC_WORDS(bch)-1;
 	unsigned int i, mlen;
 	unsigned long m;
-	uint32_t w, r[l+1];
+	uint32_t w;
 	const uint32_t * const tab0 = bch->mod8_tab;
 	const uint32_t * const tab1 = tab0 + 256*(l+1);
 	const uint32_t * const tab2 = tab1 + 256*(l+1);
@@ -198,7 +200,7 @@  void encode_bch(struct bch_control *bch, const uint8_t *data,
 		/* load ecc parity bytes into internal 32-bit buffer */
 		load_ecc8(bch, bch->ecc_buf, ecc);
 	} else {
-		memset(bch->ecc_buf, 0, sizeof(r));
+		memset(bch->ecc_work, 0, bch->ecc_bytes);
 	}
 
 	/* process first unaligned data bytes */
@@ -215,7 +217,7 @@  void encode_bch(struct bch_control *bch, const uint8_t *data,
 	mlen  = len/4;
 	data += 4*mlen;
 	len  -= 4*mlen;
-	memcpy(r, bch->ecc_buf, sizeof(r));
+	memcpy(bch->ecc_work, bch->ecc_buf, bch->ecc_bytes);
 
 	/*
 	 * split each 32-bit word into 4 polynomials of weight 8 as follows:
@@ -229,6 +231,8 @@  void encode_bch(struct bch_control *bch, const uint8_t *data,
 	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt  mod g = r0^r1^r2^r3
 	 */
 	while (mlen--) {
+		uint32_t *r = bch->ecc_work;
+
 		/* input data is read in big-endian format */
 		w = r[0]^cpu_to_be32(*pdata++);
 		p0 = tab0 + (l+1)*((w >>  0) & 0xff);
@@ -241,7 +245,7 @@  void encode_bch(struct bch_control *bch, const uint8_t *data,
 
 		r[l] = p0[l]^p1[l]^p2[l]^p3[l];
 	}
-	memcpy(bch->ecc_buf, r, sizeof(r));
+	memcpy(bch->ecc_buf, bch->ecc_work, bch->ecc_bytes);
 
 	/* process last unaligned bytes */
 	if (len)
@@ -434,7 +438,7 @@  static int solve_linear_system(struct bch_control *bch, unsigned int *rows,
 {
 	const int m = GF_M(bch);
 	unsigned int tmp, mask;
-	int rem, c, r, p, k, param[m];
+	int rem, c, r, p, k, param[BCH_MAX_M];
 
 	k = 0;
 	mask = 1 << m;
@@ -1009,10 +1013,10 @@  int decode_bch(struct bch_control *bch, const uint8_t *data, unsigned int len,
 		}
 		/* load received ecc or assume it was XORed in calc_ecc */
 		if (recv_ecc) {
-			load_ecc8(bch, bch->ecc_buf2, recv_ecc);
+			load_ecc8(bch, bch->ecc_work, recv_ecc);
 			/* XOR received and calculated ecc */
 			for (i = 0, sum = 0; i < (int)ecc_words; i++) {
-				bch->ecc_buf[i] ^= bch->ecc_buf2[i];
+				bch->ecc_buf[i] ^= bch->ecc_work[i];
 				sum |= bch->ecc_buf[i];
 			}
 			if (!sum)
@@ -1114,7 +1118,7 @@  static int build_deg2_base(struct bch_control *bch)
 {
 	const int m = GF_M(bch);
 	int i, j, r;
-	unsigned int sum, x, y, remaining, ak = 0, xi[m];
+	unsigned int sum, x, y, remaining, ak = 0, xi[BCH_MAX_M];
 
 	/* find k s.t. Tr(a^k) = 1 and 0 <= k < m */
 	for (i = 0; i < m; i++) {
@@ -1254,7 +1258,6 @@  struct bch_control *init_bch(int m, int t, unsigned int prim_poly)
 	struct bch_control *bch = NULL;
 
 	const int min_m = 5;
-	const int max_m = 15;
 
 	/* default primitive polynomials */
 	static const unsigned int prim_poly_tab[] = {
@@ -1270,7 +1273,7 @@  struct bch_control *init_bch(int m, int t, unsigned int prim_poly)
 		goto fail;
 	}
 #endif
-	if ((m < min_m) || (m > max_m))
+	if ((m < min_m) || (m > BCH_MAX_M))
 		/*
 		 * values of m greater than 15 are not currently supported;
 		 * supporting m > 15 would require changing table base type
@@ -1300,7 +1303,7 @@  struct bch_control *init_bch(int m, int t, unsigned int prim_poly)
 	bch->a_log_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_log_tab), &err);
 	bch->mod8_tab  = bch_alloc(words*1024*sizeof(*bch->mod8_tab), &err);
 	bch->ecc_buf   = bch_alloc(words*sizeof(*bch->ecc_buf), &err);
-	bch->ecc_buf2  = bch_alloc(words*sizeof(*bch->ecc_buf2), &err);
+	bch->ecc_work  = bch_alloc(words*sizeof(*bch->ecc_work), &err);
 	bch->xi_tab    = bch_alloc(m*sizeof(*bch->xi_tab), &err);
 	bch->syn       = bch_alloc(2*t*sizeof(*bch->syn), &err);
 	bch->cache     = bch_alloc(2*t*sizeof(*bch->cache), &err);
@@ -1349,7 +1352,7 @@  void free_bch(struct bch_control *bch)
 		kfree(bch->a_log_tab);
 		kfree(bch->mod8_tab);
 		kfree(bch->ecc_buf);
-		kfree(bch->ecc_buf2);
+		kfree(bch->ecc_work);
 		kfree(bch->xi_tab);
 		kfree(bch->syn);
 		kfree(bch->cache);