diff mbox

[PR,target/79170] fix memcmp builtin expansion sequence for rs6000 target.

Message ID 1485540665.5392.4.camel@linux.vnet.ibm.com
State New
Headers show

Commit Message

Aaron Sawdey Jan. 27, 2017, 6:11 p.m. UTC
This patch changes the sequence generated for the builtin expansion of
memcmp to one that is actually correct. The old sequence gave the wrong
result if the subf overflowed when subtracting two 64-bit chunks, or if
extsw overflowed after the subtraction of two 32-bit chunks. The
memcmp-1.c test case is also expanded so that it catches these issues.

The new sequence looks like this

ld      A
ld      B
subfc.  R, B, A
bne
subfe   C, C, C
popcntd R, R
or      R, R, C
extsw   R, R

where ld/ld/subfc./bne is the repeated sequence to compare 64-bit
chunks, and subfe/popcntd/or/extsw is the code used to convert a 32-bit 
or larger difference to a signed 32-bit result.

Additionally for power9 subfc. is replaced by cmpld and
subfe/popcntd/or/extsw is replaced by setb.

The updated memcmp-1 testcase passes on ppc64le (p8/p9), ppc64 (p8/p9),
 ppc32 (p8), and x86_64. Bootstrap was successful on ppc64/ppc64le.
Assuming regtest on ppc64/ppc64le passes, ok for trunk?

2017-01-27  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>

	PR target/79170
	* gcc.dg/memcmp-1.c: Improved to catch failures seen in PR 79170.

2017-01-27  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>

	PR target/79170
	* config/rs6000/altivec.md (*setb_internal): Rename to setb_signed.
	(setb_unsigned): New pattern for setb with CCUNS.
	* config/rs6000/rs6000.c (expand_block_compare): Use a different
	subfc./subfe sequence to avoid overflow problems. Generate a
	shorter sequence with cmpld/setb for power9.
	* config/rs6000/rs6000.md (subf<mode>3_carry_dot2): Add a new pattern
	for generating subfc. instruction.
	(cmpstrsi): Add TARGET_POPCNTD predicate as the generated sequence
	now uses this instruction.

Comments

Segher Boessenkool Jan. 27, 2017, 11:43 p.m. UTC | #1
On Fri, Jan 27, 2017 at 12:11:05PM -0600, Aaron Sawdey wrote:
> The updated memcmp-1 testcase passes on ppc64le (p8/p9), ppc64 (p8/p9),
>  ppc32 (p8), and x86_64. Bootstrap was successful on ppc64/ppc64le.
> Assuming regtest on ppc64/ppc64le passes, ok for trunk?

> +	     ldbrx 10,6,9
> +	     ldbrx 9,7,9
> +	     subf. 9,9,10
> +	     bne 0,.L8

subfc. 9,9,10

> +	     addi 9,4,7
> +	     lwbrx 10,0,9
> +	     addi 9,5,7
> +	     lwbrx 9,0,9

It would be nice if this was

	li 9,7
	lwbrx 10,9,4
	lwbrx 9,9,5

but that is a generic problem I bet.

> +	     subfc 9,9,10
> +	     b .L9
>       .L8: # convert_label
> -             cntlzd 9,9
> -             addi 9,9,-1
> -             xori 9,9,0x3f
> +	     subfe 10,10,10
> +	     popcntd 9,9
> +	     rldimi 9,10,6,0
>       .L9: # final_label

The code does not generate rldimi anymore, always just "or".

> +	 while maintaining <0 / ==0 / >0 properties. This sequence works:
> +	 subfc L,A,B
> +	 subfe H,H,H
> +	 popcntd L,L
> +	 rldimi L,H,6,0

"or" here, as well.

> --- gcc/config/rs6000/rs6000.md	(revision 244952)
> +++ gcc/config/rs6000/rs6000.md	(working copy)
> @@ -2068,6 +2068,35 @@
>    "subfic %0,%1,%2"
>    [(set_attr "type" "add")])
>  
> +(define_insn_and_split "subf<mode>3_carry_dot2"
> +  [(set (match_operand:CC 3 "cc_reg_operand" "=x,?y")
> +	(compare:CC (minus:P (match_operand:P 2 "gpc_reg_operand" "r,r")
> +			       (match_operand:P 1 "gpc_reg_operand" "r,r"))
> +		    (const_int 0)))
> +   (set (match_operand:P 0 "gpc_reg_operand" "=r,r")
> +	(minus:P (match_dup 2)
> +		   (match_dup 1)))
> +   (set (reg:P CA_REGNO)
> +	(leu:P (match_dup 1)
> +	       (match_dup 2)))]
> +  "<MODE>mode == Pmode"
> +  "@
> +   subfc. %0,%1,%2
> +   #"
> +  "&& reload_completed && cc_reg_not_cr0_operand (operands[3], CCmode)"

So far so good...

> +  [(set (reg:P CA_REGNO)
> +	(leu:P (match_dup 1)
> +	       (match_dup 2)))
> +   (set (match_dup 0)
> +	(minus:P (match_dup 2)
> +		   (match_dup 1)))
> +   (set (match_dup 3)
> +	(compare:CC (match_dup 0)
> +		    (const_int 0)))]

This needs a "parallel" around the two pieces that together are the subfc
instruction you split to.  They also need to be in the correct order.  So:

  [(parallel [(set (match_dup 0)
		   (minus:P (match_dup 2)
			    (match_dup 1)))
	      (set (reg:P CA_REGNO)
		   (leu:P (match_dup 1)
			  (match_dup 2)))])
   (set (match_dup 3)
	(compare:CC (match_dup 0)
		    (const_int 0)))]

The rest looks good.  With those fixes the patch is approved for trunk
(if all testing works out ;-) )

Thanks,


Segher
Peter Bergner Jan. 30, 2017, 4 p.m. UTC | #2
On 1/27/17 5:43 PM, Segher Boessenkool wrote:
> On Fri, Jan 27, 2017 at 12:11:05PM -0600, Aaron Sawdey wrote:
>> +	     addi 9,4,7
>> +	     lwbrx 10,0,9
>> +	     addi 9,5,7
>> +	     lwbrx 9,0,9
>
> It would be nice if this was
>
> 	li 9,7
> 	lwbrx 10,9,4
> 	lwbrx 9,9,5

Nicer still, we want the base address as the RA operand
and the offset as the RB operand, so like so:

	li 9,7
	lwbrx 10,4,9
	lwbrx 9,5,9

On some processors, this matters performance-wise.

Peter
diff mbox

Patch

Index: gcc/config/rs6000/altivec.md
===================================================================
--- gcc/config/rs6000/altivec.md	(revision 244952)
+++ gcc/config/rs6000/altivec.md	(working copy)
@@ -3838,7 +3838,7 @@ 
 ;; Otherwise, set operand 0 to 0.  Note that the result stored into
 ;; register operand 0 is non-zero iff either the LT or GT bits are on
 ;; within condition register operand 1.
-(define_insn "*setb_internal"
+(define_insn "setb_signed"
    [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
 	 (if_then_else:SI (lt (match_operand:CC 1 "cc_reg_operand" "y")
 			      (const_int 0))
@@ -3851,6 +3851,19 @@ 
   "setb %0,%1"
   [(set_attr "type" "logical")])
 
+(define_insn "setb_unsigned"
+   [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
+	 (if_then_else:SI (ltu (match_operand:CCUNS 1 "cc_reg_operand" "y")
+			      (const_int 0))
+			  (const_int -1)
+			  (if_then_else (gtu (match_dup 1)
+					    (const_int 0))
+					(const_int 1)
+					(const_int 0))))]
+  "TARGET_P9_MISC"
+  "setb %0,%1"
+  [(set_attr "type" "logical")])
+
 ;; Test byte within two ranges.
 ;;
 ;; The bytes of operand 1 are organized as xx:xx:xx:vv, where xx
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 244952)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -17292,7 +17292,7 @@ 
   TYPE_NAME (V16QI_type_node) = tdecl;
 
   tdecl = add_builtin_type ("__vector __bool char", bool_V16QI_type_node);
-  TYPE_NAME ( bool_V16QI_type_node) = tdecl;
+  TYPE_NAME (bool_V16QI_type_node) = tdecl;
 
   tdecl = add_builtin_type ("__vector unsigned short", unsigned_V8HI_type_node);
   TYPE_NAME (unsigned_V8HI_type_node) = tdecl;
@@ -19458,24 +19458,31 @@ 
   rtx src1 = orig_src1;
   rtx src2 = orig_src2;
 
-  /* If this is not a fixed size compare, just call memcmp */
+  /* This case is complicated to handle because the subtract
+     with carry instructions do not generate the 64-bit
+     carry and so we must emit code to calculate it ourselves.
+     We choose not to implement this yet.  */
+  if (TARGET_32BIT && TARGET_POWERPC64)
+    return false;
+
+  /* If this is not a fixed size compare, just call memcmp.  */
   if (!CONST_INT_P (bytes_rtx))
     return false;
 
-  /* This must be a fixed size alignment */
+  /* This must be a fixed size alignment.  */
   if (!CONST_INT_P (align_rtx))
     return false;
 
   unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
 
-  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff */
+  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff.  */
   if (SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src1))
       || SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src2)))
     return false;
 
   gcc_assert (GET_MODE (target) == SImode);
 
-  /* Anything to move? */
+  /* Anything to move?  */
   unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
   if (bytes == 0)
     return true;
@@ -19490,6 +19497,13 @@ 
 
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
+  /* P7/P8 code sets cond with subfc. (CCmode); P9 code sets it
+     with cmpld, which needs CCUNSmode.  */
+  rtx cond;
+  if (TARGET_P9_MISC)
+    cond = gen_reg_rtx (CCUNSmode);
+  else
+    cond = gen_reg_rtx (CCmode);
 
   /* If we have an LE target without ldbrx and word_mode is DImode,
      then we must avoid using word_mode.  */
@@ -19514,20 +19528,20 @@ 
 
   /* Example of generated code for 11 bytes aligned 1 byte:
      .L10:
-             ldbrx 10,6,9
-             ldbrx 9,7,9
-             subf. 9,9,10
-             bne 0,.L8
-             addi 9,4,7
-             lwbrx 10,0,9
-             addi 9,5,7
-             lwbrx 9,0,9
-             subf 9,9,10
-             b .L9
+	     ldbrx 10,6,9
+	     ldbrx 9,7,9
+	     subfc. 9,9,10
+	     bne 0,.L8
+	     addi 9,4,7
+	     lwbrx 10,0,9
+	     addi 9,5,7
+	     lwbrx 9,0,9
+	     subfc 9,9,10
+	     b .L9
      .L8: # convert_label
-             cntlzd 9,9
-             addi 9,9,-1
-             xori 9,9,0x3f
+	     subfe 10,10,10
+	     popcntd 9,9
+	     or 9,9,10
      .L9: # final_label
 
      We start off with DImode and have a compare/branch to something
@@ -19600,26 +19614,18 @@ 
 	    }
 	}
 
-      /* We previously did a block that need 64->32 conversion but
-	 the current block does not, so a label is needed to jump
-	 to the end.  */
-      if (generate_6432_conversion && !final_label
-	  && GET_MODE_SIZE (GET_MODE (target)) >= load_mode_size)
-	final_label = gen_label_rtx ();
-
-      /* Do we need a 64->32 conversion block?  */
       int remain = bytes - cmp_bytes;
-      if (GET_MODE_SIZE (GET_MODE (target)) < GET_MODE_SIZE (load_mode))
+      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
 	{
-	  generate_6432_conversion = true;
-	  if (remain > 0 && !convert_label)
-	    convert_label = gen_label_rtx ();
-	}
-
-      if (GET_MODE_SIZE (GET_MODE (target)) >= GET_MODE_SIZE (load_mode))
-	{
 	  /* Target is larger than load size so we don't need to
 	     reduce result size.  */
+
+	  /* We previously did a block that needs 64->32 conversion but
+	     the current block does not, so a label is needed to jump
+	     to the end.  */
+	  if (generate_6432_conversion && !final_label)
+	    final_label = gen_label_rtx ();
+
 	  if (remain > 0)
 	    {
 	      /* This is not the last block, branch to the end if the result
@@ -19627,11 +19633,12 @@ 
 	      if (!final_label)
 		final_label = gen_label_rtx ();
 	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
-	      rtx cond = gen_reg_rtx (CCmode);
 	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
-	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cond);
-	      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
-	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+	      rtx cr = gen_reg_rtx (CCmode);
+	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
+	      emit_insn (gen_movsi (target,
+				    gen_lowpart (SImode, tmp_reg_src2)));
+	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
 	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
 						 fin_ref, pc_rtx);
 	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
@@ -19662,7 +19669,11 @@ 
 	}
       else
 	{
+	  /* Do we need a 64->32 conversion block? We need the 64->32
+	     conversion even if target size == load_mode size because
+	     the subtract generates one extra bit.  */
 	  generate_6432_conversion = true;
+
 	  if (remain > 0)
 	    {
 	      if (!convert_label)
@@ -19670,9 +19681,22 @@ 
 
 	      /* Compare to zero and branch to convert_label if not zero.  */
 	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
-	      rtx cond = gen_reg_rtx (CCmode);
-	      rtx tmp = gen_rtx_MINUS (DImode, tmp_reg_src1, tmp_reg_src2);
-	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cond);
+	      if (TARGET_P9_MISC)
+		{
+		/* Generate a compare, and convert with a setb later.  */
+		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
+					     tmp_reg_src2);
+		  emit_insn (gen_rtx_SET (cond, cmp));
+		}
+	      else
+		/* Generate a subfc. and use the longer
+		   sequence for conversion.  */
+		if (TARGET_64BIT)
+		  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
+						     tmp_reg_src1, cond));
+		else
+		  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
+						     tmp_reg_src1, cond));
 	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
 	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
 						 cvt_ref, pc_rtx);
@@ -19682,10 +19706,21 @@ 
 	    }
 	  else
 	    {
-	      /* Just do the subtract.  Since this is the last block the
-		 convert code will be generated immediately following.  */
-	      emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
-				     tmp_reg_src2));
+	      /* Just do the subtract/compare.  Since this is the last block
+		 the convert code will be generated immediately following.  */
+	      if (TARGET_P9_MISC)
+		{
+		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
+					     tmp_reg_src2);
+		  emit_insn (gen_rtx_SET (cond, cmp));
+		}
+	      else
+		if (TARGET_64BIT)
+		  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
+						tmp_reg_src1));
+		else
+		  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
+						tmp_reg_src1));
 	    }
 	}
 
@@ -19699,12 +19734,46 @@ 
 	emit_label (convert_label);
 
       /* We need to produce DI result from sub, then convert to target SI
-	 while maintaining <0 / ==0 / >0 properties.
-	 Segher's sequence: cntlzd 3,3 ; addi 3,3,-1 ; xori 3,3,63 */
-      emit_insn (gen_clzdi2 (tmp_reg_src2, tmp_reg_src2));
-      emit_insn (gen_adddi3 (tmp_reg_src2, tmp_reg_src2, GEN_INT (-1)));
-      emit_insn (gen_xordi3 (tmp_reg_src2, tmp_reg_src2, GEN_INT (63)));
-      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
+	 while maintaining <0 / ==0 / >0 properties. This sequence works:
+	 subfc L,A,B
+	 subfe H,H,H
+	 popcntd L,L
+	 or L,L,H
+
+	 This is an alternate one Segher cooked up if somebody
+	 wants to expand this for something that doesn't have popcntd:
+	 subfc L,a,b
+	 subfe H,x,x
+	 addic t,L,-1
+	 subfe v,t,L
+	 or z,v,H
+
+	 And finally, p9 can just do this:
+	 cmpld A,B
+	 setb r */
+
+      if (TARGET_P9_MISC)
+	{
+	  emit_insn (gen_setb_unsigned (target, cond));
+	}
+      else
+	{
+	  if (TARGET_64BIT)
+	    {
+	      rtx tmp_reg_ca = gen_reg_rtx (DImode);
+	      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
+	      emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
+	      emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
+	      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
+	    }
+	  else
+	    {
+	      rtx tmp_reg_ca = gen_reg_rtx (SImode);
+	      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
+	      emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
+	      emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
+	    }
+	}
     }
 
   if (final_label)
@@ -21246,7 +21315,7 @@ 
       regno = true_regnum (reg);
       if (regno < 0 || regno >= FIRST_PSEUDO_REGISTER)
 	return PSEUDO_REG_TYPE;
-    }	
+    }
 
   gcc_assert (regno >= 0);
 
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md	(revision 244952)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -2068,6 +2068,35 @@ 
   "subfic %0,%1,%2"
   [(set_attr "type" "add")])
 
+(define_insn_and_split "subf<mode>3_carry_dot2"  ;; subfc.: op0 = op2 - op1
+  [(set (match_operand:CC 3 "cc_reg_operand" "=x,?y")  ;; op3: difference vs 0
+	(compare:CC (minus:P (match_operand:P 2 "gpc_reg_operand" "r,r")
+			       (match_operand:P 1 "gpc_reg_operand" "r,r"))
+		    (const_int 0)))
+   (set (match_operand:P 0 "gpc_reg_operand" "=r,r")  ;; op0: the difference
+	(minus:P (match_dup 2)
+		   (match_dup 1)))
+   (set (reg:P CA_REGNO)  ;; CA: set when op1 <= op2 (unsigned)
+	(leu:P (match_dup 1)
+	       (match_dup 2)))]
+  "<MODE>mode == Pmode"
+  "@
+   subfc. %0,%1,%2
+   #"
+  "&& reload_completed && cc_reg_not_cr0_operand (operands[3], CCmode)"
+  [(parallel [(set (match_dup 0)  ;; split part 1: subfc (result and CA together)
+		   (minus:P (match_dup 2)
+			    (match_dup 1)))
+	      (set (reg:P CA_REGNO)
+		   (leu:P (match_dup 1)
+			  (match_dup 2)))])
+   (set (match_dup 3)  ;; split part 2: separate compare of the result into op3
+	(compare:CC (match_dup 0)
+		    (const_int 0)))]
+  ""
+  [(set_attr "type" "add")
+   (set_attr "dot" "yes")
+   (set_attr "length" "4,8")])
 
 (define_insn "subf<mode>3_carry"
   [(set (match_operand:P 0 "gpc_reg_operand" "=r")
@@ -9146,11 +9175,11 @@ 
                            (match_operand:BLK 2)))
 	      (use (match_operand:SI 3))
 	      (use (match_operand:SI 4))])]
-  ""
+  "TARGET_POPCNTD"
 {
   if (expand_block_compare (operands))
     DONE;
-  else	
+  else
     FAIL;
 })
 
Index: gcc/testsuite/gcc.dg/memcmp-1.c
===================================================================
--- gcc/testsuite/gcc.dg/memcmp-1.c	(revision 244952)
+++ gcc/testsuite/gcc.dg/memcmp-1.c	(working copy)
@@ -1,59 +1,125 @@ 
-/* Test memcmp builtin expansion for compilation and proper execution.  */
+/* Test memcmp/strncmp builtin expansion for compilation and proper execution.  */
 /* { dg-do run } */
 /* { dg-options "-O2" } */
 /* { dg-require-effective-target ptr32plus } */
 
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <stdlib.h>
+#include <stdint.h>
 
-#define RUN_TEST(SZ, ALIGN) test_memcmp_ ## SZ ## _ ## ALIGN ()
+int lib_memcmp(const void *a, const void *b, size_t n) asm("memcmp");
+int lib_strncmp(const char *a, const char *b, size_t n) asm("strncmp");
 
-#define DEF_TEST(SZ, ALIGN)                                                \
-static void test_memcmp_ ## SZ ## _ ## ALIGN (void) {                      \
-  char one[3 * (SZ > 10 ? SZ : 10)];  				   	   \
-  char two[3 * (SZ > 10 ? SZ : 10)];				   	   \
-  int i,j;								   \
-  for (i = 0 ; i < SZ ; i++)			   		           \
-    {							   		   \
-      int r1;								   \
-      char *a = one + (i & 1) * ALIGN;			   		   \
-      char *b = two + (i & 1) * ALIGN;			   		   \
-      memset (a, '-', SZ);					   	   \
-      memset (b, '-', SZ);					   	   \
-      a[i] = '1';					   		   \
-      b[i] = '2';					   		   \
-      a[SZ] = 0;							   \
-      b[SZ] = 0;					   		   \
-      if (!((r1 = memcmp (b, a, SZ)) > 0))   		   		   \
-        {								   \
-	  abort ();							   \
-	}								   \
-      if (!((r1 = memcmp (a, b, SZ)) < 0))			   	   \
-        {								   \
-	  abort ();							   \
-	}                            					   \
-      b[i] = '1';					   		   \
-      if (!((r1 = memcmp (a, b, SZ)) == 0))		   		   \
-        {								   \
-	  abort ();							   \
-	}                            					   \
-      for(j = i; j < SZ ; j++)			   		           \
-	{						   		   \
-	  a[j] = '1';            			   		   \
-	  b[j] = '2';			                   		   \
-	}						   		   \
-      if (!((r1 = memcmp (b, a, SZ)) > 0))			   	   \
-        {								   \
-	  abort ();							   \
-	}             							   \
-      if (!((r1 = memcmp (a, b, SZ)) < 0))			   	   \
-        {								   \
-	  abort ();							   \
-	}	           						   \
-    }							                   \
-}                                                                
+#ifndef NRAND
+#define NRAND 10000
+#endif
+#define MAX_SZ 200
 
+static void test_driver_memcmp (void (test_memcmp)(const char *, const char *, int),
+				void (test_strncmp)(const char *, const char *, int),
+				size_t sz, int align)
+{
+  char buf1[MAX_SZ*2+10],buf2[MAX_SZ*2+10];
+  size_t test_sz = (sz<MAX_SZ)?sz:MAX_SZ;
+  size_t diff_pos, zero_pos;
+  uint32_t e;
+  int i,j,l;
+  for(l=0;l<sz;l++) {
+    for(i=0;i<NRAND/sz;i++) {
+      for(j=0;j<l;j++) {
+	buf1[j] = random() & 0xff;
+	buf2[j] = buf1[j];
+      }
+      for(j=l;j<sz;j++) {
+	buf1[j] = random() & 0xff;
+	buf2[j] = random() & 0xff;
+      }
+    }
+    e = lib_memcmp(buf1,buf2,sz);
+    (*test_memcmp)(buf1,buf2,e);
+    e = lib_strncmp(buf1,buf2,sz);
+    (*test_strncmp)(buf1,buf2,e);
+  }
+  for(diff_pos = ((test_sz>10)?(test_sz-10):0); diff_pos < test_sz+10; diff_pos++)
+    for(zero_pos = ((test_sz>10)?(test_sz-10):0); zero_pos < test_sz+10; zero_pos++)
+      {
+	memset(buf1, 'A', 2*test_sz);
+	memset(buf2, 'A', 2*test_sz);
+	buf2[diff_pos] = 'B';
+	buf1[zero_pos] = 0;
+	buf2[zero_pos] = 0;
+	e = lib_memcmp(buf1,buf2,sz);
+	(*test_memcmp)(buf1,buf2,e);
+	(*test_memcmp)(buf2,buf1,-e);
+	(*test_memcmp)(buf2,buf2,0);
+	e = lib_strncmp(buf1,buf2,sz);
+	(*test_strncmp)(buf1,buf2,e);
+	(*test_strncmp)(buf2,buf1,-e);
+	(*test_strncmp)(buf2,buf2,0);
+	/* differing length: */
+	buf2[diff_pos] = 0;
+	e = lib_memcmp(buf1,buf2,sz);
+	(*test_memcmp)(buf1,buf2,e);
+	e = lib_strncmp(buf1,buf2,sz);
+	(*test_strncmp)(buf1,buf2,e);
+	memset(buf2+diff_pos,'B',sizeof(buf2)-diff_pos);
+	buf2[zero_pos] = 0;
+	e = lib_memcmp(buf1,buf2,sz);
+	(*test_memcmp)(buf1,buf2,e);
+	(*test_memcmp)(buf2,buf1,-e);
+	e = lib_strncmp(buf1,buf2,sz);
+	(*test_strncmp)(buf1,buf2,e);
+	(*test_strncmp)(buf2,buf1,-e);
+      }
+}
+
+#define RUN_TEST(SZ, ALIGN) test_driver_memcmp (test_memcmp_ ## SZ ## _ ## ALIGN, test_strncmp_ ## SZ ## _ ## ALIGN, SZ, ALIGN);
+
+#define DEF_TEST(SZ, ALIGN)						\
+  static void test_memcmp_ ## SZ ## _ ## ALIGN (const char *str1, const char *str2, int expect)	\
+{									\
+  char three[8192] __attribute__ ((aligned (4096)));			\
+  char four[8192] __attribute__ ((aligned (4096)));			\
+  char *a, *b;								\
+  int i,j,r;								\
+  for (j = 0; j < 2; j++)						\
+    {									\
+      for (i = 0; i < 2; i++)						\
+	{								\
+	  a = three+i*ALIGN+j*(4096-2*i*ALIGN);				\
+	  b = four+i*ALIGN+j*(4096-2*i*ALIGN);				\
+	  memcpy(a,str1,SZ);						\
+	  memcpy(b,str2,SZ);						\
+	  r = memcmp(a,b,SZ);						\
+	  if ( r < 0 && !(expect < 0) ) abort();			\
+	  if ( r > 0 && !(expect > 0) )	abort();			\
+	  if ( r == 0 && !(expect == 0) ) abort();			\
+	}								\
+    }									\
+}									\
+static void test_strncmp_ ## SZ ## _ ## ALIGN (const char *str1, const char *str2, int expect)	 \
+{									\
+  char three[8192] __attribute__ ((aligned (4096)));			\
+  char four[8192] __attribute__ ((aligned (4096)));			\
+  char *a, *b;								\
+  int i,j,r;								\
+  for (j = 0; j < 2; j++)						\
+    {									\
+      for (i = 0; i < 2; i++)						\
+	{								\
+	  a = three+i*ALIGN+j*(4096-2*i*ALIGN);				\
+	  b = four+i*ALIGN+j*(4096-2*i*ALIGN);				\
+	  strcpy(a,str1);						\
+	  strcpy(b,str2);						\
+	  r = strncmp(a,b,SZ);						\
+	  if ( r < 0 && !(expect < 0) ) abort();			\
+	  if ( r > 0 && !(expect > 0) )	abort();			\
+	  if ( r == 0 && !(expect == 0) ) abort();			\
+	}								\
+    }									\
+}
+
 #ifdef TEST_ALL
 DEF_TEST(1,1)
 DEF_TEST(1,2)
@@ -300,305 +366,302 @@ 
 DEF_TEST(49,4)
 DEF_TEST(49,8)
 DEF_TEST(49,16)
+DEF_TEST(100,1)
+DEF_TEST(100,2)
+DEF_TEST(100,4)
+DEF_TEST(100,8)
+DEF_TEST(100,16)
 #else
 DEF_TEST(3,1)
 DEF_TEST(4,1)
-DEF_TEST(4,2)
-DEF_TEST(4,4)
 DEF_TEST(5,1)
+DEF_TEST(5,8)
 DEF_TEST(6,1)
+DEF_TEST(6,4)
+DEF_TEST(6,8)
 DEF_TEST(7,1)
+DEF_TEST(7,2)
+DEF_TEST(7,4)
+DEF_TEST(7,8)
 DEF_TEST(8,1)
-DEF_TEST(8,2)
-DEF_TEST(8,4)
-DEF_TEST(8,8)
 DEF_TEST(9,1)
 DEF_TEST(16,1)
-DEF_TEST(16,2)
-DEF_TEST(16,4)
-DEF_TEST(16,8)
-DEF_TEST(16,16)
 DEF_TEST(32,1)
-DEF_TEST(32,2)
-DEF_TEST(32,4)
-DEF_TEST(32,8)
-DEF_TEST(32,16)
+DEF_TEST(100,1)
+DEF_TEST(100,8)
 #endif
 
 int
 main(int argc, char **argv)
 {
-
 #ifdef TEST_ALL
-  RUN_TEST(1,1);
-  RUN_TEST(1,2);
-  RUN_TEST(1,4);
-  RUN_TEST(1,8);
-  RUN_TEST(1,16);
-  RUN_TEST(2,1);
-  RUN_TEST(2,2);
-  RUN_TEST(2,4);
-  RUN_TEST(2,8);
-  RUN_TEST(2,16);
-  RUN_TEST(3,1);
-  RUN_TEST(3,2);
-  RUN_TEST(3,4);
-  RUN_TEST(3,8);
-  RUN_TEST(3,16);
-  RUN_TEST(4,1);
-  RUN_TEST(4,2);
-  RUN_TEST(4,4);
-  RUN_TEST(4,8);
-  RUN_TEST(4,16);
-  RUN_TEST(5,1);
-  RUN_TEST(5,2);
-  RUN_TEST(5,4);
-  RUN_TEST(5,8);
-  RUN_TEST(5,16);
-  RUN_TEST(6,1);
-  RUN_TEST(6,2);
-  RUN_TEST(6,4);
-  RUN_TEST(6,8);
-  RUN_TEST(6,16);
-  RUN_TEST(7,1);
-  RUN_TEST(7,2);
-  RUN_TEST(7,4);
-  RUN_TEST(7,8);
-  RUN_TEST(7,16);
-  RUN_TEST(8,1);
-  RUN_TEST(8,2);
-  RUN_TEST(8,4);
-  RUN_TEST(8,8);
-  RUN_TEST(8,16);
-  RUN_TEST(9,1);
-  RUN_TEST(9,2);
-  RUN_TEST(9,4);
-  RUN_TEST(9,8);
-  RUN_TEST(9,16);
-  RUN_TEST(10,1);
-  RUN_TEST(10,2);
-  RUN_TEST(10,4);
-  RUN_TEST(10,8);
-  RUN_TEST(10,16);
-  RUN_TEST(11,1);
-  RUN_TEST(11,2);
-  RUN_TEST(11,4);
-  RUN_TEST(11,8);
-  RUN_TEST(11,16);
-  RUN_TEST(12,1);
-  RUN_TEST(12,2);
-  RUN_TEST(12,4);
-  RUN_TEST(12,8);
-  RUN_TEST(12,16);
-  RUN_TEST(13,1);
-  RUN_TEST(13,2);
-  RUN_TEST(13,4);
-  RUN_TEST(13,8);
-  RUN_TEST(13,16);
-  RUN_TEST(14,1);
-  RUN_TEST(14,2);
-  RUN_TEST(14,4);
-  RUN_TEST(14,8);
-  RUN_TEST(14,16);
-  RUN_TEST(15,1);
-  RUN_TEST(15,2);
-  RUN_TEST(15,4);
-  RUN_TEST(15,8);
-  RUN_TEST(15,16);
-  RUN_TEST(16,1);
-  RUN_TEST(16,2);
-  RUN_TEST(16,4);
-  RUN_TEST(16,8);
-  RUN_TEST(16,16);
-  RUN_TEST(17,1);
-  RUN_TEST(17,2);
-  RUN_TEST(17,4);
-  RUN_TEST(17,8);
-  RUN_TEST(17,16);
-  RUN_TEST(18,1);
-  RUN_TEST(18,2);
-  RUN_TEST(18,4);
-  RUN_TEST(18,8);
-  RUN_TEST(18,16);
-  RUN_TEST(19,1);
-  RUN_TEST(19,2);
-  RUN_TEST(19,4);
-  RUN_TEST(19,8);
-  RUN_TEST(19,16);
-  RUN_TEST(20,1);
-  RUN_TEST(20,2);
-  RUN_TEST(20,4);
-  RUN_TEST(20,8);
-  RUN_TEST(20,16);
-  RUN_TEST(21,1);
-  RUN_TEST(21,2);
-  RUN_TEST(21,4);
-  RUN_TEST(21,8);
-  RUN_TEST(21,16);
-  RUN_TEST(22,1);
-  RUN_TEST(22,2);
-  RUN_TEST(22,4);
-  RUN_TEST(22,8);
-  RUN_TEST(22,16);
-  RUN_TEST(23,1);
-  RUN_TEST(23,2);
-  RUN_TEST(23,4);
-  RUN_TEST(23,8);
-  RUN_TEST(23,16);
-  RUN_TEST(24,1);
-  RUN_TEST(24,2);
-  RUN_TEST(24,4);
-  RUN_TEST(24,8);
-  RUN_TEST(24,16);
-  RUN_TEST(25,1);
-  RUN_TEST(25,2);
-  RUN_TEST(25,4);
-  RUN_TEST(25,8);
-  RUN_TEST(25,16);
-  RUN_TEST(26,1);
-  RUN_TEST(26,2);
-  RUN_TEST(26,4);
-  RUN_TEST(26,8);
-  RUN_TEST(26,16);
-  RUN_TEST(27,1);
-  RUN_TEST(27,2);
-  RUN_TEST(27,4);
-  RUN_TEST(27,8);
-  RUN_TEST(27,16);
-  RUN_TEST(28,1);
-  RUN_TEST(28,2);
-  RUN_TEST(28,4);
-  RUN_TEST(28,8);
-  RUN_TEST(28,16);
-  RUN_TEST(29,1);
-  RUN_TEST(29,2);
-  RUN_TEST(29,4);
-  RUN_TEST(29,8);
-  RUN_TEST(29,16);
-  RUN_TEST(30,1);
-  RUN_TEST(30,2);
-  RUN_TEST(30,4);
-  RUN_TEST(30,8);
-  RUN_TEST(30,16);
-  RUN_TEST(31,1);
-  RUN_TEST(31,2);
-  RUN_TEST(31,4);
-  RUN_TEST(31,8);
-  RUN_TEST(31,16);
-  RUN_TEST(32,1);
-  RUN_TEST(32,2);
-  RUN_TEST(32,4);
-  RUN_TEST(32,8);
-  RUN_TEST(32,16);
-  RUN_TEST(33,1);
-  RUN_TEST(33,2);
-  RUN_TEST(33,4);
-  RUN_TEST(33,8);
-  RUN_TEST(33,16);
-  RUN_TEST(34,1);
-  RUN_TEST(34,2);
-  RUN_TEST(34,4);
-  RUN_TEST(34,8);
-  RUN_TEST(34,16);
-  RUN_TEST(35,1);
-  RUN_TEST(35,2);
-  RUN_TEST(35,4);
-  RUN_TEST(35,8);
-  RUN_TEST(35,16);
-  RUN_TEST(36,1);
-  RUN_TEST(36,2);
-  RUN_TEST(36,4);
-  RUN_TEST(36,8);
-  RUN_TEST(36,16);
-  RUN_TEST(37,1);
-  RUN_TEST(37,2);
-  RUN_TEST(37,4);
-  RUN_TEST(37,8);
-  RUN_TEST(37,16);
-  RUN_TEST(38,1);
-  RUN_TEST(38,2);
-  RUN_TEST(38,4);
-  RUN_TEST(38,8);
-  RUN_TEST(38,16);
-  RUN_TEST(39,1);
-  RUN_TEST(39,2);
-  RUN_TEST(39,4);
-  RUN_TEST(39,8);
-  RUN_TEST(39,16);
-  RUN_TEST(40,1);
-  RUN_TEST(40,2);
-  RUN_TEST(40,4);
-  RUN_TEST(40,8);
-  RUN_TEST(40,16);
-  RUN_TEST(41,1);
-  RUN_TEST(41,2);
-  RUN_TEST(41,4);
-  RUN_TEST(41,8);
-  RUN_TEST(41,16);
-  RUN_TEST(42,1);
-  RUN_TEST(42,2);
-  RUN_TEST(42,4);
-  RUN_TEST(42,8);
-  RUN_TEST(42,16);
-  RUN_TEST(43,1);
-  RUN_TEST(43,2);
-  RUN_TEST(43,4);
-  RUN_TEST(43,8);
-  RUN_TEST(43,16);
-  RUN_TEST(44,1);
-  RUN_TEST(44,2);
-  RUN_TEST(44,4);
-  RUN_TEST(44,8);
-  RUN_TEST(44,16);
-  RUN_TEST(45,1);
-  RUN_TEST(45,2);
-  RUN_TEST(45,4);
-  RUN_TEST(45,8);
-  RUN_TEST(45,16);
-  RUN_TEST(46,1);
-  RUN_TEST(46,2);
-  RUN_TEST(46,4);
-  RUN_TEST(46,8);
-  RUN_TEST(46,16);
-  RUN_TEST(47,1);
-  RUN_TEST(47,2);
-  RUN_TEST(47,4);
-  RUN_TEST(47,8);
-  RUN_TEST(47,16);
-  RUN_TEST(48,1);
-  RUN_TEST(48,2);
-  RUN_TEST(48,4);
-  RUN_TEST(48,8);
-  RUN_TEST(48,16);
-  RUN_TEST(49,1);
-  RUN_TEST(49,2);
-  RUN_TEST(49,4);
-  RUN_TEST(49,8);
-  RUN_TEST(49,16);
+  RUN_TEST(1,1)
+    RUN_TEST(1,2)
+    RUN_TEST(1,4)
+    RUN_TEST(1,8)
+    RUN_TEST(1,16)
+    RUN_TEST(2,1)
+    RUN_TEST(2,2)
+    RUN_TEST(2,4)
+    RUN_TEST(2,8)
+    RUN_TEST(2,16)
+    RUN_TEST(3,1)
+    RUN_TEST(3,2)
+    RUN_TEST(3,4)
+    RUN_TEST(3,8)
+    RUN_TEST(3,16)
+    RUN_TEST(4,1)
+    RUN_TEST(4,2)
+    RUN_TEST(4,4)
+    RUN_TEST(4,8)
+    RUN_TEST(4,16)
+    RUN_TEST(5,1)
+    RUN_TEST(5,2)
+    RUN_TEST(5,4)
+    RUN_TEST(5,8)
+    RUN_TEST(5,16)
+    RUN_TEST(6,1)
+    RUN_TEST(6,2)
+    RUN_TEST(6,4)
+    RUN_TEST(6,8)
+    RUN_TEST(6,16)
+    RUN_TEST(7,1)
+    RUN_TEST(7,2)
+    RUN_TEST(7,4)
+    RUN_TEST(7,8)
+    RUN_TEST(7,16)
+    RUN_TEST(8,1)
+    RUN_TEST(8,2)
+    RUN_TEST(8,4)
+    RUN_TEST(8,8)
+    RUN_TEST(8,16)
+    RUN_TEST(9,1)
+    RUN_TEST(9,2)
+    RUN_TEST(9,4)
+    RUN_TEST(9,8)
+    RUN_TEST(9,16)
+    RUN_TEST(10,1)
+    RUN_TEST(10,2)
+    RUN_TEST(10,4)
+    RUN_TEST(10,8)
+    RUN_TEST(10,16)
+    RUN_TEST(11,1)
+    RUN_TEST(11,2)
+    RUN_TEST(11,4)
+    RUN_TEST(11,8)
+    RUN_TEST(11,16)
+    RUN_TEST(12,1)
+    RUN_TEST(12,2)
+    RUN_TEST(12,4)
+    RUN_TEST(12,8)
+    RUN_TEST(12,16)
+    RUN_TEST(13,1)
+    RUN_TEST(13,2)
+    RUN_TEST(13,4)
+    RUN_TEST(13,8)
+    RUN_TEST(13,16)
+    RUN_TEST(14,1)
+    RUN_TEST(14,2)
+    RUN_TEST(14,4)
+    RUN_TEST(14,8)
+    RUN_TEST(14,16)
+    RUN_TEST(15,1)
+    RUN_TEST(15,2)
+    RUN_TEST(15,4)
+    RUN_TEST(15,8)
+    RUN_TEST(15,16)
+    RUN_TEST(16,1)
+    RUN_TEST(16,2)
+    RUN_TEST(16,4)
+    RUN_TEST(16,8)
+    RUN_TEST(16,16)
+    RUN_TEST(17,1)
+    RUN_TEST(17,2)
+    RUN_TEST(17,4)
+    RUN_TEST(17,8)
+    RUN_TEST(17,16)
+    RUN_TEST(18,1)
+    RUN_TEST(18,2)
+    RUN_TEST(18,4)
+    RUN_TEST(18,8)
+    RUN_TEST(18,16)
+    RUN_TEST(19,1)
+    RUN_TEST(19,2)
+    RUN_TEST(19,4)
+    RUN_TEST(19,8)
+    RUN_TEST(19,16)
+    RUN_TEST(20,1)
+    RUN_TEST(20,2)
+    RUN_TEST(20,4)
+    RUN_TEST(20,8)
+    RUN_TEST(20,16)
+    RUN_TEST(21,1)
+    RUN_TEST(21,2)
+    RUN_TEST(21,4)
+    RUN_TEST(21,8)
+    RUN_TEST(21,16)
+    RUN_TEST(22,1)
+    RUN_TEST(22,2)
+    RUN_TEST(22,4)
+    RUN_TEST(22,8)
+    RUN_TEST(22,16)
+    RUN_TEST(23,1)
+    RUN_TEST(23,2)
+    RUN_TEST(23,4)
+    RUN_TEST(23,8)
+    RUN_TEST(23,16)
+    RUN_TEST(24,1)
+    RUN_TEST(24,2)
+    RUN_TEST(24,4)
+    RUN_TEST(24,8)
+    RUN_TEST(24,16)
+    RUN_TEST(25,1)
+    RUN_TEST(25,2)
+    RUN_TEST(25,4)
+    RUN_TEST(25,8)
+    RUN_TEST(25,16)
+    RUN_TEST(26,1)
+    RUN_TEST(26,2)
+    RUN_TEST(26,4)
+    RUN_TEST(26,8)
+    RUN_TEST(26,16)
+    RUN_TEST(27,1)
+    RUN_TEST(27,2)
+    RUN_TEST(27,4)
+    RUN_TEST(27,8)
+    RUN_TEST(27,16)
+    RUN_TEST(28,1)
+    RUN_TEST(28,2)
+    RUN_TEST(28,4)
+    RUN_TEST(28,8)
+    RUN_TEST(28,16)
+    RUN_TEST(29,1)
+    RUN_TEST(29,2)
+    RUN_TEST(29,4)
+    RUN_TEST(29,8)
+    RUN_TEST(29,16)
+    RUN_TEST(30,1)
+    RUN_TEST(30,2)
+    RUN_TEST(30,4)
+    RUN_TEST(30,8)
+    RUN_TEST(30,16)
+    RUN_TEST(31,1)
+    RUN_TEST(31,2)
+    RUN_TEST(31,4)
+    RUN_TEST(31,8)
+    RUN_TEST(31,16)
+    RUN_TEST(32,1)
+    RUN_TEST(32,2)
+    RUN_TEST(32,4)
+    RUN_TEST(32,8)
+    RUN_TEST(32,16)
+    RUN_TEST(33,1)
+    RUN_TEST(33,2)
+    RUN_TEST(33,4)
+    RUN_TEST(33,8)
+    RUN_TEST(33,16)
+    RUN_TEST(34,1)
+    RUN_TEST(34,2)
+    RUN_TEST(34,4)
+    RUN_TEST(34,8)
+    RUN_TEST(34,16)
+    RUN_TEST(35,1)
+    RUN_TEST(35,2)
+    RUN_TEST(35,4)
+    RUN_TEST(35,8)
+    RUN_TEST(35,16)
+    RUN_TEST(36,1)
+    RUN_TEST(36,2)
+    RUN_TEST(36,4)
+    RUN_TEST(36,8)
+    RUN_TEST(36,16)
+    RUN_TEST(37,1)
+    RUN_TEST(37,2)
+    RUN_TEST(37,4)
+    RUN_TEST(37,8)
+    RUN_TEST(37,16)
+    RUN_TEST(38,1)
+    RUN_TEST(38,2)
+    RUN_TEST(38,4)
+    RUN_TEST(38,8)
+    RUN_TEST(38,16)
+    RUN_TEST(39,1)
+    RUN_TEST(39,2)
+    RUN_TEST(39,4)
+    RUN_TEST(39,8)
+    RUN_TEST(39,16)
+    RUN_TEST(40,1)
+    RUN_TEST(40,2)
+    RUN_TEST(40,4)
+    RUN_TEST(40,8)
+    RUN_TEST(40,16)
+    RUN_TEST(41,1)
+    RUN_TEST(41,2)
+    RUN_TEST(41,4)
+    RUN_TEST(41,8)
+    RUN_TEST(41,16)
+    RUN_TEST(42,1)
+    RUN_TEST(42,2)
+    RUN_TEST(42,4)
+    RUN_TEST(42,8)
+    RUN_TEST(42,16)
+    RUN_TEST(43,1)
+    RUN_TEST(43,2)
+    RUN_TEST(43,4)
+    RUN_TEST(43,8)
+    RUN_TEST(43,16)
+    RUN_TEST(44,1)
+    RUN_TEST(44,2)
+    RUN_TEST(44,4)
+    RUN_TEST(44,8)
+    RUN_TEST(44,16)
+    RUN_TEST(45,1)
+    RUN_TEST(45,2)
+    RUN_TEST(45,4)
+    RUN_TEST(45,8)
+    RUN_TEST(45,16)
+    RUN_TEST(46,1)
+    RUN_TEST(46,2)
+    RUN_TEST(46,4)
+    RUN_TEST(46,8)
+    RUN_TEST(46,16)
+    RUN_TEST(47,1)
+    RUN_TEST(47,2)
+    RUN_TEST(47,4)
+    RUN_TEST(47,8)
+    RUN_TEST(47,16)
+    RUN_TEST(48,1)
+    RUN_TEST(48,2)
+    RUN_TEST(48,4)
+    RUN_TEST(48,8)
+    RUN_TEST(48,16)
+    RUN_TEST(49,1)
+    RUN_TEST(49,2)
+    RUN_TEST(49,4)
+    RUN_TEST(49,8)
+    RUN_TEST(49,16)
+    RUN_TEST(100,1)
+    RUN_TEST(100,2)
+    RUN_TEST(100,4)
+    RUN_TEST(100,8)
+    RUN_TEST(100,16)
 #else
-  RUN_TEST(3,1);
-  RUN_TEST(4,1);
-  RUN_TEST(4,2);
-  RUN_TEST(4,4);
-  RUN_TEST(5,1);
-  RUN_TEST(6,1);
-  RUN_TEST(7,1);
-  RUN_TEST(8,1);
-  RUN_TEST(8,2);
-  RUN_TEST(8,4);
-  RUN_TEST(8,8);
-  RUN_TEST(9,1);
-  RUN_TEST(16,1);
-  RUN_TEST(16,2);
-  RUN_TEST(16,4);
-  RUN_TEST(16,8);
-  RUN_TEST(16,16);
-  RUN_TEST(32,1);
-  RUN_TEST(32,2);
-  RUN_TEST(32,4);
-  RUN_TEST(32,8);
-  RUN_TEST(32,16);
+    RUN_TEST(3,1)
+    RUN_TEST(4,1)
+    RUN_TEST(5,1)
+    RUN_TEST(5,8)
+    RUN_TEST(6,1)
+    RUN_TEST(6,4)
+    RUN_TEST(6,8)
+    RUN_TEST(7,1)
+    RUN_TEST(7,2)
+    RUN_TEST(7,4)
+    RUN_TEST(7,8)
+    RUN_TEST(8,1)
+    RUN_TEST(9,1)
+    RUN_TEST(16,1)
+    RUN_TEST(32,1)
+    RUN_TEST(100,1)
+    RUN_TEST(100,8)
 #endif
-
-  return 0;
 }