
[AArch64] Add vec_shr pattern for 64-bit vectors using ush{l,r}; enable tests.

Message ID: 546623BD.6020009@arm.com
State: New

Commit Message

Alan Lawrence Nov. 14, 2014, 3:46 p.m. UTC
...Patch attached...

Alan Lawrence wrote:
> Following recent vectorizer changes to reductions via shifts, AArch64 will now 
> reduce loops such as this
> 
> unsigned char in[8] = {1, 3, 5, 7, 9, 11, 13, 15};
> 
> int
> main (unsigned char argc, char **argv)
> {
>    unsigned char prod = 1;
> 
>    /* Prevent constant propagation of the entire loop below.  */
>    asm volatile ("" : : : "memory");
> 
>    for (unsigned char i = 0; i < 8; i++)
>      prod *= in[i];
> 
>    if (prod != 17)
>        __builtin_printf("Failed %d\n", prod);
> 
>    return 0;
> }
> 
> using an 'ext' instruction from aarch64_expand_vec_perm_const:
> 
> main:
>          adrp    x0, .LANCHOR0
>          movi    v2.2s, 0    <=== note reg used here
>          ldr     d1, [x0, #:lo12:.LANCHOR0]
>          ext     v0.8b, v1.8b, v2.8b, #4
>          mul     v1.8b, v1.8b, v0.8b
>          ext     v0.8b, v1.8b, v2.8b, #2
>          mul     v0.8b, v1.8b, v0.8b
>          ext     v2.8b, v0.8b, v2.8b, #1
>          mul     v0.8b, v0.8b, v2.8b
>          umov    w1, v0.b[0]
> 
> The 'ext' works for both 64-bit and 128-bit vectors; but for 64-bit
> vectors, we can do slightly better using ushr; this patch improves the above to:
> 
> main:
>          adrp    x0, .LANCHOR0
>          ldr     d0, [x0, #:lo12:.LANCHOR0]
>          ushr d1, d0, 32
>          mul     v0.8b, v0.8b, v1.8b
>          ushr d1, d0, 16
>          mul     v0.8b, v0.8b, v1.8b
>          ushr d1, d0, 8
>          mul     v0.8b, v0.8b, v1.8b
>          umov    w1, v0.b[0]
> 	...
> 
> Tested with bootstrap + check-gcc on aarch64-none-linux-gnu.
> Cross-testing of check-gcc on aarch64_be-none-elf in progress.
> 
> Ok if no regressions on big-endian?
> 
> Cheers,
> --Alan
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-simd.md (vec_shr<mode>): New.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* lib/target-supports.exp
> 	(check_effective_target_whole_vector_shift): Add aarch64{,_be}.
> 
> 
>
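
As a rough illustration of the shift-based reduction described above (a minimal sketch, not part of the patch, written with ACLE NEON intrinsics from <arm_neon.h> and assuming a little-endian target), the same 8-byte product can be computed by repeatedly shifting the whole 64-bit vector right and multiplying:

#include <arm_neon.h>
#include <stdio.h>

static const uint8_t in[8] = {1, 3, 5, 7, 9, 11, 13, 15};

int
main (void)
{
  uint8x8_t v = vld1_u8 (in);

  /* Shift the whole 64-bit vector right by 32, 16 and 8 bits, multiplying
     as we go; each step folds the upper half of the live lanes into the
     lower half.  */
  v = vmul_u8 (v, vreinterpret_u8_u64 (vshr_n_u64 (vreinterpret_u64_u8 (v), 32)));
  v = vmul_u8 (v, vreinterpret_u8_u64 (vshr_n_u64 (vreinterpret_u64_u8 (v), 16)));
  v = vmul_u8 (v, vreinterpret_u8_u64 (vshr_n_u64 (vreinterpret_u64_u8 (v), 8)));

  /* Lane 0 now holds the product of all eight bytes (mod 256), i.e. 17.  */
  printf ("%d\n", vget_lane_u8 (v, 0));
  return 0;
}

Three shift/multiply steps collapse the eight lanes into lane 0, which is exactly the ushr/mul sequence the new pattern lets the compiler emit.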

Comments

Alan Lawrence Nov. 17, 2014, 12:04 p.m. UTC | #1
I confirm no regressions on aarch64_be-none-elf.

--Alan

Alan Lawrence wrote:
> ...Patch attached...
Marcus Shawcroft Nov. 21, 2014, 4:07 p.m. UTC | #2
On 14 November 2014 15:46, Alan Lawrence <alan.lawrence@arm.com> wrote:

>> gcc/ChangeLog:
>>
>>         * config/aarch64/aarch64-simd.md (vec_shr<mode>): New.
>>
>> gcc/testsuite/ChangeLog:
>>
>>         * lib/target-supports.exp
>>         (check_effective_target_whole_vector_shift): Add aarch64{,_be}.

OK /Marcus

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ef196e4b6fb39c0d2fd9ebfee76abab8369b1e92..397cb5186dd4ff000307f3b14bb4964d84c79469 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -779,6 +779,21 @@ 
   }
 )
 
+;; For 64-bit modes we use ushl/r, as this does not require a SIMD zero.
+(define_insn "vec_shr_<mode>"
+  [(set (match_operand:VD 0 "register_operand" "=w")
+        (lshiftrt:VD (match_operand:VD 1 "register_operand" "w")
+		     (match_operand:SI 2 "immediate_operand" "i")))]
+  "TARGET_SIMD"
+  {
+    if (BYTES_BIG_ENDIAN)
+      return "ushl %d0, %d1, %2";
+    else
+      return "ushr %d0, %d1, %2";
+  }
+  [(set_attr "type" "neon_shift_imm")]
+)
+
 (define_insn "aarch64_simd_vec_setv2di"
   [(set (match_operand:V2DI 0 "register_operand" "=w,w")
         (vec_merge:V2DI
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 3361c2f9e8d98c5d1cc194617db6281127db2277..464c910777a53867110b462f121c02525d8dd140 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3335,6 +3335,7 @@  proc check_effective_target_vect_shift { } {
 proc check_effective_target_whole_vector_shift { } {
     if { [istarget i?86-*-*] || [istarget x86_64-*-*]
 	 || [istarget ia64-*-*]
+	 || [istarget aarch64*-*-*]
 	 || ([check_effective_target_arm32]
 	     && [check_effective_target_arm_little_endian])
 	 || ([istarget mips*-*-*]
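
With this change the whole_vector_shift selector is true for aarch64*-*-* (both endiannesses), so vectorizer dump scans for shift-based reductions run on AArch64. A hypothetical dg-final directive (illustrative only, not taken from the patch; it assumes a gcc.dg/vect test that compiles the mul-reduction loop from the cover letter with -fdump-tree-vect-details) would be gated like this:

/* { dg-final { scan-tree-dump "Reduce using vector shifts" "vect" { target whole_vector_shift } } } */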