@@ -4845,6 +4845,57 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
}
)
+;; div optimizations using narrowings
+;; we can do the division of e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub component:
+;;
+;; short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │ │ │
+;; └──────────────┴────────────────┘
+;; 8-bit part1 ▲ 8-bit part2 ▲
+;; │ │
+;; │ │
+;; +1 +1
+;;
+;; after the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte. Remember that the addition must be done in
+;; double the precision of the input. Since 8 is half the size of a short
+;; we can use a narrowing halving instruction in AArch64, addhn, which also
+;; does the addition in a wider precision and narrows back to a byte. The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result. i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic, we emit an ushr by 8
+;; to perform the shift.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "udiv_pow2_bitmask<mode>2"
+ [(match_operand:VQN 0 "register_operand")
+ (match_operand:VQN 1 "register_operand")]
+ "TARGET_SIMD"
+{
+ rtx addend = gen_reg_rtx (<MODE>mode); /* Wide-mode view of a vector of 1s per narrow lane (i.e. 0x01...01 per wide lane).  */
+ rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+ emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+ rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+ rtx tmp2 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend)); /* tmp1 = narrow ((x + addend) >> esize).  */
+ unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode); /* esize of the narrow element, e.g. 8 for bytes.  */
+ rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+ emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1)); /* tmp2 = x + widen (tmp1).  */
+ emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector)); /* Final logical >> esize.  */
+ DONE;
+})
+
;; pmul.
(define_insn "aarch64_pmul<mode>"
new file mode 100644
@@ -0,0 +1,70 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99 -fdump-tree-vect -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+** umull2 v[0-9]+.8h, v[0-9]+.16b, v[0-9]+.16b
+** umull v[0-9]+.8h, v[0-9]+.8b, v[0-9]+.8b
+** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** uzp2 v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+ for (int i = 0; i < (n & -16); i+=1) /* n & -16: trip count a multiple of 16 so the loop vectorizes cleanly.  */
+ pixel[i] = (pixel[i] * level) / 0xff; /* Divisor 0xff == 2^8 - 1: candidate for .DIV_POW2_BITMASK.  */
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+ for (int i = 0; i < (n & -16); i+=1)
+ pixel[i] = (pixel[i] * level) / 0xfe; /* NOTE(review): 0xfe is not 2^n - 1; presumably a negative case (no body check above) — confirm against the dump count.  */
+}
+
+/*
+** draw_bitmap3:
+** ...
+** umull2 v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+** umull v[0-9]+.4s, v[0-9]+.4h, v[0-9]+.4h
+** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** uzp2 v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+ for (int i = 0; i < (n & -16); i+=1)
+ pixel[i] = (pixel[i] * level) / 0xffffU; /* Divisor 0xffff == 2^16 - 1: candidate for .DIV_POW2_BITMASK.  */
+}
+
+/*
+** draw_bitmap4:
+** ...
+** umull2 v[0-9]+.2d, v[0-9]+.4s, v[0-9]+.4s
+** umull v[0-9]+.2d, v[0-9]+.2s, v[0-9]+.2s
+** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** uzp2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+/* Costing for long vectorization seems off, so disable
+ the cost model to test the codegen. */
+__attribute__ ((optimize("-fno-vect-cost-model")))
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+ for (int i = 0; i < (n & -16); i+=1)
+ pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; /* Divisor 2^32 - 1; uint64_t cast forces the double-width multiply.  */
+}
+
+/* { dg-final { scan-tree-dump-times "\.DIV_POW2_BITMASK" 6 "vect" } } */