Message ID | 20210716135022.489455-2-pc@us.ibm.com |
---|---|
State | New |
Headers | show |
Series | rs6000: Add SSE4.1 "blend", "ceil", "floor" | expand |
Hi Paul, Thanks! LGTM. Recommend that maintainers approve. Bill On 7/16/21 8:50 AM, Paul A. Clarke wrote: > _mm_blend_epi16 and _mm_blendv_epi8 were added earlier. > Add these four to complete the set. > > 2021-07-16 Paul A. Clarke <pc@us.ibm.com> > > gcc > * config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd, > _mm_blend_ps, _mm_blendv_ps): New. > --- > v2: > - Per review from Bill, rewrote _mm_blend_pd and _mm_blendv_pd to use > vec_perm instead of gather/unpack/select. > > gcc/config/rs6000/smmintrin.h | 60 +++++++++++++++++++++++++++++++++++ > 1 file changed, 60 insertions(+) > > diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h > index 6a010fdbb96f..69e54702a877 100644 > --- a/gcc/config/rs6000/smmintrin.h > +++ b/gcc/config/rs6000/smmintrin.h > @@ -116,6 +116,66 @@ _mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask) > return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask); > } > > +__inline __m128d > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8) > +{ > + __v16qu __pcv[] = > + { > + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, > + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, > + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, > + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 } > + }; > + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); > + return (__m128d) __r; > +} > + > +__inline __m128d > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask) > +{ > + const __v2di __zero = {0}; > + const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero); > + return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask); > +} > + > +__inline __m128 > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8) > +{ > + __v16qu __pcv[] = > + { > + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, > + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, > + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, > + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, > + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, > + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, > + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, > + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, > + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, > + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, > + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, > + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, > + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, > + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, > + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, > + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, > + }; > + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); > + return (__m128) __r; > +} > + > +__inline __m128 > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask) > +{ > + const __v4si __zero = {0}; > + const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero); > + return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask); > +} > + > __inline int > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > _mm_testz_si128 (__m128i __A, __m128i __B)
Hi! On Fri, Jul 16, 2021 at 08:50:17AM -0500, Paul A. Clarke wrote: > _mm_blend_epi16 and _mm_blendv_epi8 were added earlier. > Add these four to complete the set. > > 2021-07-16 Paul A. Clarke <pc@us.ibm.com> > > gcc > * config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd, > _mm_blend_ps, _mm_blendv_ps): New. I'm not sure if this is allowed like this in changelogs? In either case it is more obvious / aesthetically pleasing / etc. to write "gcc/". But also, it is fine to leave out this one, it being the default :-) The patch is fiune for trunk. Thank you! Segher
diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index 6a010fdbb96f..69e54702a877 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -116,6 +116,66 @@ _mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask) return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask); } +__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 } + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128d) __r; +} + +__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask) +{ + const __v2di __zero = {0}; + const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero); + return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask); +} + +__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128) __r; +} + +__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask) +{ + const __v4si __zero = {0}; + const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero); + return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask); +} + __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_testz_si128 (__m128i __A, __m128i __B)