diff mbox series

[1/2] rs6000: Support _mm_insert_epi{8,32,64}

Message ID 20200923221245.243397-1-pc@us.ibm.com
State New
Headers show
Series [1/2] rs6000: Support _mm_insert_epi{8,32,64} | expand

Commit Message

Paul A. Clarke Sept. 23, 2020, 10:12 p.m. UTC
Add compatibility implementations for SSE4.1 intrinsics
_mm_insert_epi8, _mm_insert_epi32, _mm_insert_epi64.

2020-09-23  Paul A. Clarke  <pc@us.ibm.com>

gcc/
	* config/rs6000/smmintrin.h (_mm_insert_epi8): New.
	(_mm_insert_epi32): New.
	(_mm_insert_epi64): New.
---
 gcc/config/rs6000/smmintrin.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

Comments

Segher Boessenkool Sept. 24, 2020, 11:22 p.m. UTC | #1
Hi!

On Wed, Sep 23, 2020 at 05:12:44PM -0500, Paul A. Clarke wrote:
> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
> +{
> +  __v16qi result = (__v16qi)__A;
> +
> +  result [(__N & 0b1111)] = __D;

Hrm, GCC supports binary constants like this since 2007, so okay.  But I
have to wonder if this improves anything over hex (or decimal even!)
The parens are superfluous (and only hinder legibility), fwiw.

> +_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N)
> +{
> +  __v2di result = (__v2di)__A;
> +
> +  result [(__N & 0b1)] = __D;

Especially single-digit numbers look really goofy (like 0x0, but even
worse for binary somehow).

Anyway, okay for trunk, with or without those things improved.  Thanks!


Segher
Paul A. Clarke Sept. 25, 2020, 2:07 p.m. UTC | #2
On Thu, Sep 24, 2020 at 06:22:10PM -0500, Segher Boessenkool wrote:
> On Wed, Sep 23, 2020 at 05:12:44PM -0500, Paul A. Clarke wrote:
> > +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
> > +{
> > +  __v16qi result = (__v16qi)__A;
> > +
> > +  result [(__N & 0b1111)] = __D;
> 
> Hrm, GCC supports binary constants like this since 2007, so okay.  But I
> have to wonder if this improves anything over hex (or decimal even!)
> The parens are superfluous (and only hinder legibility), fwiw.
> 
> > +_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N)
> > +{
> > +  __v2di result = (__v2di)__A;
> > +
> > +  result [(__N & 0b1)] = __D;
> 
> Especially single-digit numbers look really goofy (like 0x0, but even
> worse for binary somehow).
> 
> Anyway, okay for trunk, with or without those things improved.  Thanks!

I was trying to obviously and consistently convey the sizes of the masks,
but I really want to convey _why_ there are masks, so let me try a
different approach, below.

--

Add compatibility implementations for SSE4.1 intrinsics
_mm_insert_epi8, _mm_insert_epi32, _mm_insert_epi64.

2020-09-25  Paul A. Clarke  <pc@us.ibm.com>

gcc/
	* config/rs6000/smmintrin.h (_mm_insert_epi8): New.
	(_mm_insert_epi32): New.
	(_mm_insert_epi64): New.
---
 gcc/config/rs6000/smmintrin.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index d78ddba99d9..8128c417978 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -42,6 +42,36 @@
 #include <altivec.h>
 #include <tmmintrin.h>
 
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
+{
+  __v16qi result = (__v16qi)__A;
+
+  result [__N % (sizeof result / sizeof result[0])] = __D;
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi32 (__m128i const __A, int const __D, int const __N)
+{
+  __v4si result = (__v4si)__A;
+
+  result [__N % (sizeof result / sizeof result[0])] = __D;
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N)
+{
+  __v2di result = (__v2di)__A;
+
+  result [__N % (sizeof result / sizeof result[0])] = __D;
+
+  return (__m128i) result;
+}
+
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_extract_epi8 (__m128i __X, const int __N)
 {
Peter Bergner Sept. 25, 2020, 3:19 p.m. UTC | #3
On 9/24/20 6:22 PM, Segher Boessenkool wrote:
>> +  result [(__N & 0b1111)] = __D;
> 
> Hrm, GCC supports binary constants like this since 2007, so okay.  But I
> have to wonder if this improves anything over hex (or decimal even!)
> The parens are superfluous (and only hinder legibility), fwiw.

+1 for using hex constants when using them with logical ops like '&'.

Peter
Segher Boessenkool Sept. 25, 2020, 11:22 p.m. UTC | #4
Hi!

On Fri, Sep 25, 2020 at 09:07:46AM -0500, Paul A. Clarke wrote:
> On Thu, Sep 24, 2020 at 06:22:10PM -0500, Segher Boessenkool wrote:
> > > +  result [(__N & 0b1111)] = __D;
> > 
> > Hrm, GCC supports binary constants like this since 2007, so okay.  But I
> > have to wonder if this improves anything over hex (or decimal even!)
> > The parens are superfluous (and only hinder legibility), fwiw.

> > > +  result [(__N & 0b1)] = __D;
> > 
> > Especially single-digit numbers look really goofy (like 0x0, but even
> > worse for binary somehow).
> > 
> > Anyway, okay for trunk, with or without those things improved.  Thanks!
> 
> I was trying to obviously and consistently convey the sizes of the masks,
> but I really want to convey _why_ there are masks, so let me try a
> different approach, below.

> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
> +{
> +  __v16qi result = (__v16qi)__A;
> +
> +  result [__N % (sizeof result / sizeof result[0])] = __D;
> +
> +  return (__m128i) result;
> +}

I don't think this helps explain things, sorry.  Just add a comment
if you want to explain things?  Simpler and works perfectly always.

To read these files I always open the x86 ISA docs anyway; I think
everyone will have to anyway, so you do not have to explain all details
you emulate, only the very tricky ones, or implementation choices, that
kind of thing.


Segher
diff mbox series

Patch

diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index d78ddba99d9..92100ccd674 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -42,6 +42,36 @@ 
 #include <altivec.h>
 #include <tmmintrin.h>
 
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
+{
+  __v16qi result = (__v16qi)__A;
+
+  result [(__N & 0b1111)] = __D;
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi32 (__m128i const __A, int const __D, int const __N)
+{
+  __v4si result = (__v4si)__A;
+
+  result [(__N & 0b11)] = __D;
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N)
+{
+  __v2di result = (__v2di)__A;
+
+  result [(__N & 0b1)] = __D;
+
+  return (__m128i) result;
+}
+
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_extract_epi8 (__m128i __X, const int __N)
 {