
[RFC,i386] : Use STV pass to load/store any TImode constant using SSE insns

Message ID CAFULd4afCkFamkaWA3F_30AaW9ZerBWKj90F2AR_z4wTjNGcNA@mail.gmail.com
State New

Commit Message

Uros Bizjak April 27, 2016, 7:58 p.m. UTC
Hello!

This RFC patch illustrates the idea of using the STV pass to load/store
any TImode constant using SSE insns. The testcase:

--cut here--
__int128 x;

__int128 test_1 (void)
{
  x = (__int128) 0x00112233;
}

__int128 test_2 (void)
{
  x = ((__int128) 0x0011223344556677 << 64);
}

__int128 test_3 (void)
{
  x = ((__int128) 0x0011223344556677 << 64) + (__int128) 0x0011223344556677;
}
--cut here--

currently compiles (-O2) on x86_64 to:

test_1:
        movq    $1122867, x(%rip)
        movq    $0, x+8(%rip)
        ret

test_2:
        xorl    %eax, %eax
        movabsq $4822678189205111, %rdx
        movq    %rax, x(%rip)
        movq    %rdx, x+8(%rip)
        ret

test_3:
        movabsq $4822678189205111, %rax
        movabsq $4822678189205111, %rdx
        movq    %rax, x(%rip)
        movq    %rdx, x+8(%rip)
        ret

However, using the attached patch, we compile all tests to:

test:
        movdqa  .LC0(%rip), %xmm0
        movaps  %xmm0, x(%rip)
        ret
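
For test_3, the .LC0 constant pool entry referenced above would hold the
full 128-bit value; a rough sketch of the expected assembler data (an
illustration, not taken from an actual build) is:

        .align 16
.LC0:
        .quad   4822678189205111
        .quad   4822678189205111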

Ilya, HJ - do you think the new sequences are better, or - as suggested by
Jakub - are they beneficial together with the STV pass, since we are now
able to load any immediate value? A variant of this patch could also be
used to load DImode values in the 32-bit STV pass.

Uros.

Comments

H.J. Lu April 27, 2016, 8:35 p.m. UTC | #1
On Wed, Apr 27, 2016 at 12:58 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> Hello!
>
> This RFC patch illustrates the idea of using STV pass to load/store
> any TImode constant using SSE insns. The testcase:
>
> --cut here--
> __int128 x;
>
> __int128 test_1 (void)
> {
>   x = (__int128) 0x00112233;
> }
>
> __int128 test_2 (void)
> {
>   x = ((__int128) 0x0011223344556677 << 64);
> }
>
> __int128 test_3 (void)
> {
>   x = ((__int128) 0x0011223344556677 << 64) + (__int128) 0x0011223344556677;
> }
> --cut here--
>
> currently compiles (-O2) on x86_64 to:
>
> test_1:
>         movq    $1122867, x(%rip)
>         movq    $0, x+8(%rip)
>         ret
>
> test_2:
>         xorl    %eax, %eax
>         movabsq $4822678189205111, %rdx
>         movq    %rax, x(%rip)
>         movq    %rdx, x+8(%rip)
>         ret
>
> test_3:
>         movabsq $4822678189205111, %rax
>         movabsq $4822678189205111, %rdx
>         movq    %rax, x(%rip)
>         movq    %rdx, x+8(%rip)
>         ret
>
> However, using the attached patch, we compile all tests to:
>
> test:
>         movdqa  .LC0(%rip), %xmm0
>         movaps  %xmm0, x(%rip)
>         ret
>
> Ilya, HJ - do you think new sequences are better, or - as suggested by
> Jakub - they are beneficial with STV pass, as we are now able to load

I like it.  It is on my todo list :-).

> any immediate value? A variant of this patch can also be used to load
> DImode values to 32bit STV pass.
>

Yes:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70763
Ilya Enkovich April 28, 2016, 10:36 a.m. UTC | #2
2016-04-27 22:58 GMT+03:00 Uros Bizjak <ubizjak@gmail.com>:
> Hello!
>
> This RFC patch illustrates the idea of using STV pass to load/store
> any TImode constant using SSE insns. The testcase:
>
> --cut here--
> __int128 x;
>
> __int128 test_1 (void)
> {
>   x = (__int128) 0x00112233;
> }
>
> __int128 test_2 (void)
> {
>   x = ((__int128) 0x0011223344556677 << 64);
> }
>
> __int128 test_3 (void)
> {
>   x = ((__int128) 0x0011223344556677 << 64) + (__int128) 0x0011223344556677;
> }
> --cut here--
>
> currently compiles (-O2) on x86_64 to:
>
> test_1:
>         movq    $1122867, x(%rip)
>         movq    $0, x+8(%rip)
>         ret
>
> test_2:
>         xorl    %eax, %eax
>         movabsq $4822678189205111, %rdx
>         movq    %rax, x(%rip)
>         movq    %rdx, x+8(%rip)
>         ret
>
> test_3:
>         movabsq $4822678189205111, %rax
>         movabsq $4822678189205111, %rdx
>         movq    %rax, x(%rip)
>         movq    %rdx, x+8(%rip)
>         ret
>
> However, using the attached patch, we compile all tests to:
>
> test:
>         movdqa  .LC0(%rip), %xmm0
>         movaps  %xmm0, x(%rip)
>         ret
>
> Ilya, HJ - do you think new sequences are better, or - as suggested by
> Jakub - they are beneficial with STV pass, as we are now able to load
> any immediate value? A variant of this patch can also be used to load
> DImode values to 32bit STV pass.
>
> Uros.

Hi,

Why don't we have two movq instructions in all three cases now?  Is it
because of the late split?

I wouldn't say an SSE load+store is always better than two movq instructions,
but it obviously can enable bigger chains for STV, which is good.  I think you
should adjust the cost model to handle immediates so it can make a proper
decision.

That's what I have in my draft for DImode immediates:

@@ -3114,6 +3123,20 @@ scalar_chain::build (bitmap candidates,
unsigned insn_uid)
   BITMAP_FREE (queue);
 }

+/* Return the cost of building a vector constant
+   instead of using a scalar one.  */
+
+int
+scalar_chain::vector_const_cost (rtx exp)
+{
+  gcc_assert (CONST_INT_P (exp));
+
+  if (const0_operand (exp, GET_MODE (exp))
+      || constm1_operand (exp, GET_MODE (exp)))
+    return COSTS_N_INSNS (1);
+  return ix86_cost->sse_load[1];
+}
+
 /* Compute a gain for chain conversion.  */

 int
@@ -3145,11 +3168,25 @@ scalar_chain::compute_convert_gain ()
               || GET_CODE (src) == IOR
               || GET_CODE (src) == XOR
               || GET_CODE (src) == AND)
-       gain += ix86_cost->add;
+       {
+         gain += ix86_cost->add;
+         if (CONST_INT_P (XEXP (src, 0)))
+           gain -= scalar_chain::vector_const_cost (XEXP (src, 0));
+         if (CONST_INT_P (XEXP (src, 1)))
+           gain -= scalar_chain::vector_const_cost (XEXP (src, 1));
+       }
       else if (GET_CODE (src) == COMPARE)
        {
          /* Assume comparison cost is the same.  */
        }
+      else if (GET_CODE (src) == CONST_INT)
+       {
+         if (REG_P (dst))
+           gain += COSTS_N_INSNS (2);
+         else if (MEM_P (dst))
+           gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
+         gain -= scalar_chain::vector_const_cost (src);
+       }
       else
        gcc_unreachable ();
Jakub Jelinek April 28, 2016, 10:41 a.m. UTC | #3
On Thu, Apr 28, 2016 at 01:36:30PM +0300, Ilya Enkovich wrote:
> @@ -3145,11 +3168,25 @@ scalar_chain::compute_convert_gain ()
>                || GET_CODE (src) == IOR
>                || GET_CODE (src) == XOR
>                || GET_CODE (src) == AND)
> -       gain += ix86_cost->add;
> +       {
> +         gain += ix86_cost->add;
> +         if (CONST_INT_P (XEXP (src, 0)))
> +           gain -= scalar_chain::vector_const_cost (XEXP (src, 0));
> +         if (CONST_INT_P (XEXP (src, 1)))
> +           gain -= scalar_chain::vector_const_cost (XEXP (src, 1));
> +       }
>        else if (GET_CODE (src) == COMPARE)
>         {
>           /* Assume comparison cost is the same.  */
>         }
> +      else if (GET_CODE (src) == CONST_INT)
> +       {
> +         if (REG_P (dst))
> +           gain += COSTS_N_INSNS (2);
> +         else if (MEM_P (dst))
> +           gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
> +         gain -= scalar_chain::vector_const_cost (src);
> +       }
>        else
>         gcc_unreachable ();

Where does the 2 come from?  Is it that the STV pass right now supports only
2 * wordsize modes?  Also, I don't think we should treat constants that fit
into 32-bit immediates and constants that don't equally; the latter, where
movabsq needs to be used, are more costly.
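
For illustration, such a distinction could be expressed roughly as follows
(a sketch only; scalar_const_cost is a hypothetical helper that is not part
of either draft, and the concrete costs are placeholders):

int
scalar_chain::scalar_const_cost (rtx exp)
{
  gcc_assert (CONST_INT_P (exp));

  /* Constants that fit a sign-extended 32-bit immediate can be used
     directly in the instruction.  */
  if (x86_64_immediate_operand (exp, DImode))
    return COSTS_N_INSNS (1);

  /* Otherwise a movabsq into a temporary register is needed first,
     and movabsq itself has a longer encoding and higher latency.  */
  return COSTS_N_INSNS (3);
}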

	Jakub
Uros Bizjak April 28, 2016, 10:43 a.m. UTC | #4
On Thu, Apr 28, 2016 at 12:36 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
> 2016-04-27 22:58 GMT+03:00 Uros Bizjak <ubizjak@gmail.com>:
>> Hello!
>>
>> This RFC patch illustrates the idea of using STV pass to load/store
>> any TImode constant using SSE insns. The testcase:
>>
>> --cut here--
>> __int128 x;
>>
>> __int128 test_1 (void)
>> {
>>   x = (__int128) 0x00112233;
>> }
>>
>> __int128 test_2 (void)
>> {
>>   x = ((__int128) 0x0011223344556677 << 64);
>> }
>>
>> __int128 test_3 (void)
>> {
>>   x = ((__int128) 0x0011223344556677 << 64) + (__int128) 0x0011223344556677;
>> }
>> --cut here--
>>
>> currently compiles (-O2) on x86_64 to:
>>
>> test_1:
>>         movq    $1122867, x(%rip)
>>         movq    $0, x+8(%rip)
>>         ret
>>
>> test_2:
>>         xorl    %eax, %eax
>>         movabsq $4822678189205111, %rdx
>>         movq    %rax, x(%rip)
>>         movq    %rdx, x+8(%rip)
>>         ret
>>
>> test_3:
>>         movabsq $4822678189205111, %rax
>>         movabsq $4822678189205111, %rdx
>>         movq    %rax, x(%rip)
>>         movq    %rdx, x+8(%rip)
>>         ret
>>
>> However, using the attached patch, we compile all tests to:
>>
>> test:
>>         movdqa  .LC0(%rip), %xmm0
>>         movaps  %xmm0, x(%rip)
>>         ret
>>
>> Ilya, HJ - do you think new sequences are better, or - as suggested by
>> Jakub - they are beneficial with STV pass, as we are now able to load
>> any immediate value? A variant of this patch can also be used to load
>> DImode values to 32bit STV pass.
>>
>> Uros.
>
> Hi,
>
> Why don't we have two movq instructions in all three cases now?  Is it
> because of late split?

movq can handle only 32-bit sign-extended immediates. There is actually
room for improvement in test_2, where we could directly store 0 to
x(%rip).
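
For test_2 that would mean something like the following (a sketch of the
improved scalar sequence, not output from an actual compiler run):

test_2:
        movq    $0, x(%rip)
        movabsq $4822678189205111, %rax
        movq    %rax, x+8(%rip)
        ret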

Uros.

> I wouldn't say SSE load+store is always better than two movq instructions.
> But it obviously can enable bigger chains for STV which is good.  I think
> you should adjust a cost model to handle immediates for proper decision.
>
> That's what I have in my draft for DImode immediates:
>
> @@ -3114,6 +3123,20 @@ scalar_chain::build (bitmap candidates,
> unsigned insn_uid)
>    BITMAP_FREE (queue);
>  }
>
> +/* Return a cost of building a vector costant
> +   instead of using a scalar one.  */
> +
> +int
> +scalar_chain::vector_const_cost (rtx exp)
> +{
> +  gcc_assert (CONST_INT_P (exp));
> +
> +  if (const0_operand (exp, GET_MODE (exp))
> +      || constm1_operand (exp, GET_MODE (exp)))
> +    return COSTS_N_INSNS (1);
> +  return ix86_cost->sse_load[1];
> +}
> +
>  /* Compute a gain for chain conversion.  */
>
>  int
> @@ -3145,11 +3168,25 @@ scalar_chain::compute_convert_gain ()
>                || GET_CODE (src) == IOR
>                || GET_CODE (src) == XOR
>                || GET_CODE (src) == AND)
> -       gain += ix86_cost->add;
> +       {
> +         gain += ix86_cost->add;
> +         if (CONST_INT_P (XEXP (src, 0)))
> +           gain -= scalar_chain::vector_const_cost (XEXP (src, 0));
> +         if (CONST_INT_P (XEXP (src, 1)))
> +           gain -= scalar_chain::vector_const_cost (XEXP (src, 1));
> +       }
>        else if (GET_CODE (src) == COMPARE)
>         {
>           /* Assume comparison cost is the same.  */
>         }
> +      else if (GET_CODE (src) == CONST_INT)
> +       {
> +         if (REG_P (dst))
> +           gain += COSTS_N_INSNS (2);
> +         else if (MEM_P (dst))
> +           gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
> +         gain -= scalar_chain::vector_const_cost (src);
> +       }
>        else
>         gcc_unreachable ();
Ilya Enkovich April 28, 2016, 10:45 a.m. UTC | #5
2016-04-28 13:41 GMT+03:00 Jakub Jelinek <jakub@redhat.com>:
> On Thu, Apr 28, 2016 at 01:36:30PM +0300, Ilya Enkovich wrote:
>> @@ -3145,11 +3168,25 @@ scalar_chain::compute_convert_gain ()
>>                || GET_CODE (src) == IOR
>>                || GET_CODE (src) == XOR
>>                || GET_CODE (src) == AND)
>> -       gain += ix86_cost->add;
>> +       {
>> +         gain += ix86_cost->add;
>> +         if (CONST_INT_P (XEXP (src, 0)))
>> +           gain -= scalar_chain::vector_const_cost (XEXP (src, 0));
>> +         if (CONST_INT_P (XEXP (src, 1)))
>> +           gain -= scalar_chain::vector_const_cost (XEXP (src, 1));
>> +       }
>>        else if (GET_CODE (src) == COMPARE)
>>         {
>>           /* Assume comparison cost is the same.  */
>>         }
>> +      else if (GET_CODE (src) == CONST_INT)
>> +       {
>> +         if (REG_P (dst))
>> +           gain += COSTS_N_INSNS (2);
>> +         else if (MEM_P (dst))
>> +           gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
>> +         gain -= scalar_chain::vector_const_cost (src);
>> +       }
>>        else
>>         gcc_unreachable ();
>
> Where does the 2 come from?  Is it that the STV pass right now supports only
> 2 * wordsize modes?  Also, I don't think we should treat equally constants
> that fit into the 32-bit immediates and constants that don't, the latter,
> when movabsq needs to be used, are more costly.

This variant is for DImode chains that are going to be split into two SImode
halves (hence the factor of 2).  TImode chains have their own cost model.

Thanks,
Ilya

>
>         Jakub
Ilya Enkovich April 28, 2016, 10:49 a.m. UTC | #6
2016-04-28 13:43 GMT+03:00 Uros Bizjak <ubizjak@gmail.com>:
> On Thu, Apr 28, 2016 at 12:36 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>> 2016-04-27 22:58 GMT+03:00 Uros Bizjak <ubizjak@gmail.com>:
>>> Hello!
>>>
>>> This RFC patch illustrates the idea of using STV pass to load/store
>>> any TImode constant using SSE insns. The testcase:
>>>
>>> --cut here--
>>> __int128 x;
>>>
>>> __int128 test_1 (void)
>>> {
>>>   x = (__int128) 0x00112233;
>>> }
>>>
>>> __int128 test_2 (void)
>>> {
>>>   x = ((__int128) 0x0011223344556677 << 64);
>>> }
>>>
>>> __int128 test_3 (void)
>>> {
>>>   x = ((__int128) 0x0011223344556677 << 64) + (__int128) 0x0011223344556677;
>>> }
>>> --cut here--
>>>
>>> currently compiles (-O2) on x86_64 to:
>>>
>>> test_1:
>>>         movq    $1122867, x(%rip)
>>>         movq    $0, x+8(%rip)
>>>         ret
>>>
>>> test_2:
>>>         xorl    %eax, %eax
>>>         movabsq $4822678189205111, %rdx
>>>         movq    %rax, x(%rip)
>>>         movq    %rdx, x+8(%rip)
>>>         ret
>>>
>>> test_3:
>>>         movabsq $4822678189205111, %rax
>>>         movabsq $4822678189205111, %rdx
>>>         movq    %rax, x(%rip)
>>>         movq    %rdx, x+8(%rip)
>>>         ret
>>>
>>> However, using the attached patch, we compile all tests to:
>>>
>>> test:
>>>         movdqa  .LC0(%rip), %xmm0
>>>         movaps  %xmm0, x(%rip)
>>>         ret
>>>
>>> Ilya, HJ - do you think new sequences are better, or - as suggested by
>>> Jakub - they are beneficial with STV pass, as we are now able to load
>>> any immediate value? A variant of this patch can also be used to load
>>> DImode values to 32bit STV pass.
>>>
>>> Uros.
>>
>> Hi,
>>
>> Why don't we have two movq instructions in all three cases now?  Is it
>> because of late split?
>
> movq can handle only 32bit sign-extended immediates. There is actually
> room for improvement in test_2, where we could directly store 0 to
> x(%rip).

Right.  In this case timode_scalar_chain::compute_convert_gain should
analyze immediate values used in a chain.

Thanks,
Ilya

>
> Uros.
>
>> I wouldn't say SSE load+store is always better than two movq instructions.
>> But it obviously can enable bigger chains for STV which is good.  I think
>> you should adjust a cost model to handle immediates for proper decision.
>>
>> That's what I have in my draft for DImode immediates:
>>
>> @@ -3114,6 +3123,20 @@ scalar_chain::build (bitmap candidates,
>> unsigned insn_uid)
>>    BITMAP_FREE (queue);
>>  }
>>
>> +/* Return a cost of building a vector costant
>> +   instead of using a scalar one.  */
>> +
>> +int
>> +scalar_chain::vector_const_cost (rtx exp)
>> +{
>> +  gcc_assert (CONST_INT_P (exp));
>> +
>> +  if (const0_operand (exp, GET_MODE (exp))
>> +      || constm1_operand (exp, GET_MODE (exp)))
>> +    return COSTS_N_INSNS (1);
>> +  return ix86_cost->sse_load[1];
>> +}
>> +
>>  /* Compute a gain for chain conversion.  */
>>
>>  int
>> @@ -3145,11 +3168,25 @@ scalar_chain::compute_convert_gain ()
>>                || GET_CODE (src) == IOR
>>                || GET_CODE (src) == XOR
>>                || GET_CODE (src) == AND)
>> -       gain += ix86_cost->add;
>> +       {
>> +         gain += ix86_cost->add;
>> +         if (CONST_INT_P (XEXP (src, 0)))
>> +           gain -= scalar_chain::vector_const_cost (XEXP (src, 0));
>> +         if (CONST_INT_P (XEXP (src, 1)))
>> +           gain -= scalar_chain::vector_const_cost (XEXP (src, 1));
>> +       }
>>        else if (GET_CODE (src) == COMPARE)
>>         {
>>           /* Assume comparison cost is the same.  */
>>         }
>> +      else if (GET_CODE (src) == CONST_INT)
>> +       {
>> +         if (REG_P (dst))
>> +           gain += COSTS_N_INSNS (2);
>> +         else if (MEM_P (dst))
>> +           gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
>> +         gain -= scalar_chain::vector_const_cost (src);
>> +       }
>>        else
>>         gcc_unreachable ();
Jakub Jelinek April 29, 2016, 7:41 a.m. UTC | #7
On Thu, Apr 28, 2016 at 01:45:36PM +0300, Ilya Enkovich wrote:
> > Where does the 2 come from?  Is it that the STV pass right now supports only
> > 2 * wordsize modes?  Also, I don't think we should treat equally constants
> > that fit into the 32-bit immediates and constants that don't, the latter,
> > when movabsq needs to be used, are more costly.
> 
> This variant is for DImode going to split into two SImode.  TImode chains
> have own cost model.

Ok.  Still, we should make sure the cost of movabsq is significantly higher
than the cost of other immediates (not only because the other immediates can
be used directly in the instructions, while for movabsq you first need that
insn to initialize some reg and then use that reg in the other insn, but
also because of the movabsq latency).

	Jakub
Uros Bizjak April 29, 2016, 9:48 a.m. UTC | #8
On Thu, Apr 28, 2016 at 12:36 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:

> That's what I have in my draft for DImode immediates:
>
> @@ -3114,6 +3123,20 @@ scalar_chain::build (bitmap candidates,
> unsigned insn_uid)
>    BITMAP_FREE (queue);
>  }
>
> +/* Return a cost of building a vector costant
> +   instead of using a scalar one.  */
> +
> +int
> +scalar_chain::vector_const_cost (rtx exp)
> +{
> +  gcc_assert (CONST_INT_P (exp));
> +
> +  if (const0_operand (exp, GET_MODE (exp))
> +      || constm1_operand (exp, GET_MODE (exp)))

The above should just use

standard_sse_constant_p (exp, V2DImode).
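
With that change the helper would read roughly as follows (a sketch only,
combining the suggestion above with the draft posted earlier in the thread):

int
scalar_chain::vector_const_cost (rtx exp)
{
  gcc_assert (CONST_INT_P (exp));

  /* Standard SSE constants (all zeros, all ones) can be generated
     with a single instruction.  */
  if (standard_sse_constant_p (exp, V2DImode))
    return COSTS_N_INSNS (1);

  /* Anything else has to be loaded from the constant pool.  */
  return ix86_cost->sse_load[1];
}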

Uros.
Ilya Enkovich April 29, 2016, 11:26 a.m. UTC | #9
2016-04-29 12:48 GMT+03:00 Uros Bizjak <ubizjak@gmail.com>:
> On Thu, Apr 28, 2016 at 12:36 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>
>> That's what I have in my draft for DImode immediates:
>>
>> @@ -3114,6 +3123,20 @@ scalar_chain::build (bitmap candidates,
>> unsigned insn_uid)
>>    BITMAP_FREE (queue);
>>  }
>>
>> +/* Return a cost of building a vector costant
>> +   instead of using a scalar one.  */
>> +
>> +int
>> +scalar_chain::vector_const_cost (rtx exp)
>> +{
>> +  gcc_assert (CONST_INT_P (exp));
>> +
>> +  if (const0_operand (exp, GET_MODE (exp))
>> +      || constm1_operand (exp, GET_MODE (exp)))
>
> The above should just use
>
> standard_sse_constant_p (exp, V2DImode).

Thanks for the tip!  Surprisingly this replacement caused a different
cost for non-standard constants.  Looking at it in GDB I found:

(gdb) p exp
$3 = (rtx) 0x7ffff7f0b560
(gdb) pr
warning: Expression is not an assignment (and might have no effect)
(const_int -1085102592571150096 [0xf0f0f0f0f0f0f0f0])
(gdb) p constm1_operand (exp,GET_MODE (exp))
$4 = 1

Do I misuse constm1_operand?

Thanks,
Ilya

>
> Uros.
Uros Bizjak April 29, 2016, 11:56 a.m. UTC | #10
On Fri, Apr 29, 2016 at 1:26 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
> 2016-04-29 12:48 GMT+03:00 Uros Bizjak <ubizjak@gmail.com>:
>> On Thu, Apr 28, 2016 at 12:36 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>>
>>> That's what I have in my draft for DImode immediates:
>>>
>>> @@ -3114,6 +3123,20 @@ scalar_chain::build (bitmap candidates,
>>> unsigned insn_uid)
>>>    BITMAP_FREE (queue);
>>>  }
>>>
>>> +/* Return a cost of building a vector costant
>>> +   instead of using a scalar one.  */
>>> +
>>> +int
>>> +scalar_chain::vector_const_cost (rtx exp)
>>> +{
>>> +  gcc_assert (CONST_INT_P (exp));
>>> +
>>> +  if (const0_operand (exp, GET_MODE (exp))
>>> +      || constm1_operand (exp, GET_MODE (exp)))
>>
>> The above should just use
>>
>> standard_sse_constant_p (exp, V2DImode).
>
> Thanks for the tip!  Surprisingly this replacement caused a different
> cost for non-standard constants.  Looking at it in GDB I found:
>
> (gdb) p exp
> $3 = (rtx) 0x7ffff7f0b560
> (gdb) pr
> warning: Expression is not an assignment (and might have no effect)
> (const_int -1085102592571150096 [0xf0f0f0f0f0f0f0f0])
> (gdb) p constm1_operand (exp,GET_MODE (exp))
> $4 = 1
>
> Do I misuse constm1_operand?

No, it is just a typo that crept into constm1_operand:

;; Match exactly -1.
(define_predicate "constm1_operand"
  (and (match_code "const_int")
       (match_test "op = constm1_rtx")))

There should be a test, not an assignment.
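
With the assignment turned into a comparison, the predicate would read:

;; Match exactly -1.
(define_predicate "constm1_operand"
  (and (match_code "const_int")
       (match_test "op == constm1_rtx")))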

Uros.

Patch

Index: i386.c
===================================================================
--- i386.c	(revision 235526)
+++ i386.c	(working copy)
@@ -2854,29 +2854,16 @@  timode_scalar_to_vector_candidate_p (rtx_insn *ins
 
   if (MEM_P (dst))
     {
-      /* Check for store.  Only support store from register or standard
-	 SSE constants.  Memory must be aligned or unaligned store is
-	 optimal.  */
-      if (misaligned_operand (dst, TImode)
-	  && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
-	return false;
-
-      switch (GET_CODE (src))
-	{
-	default:
-	  return false;
-
-	case REG:
-	  return true;
-
-	case CONST_INT:
-	  return standard_sse_constant_p (src, TImode);
-	}
+      /* Check for store.  Memory must be aligned
+	 or unaligned store is optimal.  */
+      return ((REG_P (src) || CONST_SCALAR_INT_P (src))
+	      && (!misaligned_operand (dst, TImode)
+		  || TARGET_SSE_UNALIGNED_STORE_OPTIMAL));
     }
   else if (MEM_P (src))
     {
-      /* Check for load.  Memory must be aligned or unaligned load is
-	 optimal.  */
+      /* Check for load.  Memory must be aligned
+	 or unaligned load is optimal.  */
       return (REG_P (dst)
 	      && (!misaligned_operand (src, TImode)
 		  || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
@@ -3744,6 +3731,7 @@  timode_scalar_chain::convert_insn (rtx_insn *insn)
 	  PUT_MODE (XEXP (tmp, 0), V1TImode);
       }
       /* FALLTHRU */
+
     case MEM:
       PUT_MODE (dst, V1TImode);
       break;
@@ -3759,28 +3747,26 @@  timode_scalar_chain::convert_insn (rtx_insn *insn)
       PUT_MODE (src, V1TImode);
       break;
 
-    case CONST_INT:
-      switch (standard_sse_constant_p (src, TImode))
-	{
-	case 1:
-	  src = CONST0_RTX (GET_MODE (dst));
-	  break;
-	case 2:
-	  src = CONSTM1_RTX (GET_MODE (dst));
-	  break;
-	default:
-	  gcc_unreachable ();
-	}
-      if (NONDEBUG_INSN_P (insn))
-	{
-	  rtx tmp = gen_reg_rtx (V1TImode);
-	  /* Since there are no instructions to store standard SSE
-	     constant, temporary register usage is required.  */
-	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
-	  dst = tmp;
-	}
-      break;
+    CASE_CONST_SCALAR_INT:
+      {
+	rtx vec = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
 
+	if (NONDEBUG_INSN_P (insn))
+	  {
+	    rtx tmp = gen_reg_rtx (V1TImode);
+
+	    if (!standard_sse_constant_p (src, TImode))
+	      vec = validize_mem (force_const_mem (V1TImode, vec));
+
+	    /* We can only store from a SSE register.  */
+	    emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
+	    dst = tmp;
+	  }
+
+	src = vec;
+	break;
+      }
+  
     default:
       gcc_unreachable ();
     }
@@ -14784,8 +14770,7 @@  ix86_legitimate_constant_p (machine_mode mode, rtx
 #endif
       break;
 
-    case CONST_INT:
-    case CONST_WIDE_INT:
+    CASE_CONST_SCALAR_INT:
       switch (mode)
 	{
 	case TImode:
@@ -14823,10 +14808,7 @@  ix86_cannot_force_const_mem (machine_mode mode, rt
   /* We can always put integral constants and vectors in memory.  */
   switch (GET_CODE (x))
     {
-    case CONST_INT:
-    case CONST_WIDE_INT:
-    case CONST_DOUBLE:
-    case CONST_VECTOR:
+    CASE_CONST_ANY:
       return false;
 
     default: