@@ -32563,6 +32543,15 @@
nargs = 1;
klass = load;
memory = 0;
+ switch (icode)
+ {
+ case CODE_FOR_sse4_1_movntdqa:
+ case CODE_FOR_avx2_movntdqa:
+ aligned_mem = true;
+ break;
+ default:
+ break;
+ }
break;
case VOID_FTYPE_PV2SF_V4SF:
case VOID_FTYPE_PV4DI_V4DI:
@@ -5278,6 +5278,7 @@
%vmovlps\t{%2, %0|%q0, %2}"
[(set_attr "isa" "noavx,avx,noavx,avx,*")
(set_attr "type" "sseshuf,sseshuf,ssemov,ssemov,ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "length_immediate" "1,1,*,*,*")
(set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
@@ -7066,6 +7067,7 @@
#"
[(set_attr "isa" "noavx,avx,noavx,avx,*,*,*")
(set_attr "type" "ssemov,ssemov,sselog,sselog,ssemov,fmov,imov")
+ (set_attr "ssememalign" "64")
(set_attr "prefix_data16" "1,*,*,*,*,*,*")
(set_attr "prefix" "orig,vex,orig,vex,*,*,*")
(set_attr "mode" "V1DF,V1DF,V2DF,V2DF,DF,DF,DF")])
@@ -7134,6 +7136,7 @@
(const_string "imov")
]
(const_string "ssemov")))
+ (set_attr "ssememalign" "64")
(set_attr "prefix_data16" "*,1,*,*,*,*,1,*,*,*,*")
(set_attr "length_immediate" "*,*,*,*,*,1,*,*,*,*,*")
(set_attr "prefix" "maybe_vex,orig,vex,orig,vex,orig,orig,vex,*,*,*")
> However, I don't think ix86_expand_special_args_builtin is needed. It
> is the duty of the programmer to pass properly aligned address to
> these builtins. The compiler shouldn't magically "fix" invalid code,
> in the same way as it shouldn't "break" valid code as in the case
> above.
What I'm trying to do in ix86_expand_special_args_builtin is not in any way
fixing invalid code. The problem is that those builtins don't have a memory
as an argument, they have a pointer, and the MEM is compiler-created from
scratch. As it uses just gen_rtx_MEM, it always means just BITS_PER_UNIT
alignment on those MEMs, so the problem is that the combiner hook will
reject any changes whatsoever on such instructions. get_pointer_alignment
(added in the patch) adds there alignment info from what the compiler can
find out (e.g. if the pointer points to a decl with certain alignment
etc.), but if it is e.g. an argument of a function, it will likely still
return BITS_PER_UNIT even though in valid code it will always be properly
aligned (you'd need to use __builtin_assume_aligned or similar to force the
alignment there). I went through the special builtins for which gen_rtx_MEM
is called and they apparently fall into just two categories, one is various
builtins for unaligned vector loads or for < 16 byte load/store instructions
(type 5 in AVX spec, so no #UD on misalign, at most #AC if enabled), and the
other are non-temporal loads/stores, which both pre-AVX and AVX+ require
strict alignment. Only for the latter category I'm forcing the mode
alignment, because even if the compiler can't prove correct alignment based
on ccp etc., valid program must have such MEMs properly aligned and if we
create MEMs with smaller alignment, the combine hook will just punt on them
always.
So, ok for trunk this way? I guess 4.8 patch should wait some time before
being backported.
2013-12-04 Jakub Jelinek <jakub@redhat.com>
Uros Bizjak <ubizjak@gmail.com>
PR target/59163
* config/i386/i386.c (ix86_legitimate_combined_insn): If for
!TARGET_AVX there is misaligned MEM operand with vector mode
and get_attr_ssememalign is 0, return false.
(ix86_expand_special_args_builtin): Add get_pointer_alignment
computed alignment and for non-temporal loads/stores also
at least GET_MODE_ALIGNMENT as MEM_ALIGN.
* config/i386/sse.md
(<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>,
<sse>_storeu<ssemodesuffix><avxsizesuffix>,
<sse2_avx_avx512f>_loaddqu<mode><mask_name>,
<sse2_avx_avx512f>_storedqu<mode>, <sse3>_lddqu<avxsizesuffix>,
sse_vmrcpv4sf2, sse_vmrsqrtv4sf2, sse2_cvtdq2pd, sse_movhlps,
sse_movlhps, sse_storehps, sse_loadhps, sse_loadlps,
*vec_interleave_highv2df, *vec_interleave_lowv2df,
*vec_extractv2df_1_sse, sse2_movsd, sse4_1_<code>v8qiv8hi2,
sse4_1_<code>v4qiv4si2, sse4_1_<code>v4hiv4si2,
sse4_1_<code>v2qiv2di2, sse4_1_<code>v2hiv2di2,
sse4_1_<code>v2siv2di2, sse4_2_pcmpestr, *sse4_2_pcmpestr_unaligned,
sse4_2_pcmpestri, sse4_2_pcmpestrm, sse4_2_pcmpestr_cconly,
sse4_2_pcmpistr, *sse4_2_pcmpistr_unaligned, sse4_2_pcmpistri,
sse4_2_pcmpistrm, sse4_2_pcmpistr_cconly): Add ssememalign attribute.
* config/i386/i386.md (ssememalign): New define_attr.
* g++.dg/torture/pr59163.C: New test.
@@ -5733,6 +5733,17 @@ ix86_legitimate_combined_insn (rtx insn)
bool win;
int j;
+ /* For pre-AVX disallow unaligned loads/stores where the
+ instructions don't support it. */
+ if (!TARGET_AVX
+ && VECTOR_MODE_P (GET_MODE (op))
+ && misaligned_operand (op, GET_MODE (op)))
+ {
+ int min_align = get_attr_ssememalign (insn);
+ if (min_align == 0)
+ return false;
+ }
+
/* A unary operator may be accepted by the predicate, but it
is irrelevant for matching constraints. */
if (UNARY_P (op))
@@ -32481,11 +32492,12 @@ ix86_expand_args_builtin (const struct b
static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
- tree exp, rtx target)
+ tree exp, rtx target)
{
tree arg;
rtx pat, op;
unsigned int i, nargs, arg_adjust, memory;
+ bool aligned_mem = false;
struct
{
rtx op;
@@ -32531,6 +32543,15 @@ ix86_expand_special_args_builtin (const
nargs = 1;
klass = load;
memory = 0;
+ switch (icode)
+ {
+ case CODE_FOR_sse4_1_movntdqa:
+ case CODE_FOR_avx2_movntdqa:
+ aligned_mem = true;
+ break;
+ default:
+ break;
+ }
break;
case VOID_FTYPE_PV2SF_V4SF:
case VOID_FTYPE_PV4DI_V4DI:
@@ -32548,6 +32569,26 @@ ix86_expand_special_args_builtin (const
klass = store;
/* Reserve memory operand for target. */
memory = ARRAY_SIZE (args);
+ switch (icode)
+ {
+ /* These builtins and instructions require the memory
+ to be properly aligned. */
+ case CODE_FOR_avx_movntv4di:
+ case CODE_FOR_sse2_movntv2di:
+ case CODE_FOR_avx_movntv8sf:
+ case CODE_FOR_sse_movntv4sf:
+ case CODE_FOR_sse4a_vmmovntv4sf:
+ case CODE_FOR_avx_movntv4df:
+ case CODE_FOR_sse2_movntv2df:
+ case CODE_FOR_sse4a_vmmovntv2df:
+ case CODE_FOR_sse2_movntidi:
+ case CODE_FOR_sse_movntq:
+ case CODE_FOR_sse2_movntisi:
+ aligned_mem = true;
+ break;
+ default:
+ break;
+ }
break;
case V4SF_FTYPE_V4SF_PCV2SF:
case V2DF_FTYPE_V2DF_PCDOUBLE:
@@ -32604,6 +32645,11 @@ ix86_expand_special_args_builtin (const
{
op = ix86_zero_extend_to_Pmode (op);
target = gen_rtx_MEM (tmode, op);
+ unsigned int align = get_pointer_alignment (arg);
+ if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
+ align = GET_MODE_ALIGNMENT (tmode);
+ if (MEM_ALIGN (target) < align)
+ set_mem_align (target, align);
}
else
target = force_reg (tmode, op);
@@ -32649,8 +32695,11 @@ ix86_expand_special_args_builtin (const
/* This must be the memory operand. */
op = ix86_zero_extend_to_Pmode (op);
op = gen_rtx_MEM (mode, op);
- gcc_assert (GET_MODE (op) == mode
- || GET_MODE (op) == VOIDmode);
+ unsigned int align = get_pointer_alignment (arg);
+ if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
+ align = GET_MODE_ALIGNMENT (mode);
+ if (MEM_ALIGN (op) < align)
+ set_mem_align (op, align);
}
else
{
@@ -931,6 +931,7 @@ (define_insn "<sse>_loadu<ssemodesuffix>
}
[(set_attr "type" "ssemov")
(set_attr "movu" "1")
+ (set_attr "ssememalign" "8")
(set_attr "prefix" "maybe_vex")
(set (attr "mode")
(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
@@ -961,6 +962,7 @@ (define_insn "<sse>_storeu<ssemodesuffix
}
[(set_attr "type" "ssemov")
(set_attr "movu" "1")
+ (set_attr "ssememalign" "8")
(set_attr "prefix" "maybe_vex")
(set (attr "mode")
(cond [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
@@ -1020,6 +1022,7 @@ (define_insn "<sse2_avx_avx512f>_loaddqu
}
[(set_attr "type" "ssemov")
(set_attr "movu" "1")
+ (set_attr "ssememalign" "8")
(set (attr "prefix_data16")
(if_then_else
(match_test "TARGET_AVX")
@@ -1059,6 +1062,7 @@ (define_insn "<sse2_avx_avx512f>_storedq
}
[(set_attr "type" "ssemov")
(set_attr "movu" "1")
+ (set_attr "ssememalign" "8")
(set (attr "prefix_data16")
(if_then_else
(match_test "TARGET_AVX")
@@ -1105,6 +1109,7 @@ (define_insn "<sse3>_lddqu<avxsizesuffix
"%vlddqu\t{%1, %0|%0, %1}"
[(set_attr "type" "ssemov")
(set_attr "movu" "1")
+ (set_attr "ssememalign" "8")
(set (attr "prefix_data16")
(if_then_else
(match_test "TARGET_AVX")
@@ -1369,6 +1374,7 @@ (define_insn "sse_vmrcpv4sf2"
vrcpss\t{%1, %2, %0|%0, %2, %k1}"
[(set_attr "isa" "noavx,avx")
(set_attr "type" "sse")
+ (set_attr "ssememalign" "32")
(set_attr "atom_sse_attr" "rcp")
(set_attr "btver2_sse_attr" "rcp")
(set_attr "prefix" "orig,vex")
@@ -1509,6 +1515,7 @@ (define_insn "sse_vmrsqrtv4sf2"
vrsqrtss\t{%1, %2, %0|%0, %2, %k1}"
[(set_attr "isa" "noavx,avx")
(set_attr "type" "sse")
+ (set_attr "ssememalign" "32")
(set_attr "prefix" "orig,vex")
(set_attr "mode" "SF")])
@@ -3853,6 +3860,7 @@ (define_insn "sse2_cvtdq2pd"
"%vcvtdq2pd\t{%1, %0|%0, %q1}"
[(set_attr "type" "ssecvt")
(set_attr "prefix" "maybe_vex")
+ (set_attr "ssememalign" "64")
(set_attr "mode" "V2DF")])
(define_insn "<mask_codefor>avx512f_cvtpd2dq512<mask_name>"
@@ -4725,6 +4733,7 @@ (define_insn "sse_movhlps"
%vmovhps\t{%2, %0|%q0, %2}"
[(set_attr "isa" "noavx,avx,noavx,avx,*")
(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
@@ -4770,6 +4779,7 @@ (define_insn "sse_movlhps"
%vmovlps\t{%2, %H0|%H0, %2}"
[(set_attr "isa" "noavx,avx,noavx,avx,*")
(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
@@ -5174,6 +5184,7 @@ (define_insn "sse_storehps"
%vmovhlps\t{%1, %d0|%d0, %1}
%vmovlps\t{%H1, %d0|%d0, %H1}"
[(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "V2SF,V4SF,V2SF")])
@@ -5213,6 +5224,7 @@ (define_insn "sse_loadhps"
%vmovlps\t{%2, %H0|%H0, %2}"
[(set_attr "isa" "noavx,avx,noavx,avx,*")
(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
(set_attr "mode" "V2SF,V2SF,V4SF,V4SF,V2SF")])
@@ -5266,6 +5278,7 @@ (define_insn "sse_loadlps"
%vmovlps\t{%2, %0|%q0, %2}"
[(set_attr "isa" "noavx,avx,noavx,avx,*")
(set_attr "type" "sseshuf,sseshuf,ssemov,ssemov,ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "length_immediate" "1,1,*,*,*")
(set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
@@ -6224,7 +6237,8 @@ (define_insn "*vec_interleave_highv2df"
vmovlpd\t{%H1, %2, %0|%0, %2, %H1}
%vmovhpd\t{%1, %0|%q0, %1}"
[(set_attr "isa" "noavx,avx,sse3,noavx,avx,*")
- (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
+ (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "prefix_data16" "*,*,*,1,*,1")
(set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex")
(set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
@@ -6368,6 +6382,7 @@ (define_insn "*vec_interleave_lowv2df"
%vmovlpd\t{%2, %H0|%H0, %2}"
[(set_attr "isa" "noavx,avx,sse3,noavx,avx,*")
(set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "prefix_data16" "*,*,*,1,*,1")
(set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex")
(set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
@@ -6959,6 +6974,7 @@ (define_insn "*vec_extractv2df_1_sse"
movhlps\t{%1, %0|%0, %1}
movlps\t{%H1, %0|%0, %H1}"
[(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "mode" "V2SF,V4SF,V2SF")])
;; Avoid combining registers from different units in a single alternative,
@@ -7163,6 +7179,7 @@ (define_insn "sse2_movsd"
(const_string "1")
(const_string "*")))
(set_attr "length_immediate" "*,*,*,*,*,1,*,*,*")
+ (set_attr "ssememalign" "64")
(set_attr "prefix" "orig,vex,orig,vex,maybe_vex,orig,orig,vex,maybe_vex")
(set_attr "mode" "DF,DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,V1DF")])
@@ -11459,6 +11476,7 @@ (define_insn "sse4_1_<code>v8qiv8hi2"
"TARGET_SSE4_1"
"%vpmov<extsuffix>bw\t{%1, %0|%0, %q1}"
[(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "TI")])
@@ -11499,6 +11517,7 @@ (define_insn "sse4_1_<code>v4qiv4si2"
"TARGET_SSE4_1"
"%vpmov<extsuffix>bd\t{%1, %0|%0, %k1}"
[(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "32")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "TI")])
@@ -11534,6 +11553,7 @@ (define_insn "sse4_1_<code>v4hiv4si2"
"TARGET_SSE4_1"
"%vpmov<extsuffix>wd\t{%1, %0|%0, %q1}"
[(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "TI")])
@@ -11576,6 +11596,7 @@ (define_insn "sse4_1_<code>v2qiv2di2"
"TARGET_SSE4_1"
"%vpmov<extsuffix>bq\t{%1, %0|%0, %w1}"
[(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "16")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "TI")])
@@ -11613,6 +11634,7 @@ (define_insn "sse4_1_<code>v2hiv2di2"
"TARGET_SSE4_1"
"%vpmov<extsuffix>wq\t{%1, %0|%0, %k1}"
[(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "32")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "TI")])
@@ -11646,6 +11668,7 @@ (define_insn "sse4_1_<code>v2siv2di2"
"TARGET_SSE4_1"
"%vpmov<extsuffix>dq\t{%1, %0|%0, %q1}"
[(set_attr "type" "ssemov")
+ (set_attr "ssememalign" "64")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "TI")])
@@ -11939,6 +11962,7 @@ (define_insn_and_split "sse4_2_pcmpestr"
[(set_attr "type" "sselog")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
+ (set_attr "ssememalign" "8")
(set_attr "length_immediate" "1")
(set_attr "memory" "none,load")
(set_attr "mode" "TI")])
@@ -12001,6 +12025,7 @@ (define_insn_and_split "*sse4_2_pcmpestr
[(set_attr "type" "sselog")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
+ (set_attr "ssememalign" "8")
(set_attr "length_immediate" "1")
(set_attr "memory" "load")
(set_attr "mode" "TI")])
@@ -12028,6 +12053,7 @@ (define_insn "sse4_2_pcmpestri"
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "maybe_vex")
+ (set_attr "ssememalign" "8")
(set_attr "length_immediate" "1")
(set_attr "btver2_decode" "vector")
(set_attr "memory" "none,load")
@@ -12055,6 +12081,7 @@ (define_insn "sse4_2_pcmpestrm"
[(set_attr "type" "sselog")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
+ (set_attr "ssememalign" "8")
(set_attr "length_immediate" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "btver2_decode" "vector")
@@ -12081,6 +12108,7 @@ (define_insn "sse4_2_pcmpestr_cconly"
[(set_attr "type" "sselog")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
+ (set_attr "ssememalign" "8")
(set_attr "length_immediate" "1")
(set_attr "memory" "none,load,none,load")
(set_attr "btver2_decode" "vector,vector,vector,vector")
@@ -12134,6 +12162,7 @@ (define_insn_and_split "sse4_2_pcmpistr"
[(set_attr "type" "sselog")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
+ (set_attr "ssememalign" "8")
(set_attr "length_immediate" "1")
(set_attr "memory" "none,load")
(set_attr "mode" "TI")])
@@ -12187,6 +12216,7 @@ (define_insn_and_split "*sse4_2_pcmpistr
[(set_attr "type" "sselog")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
+ (set_attr "ssememalign" "8")
(set_attr "length_immediate" "1")
(set_attr "memory" "load")
(set_attr "mode" "TI")])
@@ -12209,6 +12239,7 @@ (define_insn "sse4_2_pcmpistri"
[(set_attr "type" "sselog")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
+ (set_attr "ssememalign" "8")
(set_attr "length_immediate" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "memory" "none,load")
@@ -12233,6 +12264,7 @@ (define_insn "sse4_2_pcmpistrm"
[(set_attr "type" "sselog")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
+ (set_attr "ssememalign" "8")
(set_attr "length_immediate" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "memory" "none,load")
@@ -12257,6 +12289,7 @@ (define_insn "sse4_2_pcmpistr_cconly"
[(set_attr "type" "sselog")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
+ (set_attr "ssememalign" "8")
(set_attr "length_immediate" "1")
(set_attr "memory" "none,load,none,load")
(set_attr "prefix" "maybe_vex")
@@ -402,6 +402,13 @@ (define_attr "unit" "integer,i387,sse,mm
(const_string "unknown")]
(const_string "integer")))
+;; The minimum required alignment of vector mode memory operands of the SSE
+;; (non-VEX/EVEX) instruction in bits, if it is different from
+;; GET_MODE_ALIGNMENT of the operand, otherwise 0. If an instruction has
+;; multiple alternatives, this should be conservative maximum of those minimum
+;; required alignments.
+(define_attr "ssememalign" "" (const_int 0))
+
;; The (bounding maximum) length of an instruction immediate.
(define_attr "length_immediate" ""
(cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave,
@@ -0,0 +1,30 @@
+// PR target/59163
+// { dg-do run }
+
+struct A { float a[4]; };
+struct B { int b; A a; };
+
+__attribute__((noinline, noclone)) void
+bar (A &a)
+{
+ if (a.a[0] != 36.0f || a.a[1] != 42.0f || a.a[2] != 48.0f || a.a[3] != 54.0f)
+ __builtin_abort ();
+}
+
+__attribute__((noinline, noclone)) void
+foo (A &a)
+{
+ int i;
+ A c = a;
+ for (i = 0; i < 4; i++)
+ c.a[i] *= 6.0f;
+ a = c;
+ bar (a);
+}
+
+int
+main ()
+{
+ B b = { 5, { 6, 7, 8, 9 } };
+ foo (b.a);
+}