| Message ID | 871u30mrv6.fsf@talisman.default |
|---|---|
| State | New |
| Headers | show |
On 11/01/2013 04:46 AM, Richard Sandiford wrote: > I'm building one target for each supported CPU and comparing the wide-int > assembly output of gcc.c-torture, gcc.dg and g++.dg with the corresponding > output from the merge point. This patch removes all the differences I saw > for alpha-linux-gnu in gcc.c-torture. > > Hunk 1: Preserve the current trunk behaviour in which the shift count > is truncated if SHIFT_COUNT_TRUNCATED and not otherwise. This was by > inspection after hunk 5. i used to do this inside of wide-int so that i would get consistent behavior for all clients, but i got beat up on it. > Hunks 2 and 3: Two cases where force_fit_to_type could extend to wider > types and where we weren't extending according to the sign of the source. > We should probably assert that the input is at least as wide as the type... > > Hunk 4: The "&" in: > > if ((TREE_INT_CST_HIGH (arg1) & mask_hi) == mask_hi > && (TREE_INT_CST_LOW (arg1) & mask_lo) == mask_lo) > > had got dropped during the conversion. > > Hunk 5: The old code was: > > if (shift > 0) > { > *mask = r1mask.llshift (shift, TYPE_PRECISION (type)); > *val = r1val.llshift (shift, TYPE_PRECISION (type)); > } > else if (shift < 0) > { > shift = -shift; > *mask = r1mask.rshift (shift, TYPE_PRECISION (type), !uns); > *val = r1val.rshift (shift, TYPE_PRECISION (type), !uns); > } > > and these precision arguments had two purposes: to control which > bits of the first argument were shifted, and to control the truncation > mask for SHIFT_TRUNCATED. We need to pass a width to the shift functions > for the second. > > (BTW, I'm running the comparisons with CONST_WIDE_INT locally moved to the > end of the !GENERATOR_FILE list in rtl.def, since the current position caused > some spurious differences. 
The "problem" AFAICT is that hash_rtx hashes > on code, RTL PRE creates registers in the hash order of the associated > expressions, RA uses register numbers as a tie-breaker during ordering, > and so the order of rtx_code can indirectly influence register allocation. > First time I'd realised that could happen, so just thought I'd mention it. > I think we should keep rtl.def in the current (logical) order though.) we noticed the difference and live with it, but i agree that for testing it is useful until the branch goes in. > Tested on x86_64-linux-gnu and powerpc64-linux-gnu. OK for wide-int? > > Thanks, > Richard > > > Index: gcc/fold-const.c > =================================================================== > --- gcc/fold-const.c (revision 204247) > +++ gcc/fold-const.c (working copy) > @@ -1008,9 +1008,12 @@ > The following code ignores overflow; perhaps a C standard > interpretation ruling is needed. */ > res = wi::rshift (arg1, arg2, sign, > - GET_MODE_BITSIZE (TYPE_MODE (type))); > + SHIFT_COUNT_TRUNCATED > + ? GET_MODE_BITSIZE (TYPE_MODE (type)) : 0); > else > - res = wi::lshift (arg1, arg2, GET_MODE_BITSIZE (TYPE_MODE (type))); > + res = wi::lshift (arg1, arg2, > + SHIFT_COUNT_TRUNCATED > + ? 
GET_MODE_BITSIZE (TYPE_MODE (type)) : 0); > break; > > case RROTATE_EXPR: > @@ -6870,7 +6873,8 @@ > return NULL_TREE; > > if (TREE_CODE (arg1) == INTEGER_CST) > - arg1 = force_fit_type (inner_type, arg1, 0, TREE_OVERFLOW (arg1)); > + arg1 = force_fit_type (inner_type, wi::to_widest (arg1), 0, > + TREE_OVERFLOW (arg1)); > else > arg1 = fold_convert_loc (loc, inner_type, arg1); > > @@ -8081,7 +8085,8 @@ > } > if (change) > { > - tem = force_fit_type (type, and1, 0, TREE_OVERFLOW (and1)); > + tem = force_fit_type (type, wi::to_widest (and1), 0, > + TREE_OVERFLOW (and1)); > return fold_build2_loc (loc, BIT_AND_EXPR, type, > fold_convert_loc (loc, type, and0), tem); > } > @@ -14098,12 +14103,13 @@ > (inner_width, outer_width - inner_width, false, > TYPE_PRECISION (TREE_TYPE (arg1))); > > - if (mask == arg1) > + wide_int common = mask & arg1; > + if (common == mask) > { > tem_type = signed_type_for (TREE_TYPE (tem)); > tem = fold_convert_loc (loc, tem_type, tem); > } > - else if ((mask & arg1) == 0) > + else if (common == 0) > { > tem_type = unsigned_type_for (TREE_TYPE (tem)); > tem = fold_convert_loc (loc, tem_type, tem); > Index: gcc/tree-ssa-ccp.c > =================================================================== > --- gcc/tree-ssa-ccp.c (revision 204247) > +++ gcc/tree-ssa-ccp.c (working copy) > @@ -1238,15 +1238,20 @@ > else > code = RSHIFT_EXPR; > } > + int shift_precision = SHIFT_COUNT_TRUNCATED ? 
width : 0; > if (code == RSHIFT_EXPR) > { > - *mask = wi::rshift (wi::ext (r1mask, width, sgn), shift, sgn); > - *val = wi::rshift (wi::ext (r1val, width, sgn), shift, sgn); > + *mask = wi::rshift (wi::ext (r1mask, width, sgn), > + shift, sgn, shift_precision); > + *val = wi::rshift (wi::ext (r1val, width, sgn), > + shift, sgn, shift_precision); > } > else > { > - *mask = wi::sext (wi::lshift (r1mask, shift), width); > - *val = wi::sext (wi::lshift (r1val, shift), width); > + *mask = wi::ext (wi::lshift (r1mask, shift, shift_precision), > + width, sgn); > + *val = wi::ext (wi::lshift (r1val, shift, shift_precision), > + width, sgn); > } > } > }
Kenneth Zadeck <zadeck@naturalbridge.com> writes: > On 11/01/2013 04:46 AM, Richard Sandiford wrote: >> I'm building one target for each supported CPU and comparing the wide-int >> assembly output of gcc.c-torture, gcc.dg and g++.dg with the corresponding >> output from the merge point. This patch removes all the differences I saw >> for alpha-linux-gnu in gcc.c-torture. >> >> Hunk 1: Preserve the current trunk behaviour in which the shift count >> is truncated if SHIFT_COUNT_TRUNCATED and not otherwise. This was by >> inspection after hunk 5. > i used to do this inside of wide-int so that i would get consistent > behavior for all clients, but i got beat up on it. Right :-) SHIFT_COUNT_TRUNCATED is a bad enough interface as it is. Putting it into a utility class like wide-int just makes it worse. This is something that clients _should_ think about if they want to emulate target behaviour (rather than doing "natural" maths). Not sure -- is the patch OK, or are you deferring judgement for now? Thanks, Richard
On 11/01/2013 09:31 AM, Richard Sandiford wrote: > Kenneth Zadeck <zadeck@naturalbridge.com> writes: >> On 11/01/2013 04:46 AM, Richard Sandiford wrote: >>> I'm building one target for each supported CPU and comparing the wide-int >>> assembly output of gcc.c-torture, gcc.dg and g++.dg with the corresponding >>> output from the merge point. This patch removes all the differences I saw >>> for alpha-linux-gnu in gcc.c-torture. >>> >>> Hunk 1: Preserve the current trunk behaviour in which the shift count >>> is truncated if SHIFT_COUNT_TRUNCATED and not otherwise. This was by >>> inspection after hunk 5. >> i used to do this inside of wide-int so that i would get consistent >> behavior for all clients, but i got beat up on it. > Right :-) SHIFT_COUNT_TRUNCATED is a bad enough interface as it is. > Putting into a utility class like wide-int just makes it worse. > This is something that clients _should_ think about if they want > to emulate target behaviour (rather than doing "natural" maths). > > Not sure -- is the patch OK, or are you deferring judgement for now? > > Thanks, > Richard it is fine, sorry. kenny
On Fri, 1 Nov 2013, Richard Sandiford wrote: > I'm building one target for each supported CPU and comparing the wide-int > assembly output of gcc.c-torture, gcc.dg and g++.dg with the corresponding > output from the merge point. This patch removes all the differences I saw > for alpha-linux-gnu in gcc.c-torture. > > Hunk 1: Preserve the current trunk behaviour in which the shift count > is truncated if SHIFT_COUNT_TRUNCATED and not otherwise. This was by > inspection after hunk 5. > > Hunks 2 and 3: Two cases where force_fit_to_type could extend to wider > types and where we weren't extending according to the sign of the source. > We should probably assert that the input is at least as wide as the type... > > Hunk 4: The "&" in: > > if ((TREE_INT_CST_HIGH (arg1) & mask_hi) == mask_hi > && (TREE_INT_CST_LOW (arg1) & mask_lo) == mask_lo) > > had got dropped during the conversion. > > Hunk 5: The old code was: > > if (shift > 0) > { > *mask = r1mask.llshift (shift, TYPE_PRECISION (type)); > *val = r1val.llshift (shift, TYPE_PRECISION (type)); > } > else if (shift < 0) > { > shift = -shift; > *mask = r1mask.rshift (shift, TYPE_PRECISION (type), !uns); > *val = r1val.rshift (shift, TYPE_PRECISION (type), !uns); > } > > and these precision arguments had two purposes: to control which > bits of the first argument were shifted, and to control the truncation > mask for SHIFT_TRUNCATED. We need to pass a width to the shift functions > for the second. > > (BTW, I'm running the comparisons with CONST_WIDE_INT locally moved to the > end of the !GENERATOR_FILE list in rtl.def, since the current position caused > some spurious differences. The "problem" AFAICT is that hash_rtx hashes > on code, RTL PRE creates registers in the hash order of the associated > expressions, RA uses register numbers as a tie-breaker during ordering, > and so the order of rtx_code can indirectly influence register allocation. > First time I'd realised that could happen, so just thought I'd mention it. 
> I think we should keep rtl.def in the current (logical) order though.) > > Tested on x86_64-linux-gnu and powerpc64-linux-gnu. OK for wide-int? Bah - can you instead try removing the use of SHIFT_COUNT_TRUNCATED from double-int.c on the trunk? If not then putting this at the callers of wi::rshift and friends is clearly asking for future mistakes. Oh, and I think that honoring SHIFT_COUNT_TRUNCATED anywhere else than in machine instruction patterns was a mistake in the past. Thanks, Richard. [btw, please use diff -p to get us at least some context if you are not writing changelogs ...] > Thanks, > Richard > > > Index: gcc/fold-const.c > =================================================================== > --- gcc/fold-const.c (revision 204247) > +++ gcc/fold-const.c (working copy) > @@ -1008,9 +1008,12 @@ > The following code ignores overflow; perhaps a C standard > interpretation ruling is needed. */ > res = wi::rshift (arg1, arg2, sign, > - GET_MODE_BITSIZE (TYPE_MODE (type))); > + SHIFT_COUNT_TRUNCATED > + ? GET_MODE_BITSIZE (TYPE_MODE (type)) : 0); > else > - res = wi::lshift (arg1, arg2, GET_MODE_BITSIZE (TYPE_MODE (type))); > + res = wi::lshift (arg1, arg2, > + SHIFT_COUNT_TRUNCATED > + ? 
GET_MODE_BITSIZE (TYPE_MODE (type)) : 0); > break; > > case RROTATE_EXPR: > @@ -6870,7 +6873,8 @@ > return NULL_TREE; > > if (TREE_CODE (arg1) == INTEGER_CST) > - arg1 = force_fit_type (inner_type, arg1, 0, TREE_OVERFLOW (arg1)); > + arg1 = force_fit_type (inner_type, wi::to_widest (arg1), 0, > + TREE_OVERFLOW (arg1)); > else > arg1 = fold_convert_loc (loc, inner_type, arg1); > > @@ -8081,7 +8085,8 @@ > } > if (change) > { > - tem = force_fit_type (type, and1, 0, TREE_OVERFLOW (and1)); > + tem = force_fit_type (type, wi::to_widest (and1), 0, > + TREE_OVERFLOW (and1)); > return fold_build2_loc (loc, BIT_AND_EXPR, type, > fold_convert_loc (loc, type, and0), tem); > } > @@ -14098,12 +14103,13 @@ > (inner_width, outer_width - inner_width, false, > TYPE_PRECISION (TREE_TYPE (arg1))); > > - if (mask == arg1) > + wide_int common = mask & arg1; > + if (common == mask) > { > tem_type = signed_type_for (TREE_TYPE (tem)); > tem = fold_convert_loc (loc, tem_type, tem); > } > - else if ((mask & arg1) == 0) > + else if (common == 0) > { > tem_type = unsigned_type_for (TREE_TYPE (tem)); > tem = fold_convert_loc (loc, tem_type, tem); > Index: gcc/tree-ssa-ccp.c > =================================================================== > --- gcc/tree-ssa-ccp.c (revision 204247) > +++ gcc/tree-ssa-ccp.c (working copy) > @@ -1238,15 +1238,20 @@ > else > code = RSHIFT_EXPR; > } > + int shift_precision = SHIFT_COUNT_TRUNCATED ? 
width : 0; > if (code == RSHIFT_EXPR) > { > - *mask = wi::rshift (wi::ext (r1mask, width, sgn), shift, sgn); > - *val = wi::rshift (wi::ext (r1val, width, sgn), shift, sgn); > + *mask = wi::rshift (wi::ext (r1mask, width, sgn), > + shift, sgn, shift_precision); > + *val = wi::rshift (wi::ext (r1val, width, sgn), > + shift, sgn, shift_precision); > } > else > { > - *mask = wi::sext (wi::lshift (r1mask, shift), width); > - *val = wi::sext (wi::lshift (r1val, shift), width); > + *mask = wi::ext (wi::lshift (r1mask, shift, shift_precision), > + width, sgn); > + *val = wi::ext (wi::lshift (r1val, shift, shift_precision), > + width, sgn); > } > } > } > >
On Mon, 4 Nov 2013, Richard Biener wrote: > On Fri, 1 Nov 2013, Richard Sandiford wrote: > > > I'm building one target for each supported CPU and comparing the wide-int > > assembly output of gcc.c-torture, gcc.dg and g++.dg with the corresponding > > output from the merge point. This patch removes all the differences I saw > > for alpha-linux-gnu in gcc.c-torture. > > > > Hunk 1: Preserve the current trunk behaviour in which the shift count > > is truncated if SHIFT_COUNT_TRUNCATED and not otherwise. This was by > > inspection after hunk 5. > > > > Hunks 2 and 3: Two cases where force_fit_to_type could extend to wider > > types and where we weren't extending according to the sign of the source. > > We should probably assert that the input is at least as wide as the type... > > > > Hunk 4: The "&" in: > > > > if ((TREE_INT_CST_HIGH (arg1) & mask_hi) == mask_hi > > && (TREE_INT_CST_LOW (arg1) & mask_lo) == mask_lo) > > > > had got dropped during the conversion. > > > > Hunk 5: The old code was: > > > > if (shift > 0) > > { > > *mask = r1mask.llshift (shift, TYPE_PRECISION (type)); > > *val = r1val.llshift (shift, TYPE_PRECISION (type)); > > } > > else if (shift < 0) > > { > > shift = -shift; > > *mask = r1mask.rshift (shift, TYPE_PRECISION (type), !uns); > > *val = r1val.rshift (shift, TYPE_PRECISION (type), !uns); > > } > > > > and these precision arguments had two purposes: to control which > > bits of the first argument were shifted, and to control the truncation > > mask for SHIFT_TRUNCATED. We need to pass a width to the shift functions > > for the second. > > > > (BTW, I'm running the comparisons with CONST_WIDE_INT locally moved to the > > end of the !GENERATOR_FILE list in rtl.def, since the current position caused > > some spurious differences. 
The "problem" AFAICT is that hash_rtx hashes > > on code, RTL PRE creates registers in the hash order of the associated > > expressions, RA uses register numbers as a tie-breaker during ordering, > > and so the order of rtx_code can indirectly influence register allocation. > > First time I'd realised that could happen, so just thought I'd mention it. > > I think we should keep rtl.def in the current (logical) order though.) > > > > Tested on x86_64-linux-gnu and powerpc64-linux-gnu. OK for wide-int? > > Bah - can you instead try removing the use of SHIFT_COUNT_TRUNCATED > from double-int.c on the trunk? If not then putting this at the > callers of wi::rshift and friends is clearly asking for future > mistakes. > > Oh, and I think that honoring SHIFT_COUNT_TRUNCATED anywhere else > than in machine instruction patterns was a mistake in the past. Oh, and my understanding is that it's maybe required for correctness on the RTL side if middle-end code removes masking operations. Constant propagation results later may not change the result because of that. I'm not sure this is the way it happens, but if we have (lshift:si (reg:si 1) (and:qi (reg:qi 2) 0x1f)) and we'd transform it to (based on SHIFT_COUNT_TRUNCATED) (lshift:si (reg:si 1) (reg:qi 2)) and later constant propagate 77 to reg:qi 2. That isn't the way SHIFT_COUNT_TRUNCATED should work, of course, but instead the backends should have a define_insn which matches the 'and' and combine should try to match that. You can see that various architectures may have truncating shifts but not for all patterns which results in stuff like config/aarch64/aarch64.h:#define SHIFT_COUNT_TRUNCATED !TARGET_SIMD which clearly means that it's not implemented in terms of providing shift insns which can match a shift operand that is masked. That is, either try to clean that up (hooray!), or preserve the double-int.[ch] behavior (how does CONST_INT RTL constant folding honor it?). Richard. > Thanks, > Richard. 
> > [btw, please use diff -p to get us at least some context if you > are not writing changelogs ...] > > > Thanks, > > Richard > > > > > > Index: gcc/fold-const.c > > =================================================================== > > --- gcc/fold-const.c (revision 204247) > > +++ gcc/fold-const.c (working copy) > > @@ -1008,9 +1008,12 @@ > > The following code ignores overflow; perhaps a C standard > > interpretation ruling is needed. */ > > res = wi::rshift (arg1, arg2, sign, > > - GET_MODE_BITSIZE (TYPE_MODE (type))); > > + SHIFT_COUNT_TRUNCATED > > + ? GET_MODE_BITSIZE (TYPE_MODE (type)) : 0); > > else > > - res = wi::lshift (arg1, arg2, GET_MODE_BITSIZE (TYPE_MODE (type))); > > + res = wi::lshift (arg1, arg2, > > + SHIFT_COUNT_TRUNCATED > > + ? GET_MODE_BITSIZE (TYPE_MODE (type)) : 0); > > break; > > > > case RROTATE_EXPR: > > @@ -6870,7 +6873,8 @@ > > return NULL_TREE; > > > > if (TREE_CODE (arg1) == INTEGER_CST) > > - arg1 = force_fit_type (inner_type, arg1, 0, TREE_OVERFLOW (arg1)); > > + arg1 = force_fit_type (inner_type, wi::to_widest (arg1), 0, > > + TREE_OVERFLOW (arg1)); > > else > > arg1 = fold_convert_loc (loc, inner_type, arg1); > > > > @@ -8081,7 +8085,8 @@ > > } > > if (change) > > { > > - tem = force_fit_type (type, and1, 0, TREE_OVERFLOW (and1)); > > + tem = force_fit_type (type, wi::to_widest (and1), 0, > > + TREE_OVERFLOW (and1)); > > return fold_build2_loc (loc, BIT_AND_EXPR, type, > > fold_convert_loc (loc, type, and0), tem); > > } > > @@ -14098,12 +14103,13 @@ > > (inner_width, outer_width - inner_width, false, > > TYPE_PRECISION (TREE_TYPE (arg1))); > > > > - if (mask == arg1) > > + wide_int common = mask & arg1; > > + if (common == mask) > > { > > tem_type = signed_type_for (TREE_TYPE (tem)); > > tem = fold_convert_loc (loc, tem_type, tem); > > } > > - else if ((mask & arg1) == 0) > > + else if (common == 0) > > { > > tem_type = unsigned_type_for (TREE_TYPE (tem)); > > tem = fold_convert_loc (loc, tem_type, tem); > > Index: 
gcc/tree-ssa-ccp.c > > =================================================================== > > --- gcc/tree-ssa-ccp.c (revision 204247) > > +++ gcc/tree-ssa-ccp.c (working copy) > > @@ -1238,15 +1238,20 @@ > > else > > code = RSHIFT_EXPR; > > } > > + int shift_precision = SHIFT_COUNT_TRUNCATED ? width : 0; > > if (code == RSHIFT_EXPR) > > { > > - *mask = wi::rshift (wi::ext (r1mask, width, sgn), shift, sgn); > > - *val = wi::rshift (wi::ext (r1val, width, sgn), shift, sgn); > > + *mask = wi::rshift (wi::ext (r1mask, width, sgn), > > + shift, sgn, shift_precision); > > + *val = wi::rshift (wi::ext (r1val, width, sgn), > > + shift, sgn, shift_precision); > > } > > else > > { > > - *mask = wi::sext (wi::lshift (r1mask, shift), width); > > - *val = wi::sext (wi::lshift (r1val, shift), width); > > + *mask = wi::ext (wi::lshift (r1mask, shift, shift_precision), > > + width, sgn); > > + *val = wi::ext (wi::lshift (r1val, shift, shift_precision), > > + width, sgn); > > } > > } > > } > > > > > >
Richard Biener <rguenther@suse.de> writes: > On Fri, 1 Nov 2013, Richard Sandiford wrote: >> I'm building one target for each supported CPU and comparing the wide-int >> assembly output of gcc.c-torture, gcc.dg and g++.dg with the corresponding >> output from the merge point. This patch removes all the differences I saw >> for alpha-linux-gnu in gcc.c-torture. >> >> Hunk 1: Preserve the current trunk behaviour in which the shift count >> is truncated if SHIFT_COUNT_TRUNCATED and not otherwise. This was by >> inspection after hunk 5. >> >> Hunks 2 and 3: Two cases where force_fit_to_type could extend to wider >> types and where we weren't extending according to the sign of the source. >> We should probably assert that the input is at least as wide as the type... >> >> Hunk 4: The "&" in: >> >> if ((TREE_INT_CST_HIGH (arg1) & mask_hi) == mask_hi >> && (TREE_INT_CST_LOW (arg1) & mask_lo) == mask_lo) >> >> had got dropped during the conversion. >> >> Hunk 5: The old code was: >> >> if (shift > 0) >> { >> *mask = r1mask.llshift (shift, TYPE_PRECISION (type)); >> *val = r1val.llshift (shift, TYPE_PRECISION (type)); >> } >> else if (shift < 0) >> { >> shift = -shift; >> *mask = r1mask.rshift (shift, TYPE_PRECISION (type), !uns); >> *val = r1val.rshift (shift, TYPE_PRECISION (type), !uns); >> } >> >> and these precision arguments had two purposes: to control which >> bits of the first argument were shifted, and to control the truncation >> mask for SHIFT_TRUNCATED. We need to pass a width to the shift functions >> for the second. >> >> (BTW, I'm running the comparisons with CONST_WIDE_INT locally moved to the >> end of the !GENERATOR_FILE list in rtl.def, since the current position caused >> some spurious differences. 
The "problem" AFAICT is that hash_rtx hashes >> on code, RTL PRE creates registers in the hash order of the associated >> expressions, RA uses register numbers as a tie-breaker during ordering, >> and so the order of rtx_code can indirectly influence register allocation. >> First time I'd realised that could happen, so just thought I'd mention it. >> I think we should keep rtl.def in the current (logical) order though.) >> >> Tested on x86_64-linux-gnu and powerpc64-linux-gnu. OK for wide-int? > > Bah - can you instead try removing the use of SHIFT_COUNT_TRUNCATED > from double-int.c on the trunk? If not then putting this at the > callers of wi::rshift and friends is clearly asking for future > mistakes. > > Oh, and I think that honoring SHIFT_COUNT_TRUNCATED anywhere else > than in machine instruction patterns was a mistake in the past. Sure, I can try, but what counts as success? For better or worse: unsigned int i = 1 << 32; has traditionally honoured target/SHIFT_COUNT_TRUNCATED behaviour, so (e.g.) i is 0 for x86_64 and 1 for MIPS. So if the idea is to get rid of SHIFT_COUNT_TRUNCATED at the tree level, it will be a visible change (but presumably only for undefined code). > [btw, please use diff -p to get us at least some context if you > are not writing changelogs ...] Yeah, sorry, this whole "no changelog" thing has messed up my workflow :-) The scripts I normally use did that for me. Thanks, Richard
Richard Biener <rguenther@suse.de> writes: > On Mon, 4 Nov 2013, Richard Biener wrote: > >> On Fri, 1 Nov 2013, Richard Sandiford wrote: >> >> > I'm building one target for each supported CPU and comparing the wide-int >> > assembly output of gcc.c-torture, gcc.dg and g++.dg with the corresponding >> > output from the merge point. This patch removes all the differences I saw >> > for alpha-linux-gnu in gcc.c-torture. >> > >> > Hunk 1: Preserve the current trunk behaviour in which the shift count >> > is truncated if SHIFT_COUNT_TRUNCATED and not otherwise. This was by >> > inspection after hunk 5. >> > >> > Hunks 2 and 3: Two cases where force_fit_to_type could extend to wider >> > types and where we weren't extending according to the sign of the source. >> > We should probably assert that the input is at least as wide as the type... >> > >> > Hunk 4: The "&" in: >> > >> > if ((TREE_INT_CST_HIGH (arg1) & mask_hi) == mask_hi >> > && (TREE_INT_CST_LOW (arg1) & mask_lo) == mask_lo) >> > >> > had got dropped during the conversion. >> > >> > Hunk 5: The old code was: >> > >> > if (shift > 0) >> > { >> > *mask = r1mask.llshift (shift, TYPE_PRECISION (type)); >> > *val = r1val.llshift (shift, TYPE_PRECISION (type)); >> > } >> > else if (shift < 0) >> > { >> > shift = -shift; >> > *mask = r1mask.rshift (shift, TYPE_PRECISION (type), !uns); >> > *val = r1val.rshift (shift, TYPE_PRECISION (type), !uns); >> > } >> > >> > and these precision arguments had two purposes: to control which >> > bits of the first argument were shifted, and to control the truncation >> > mask for SHIFT_TRUNCATED. We need to pass a width to the shift functions >> > for the second. >> > >> > (BTW, I'm running the comparisons with CONST_WIDE_INT locally moved to the >> > end of the !GENERATOR_FILE list in rtl.def, since the current position caused >> > some spurious differences. 
The "problem" AFAICT is that hash_rtx hashes >> > on code, RTL PRE creates registers in the hash order of the associated >> > expressions, RA uses register numbers as a tie-breaker during ordering, >> > and so the order of rtx_code can indirectly influence register allocation. >> > First time I'd realised that could happen, so just thought I'd mention it. >> > I think we should keep rtl.def in the current (logical) order though.) >> > >> > Tested on x86_64-linux-gnu and powerpc64-linux-gnu. OK for wide-int? >> >> Bah - can you instead try removing the use of SHIFT_COUNT_TRUNCATED >> from double-int.c on the trunk? If not then putting this at the >> callers of wi::rshift and friends is clearly asking for future >> mistakes. >> >> Oh, and I think that honoring SHIFT_COUNT_TRUNCATED anywhere else >> than in machine instruction patterns was a mistake in the past. > > Oh, and my understanding is that it's maybe required for correctness on > the RTL side if middle-end code removes masking operations. > Constant propagation results later may not change the result because > of that. I'm not sure this is the way it happens, but if we have > > (lshift:si (reg:si 1) (and:qi (reg:qi 2) 0x1f)) > > and we'd transform it to (based on SHIFT_COUNT_TRUNCATED) > > (lshift:si (reg:si 1) (reg:qi 2)) > > and later constant propagate 77 to reg:qi 2. > > That isn't the way SHIFT_COUNT_TRUNCATED should work, of course, > but instead the backends should have a define_insn which matches > the 'and' and combine should try to match that. > > You can see that various architectures may have truncating shifts > but not for all patterns which results in stuff like > > config/aarch64/aarch64.h:#define SHIFT_COUNT_TRUNCATED !TARGET_SIMD > > which clearly means that it's not implemented in terms of providing > shift insns which can match a shift operand that is masked. 
> > That is, either try to clean that up (hooray!), or preserve > the double-int.[ch] behavior (how does CONST_INT RTL constant > folding honor it?). The CONST_INT code does the modulus explicitly: /* Truncate the shift if SHIFT_COUNT_TRUNCATED, otherwise make sure the value is in range. We can't return any old value for out-of-range arguments because either the middle-end (via shift_truncation_mask) or the back-end might be relying on target-specific knowledge. Nor can we rely on shift_truncation_mask, since the shift might not be part of an ashlM3, lshrM3 or ashrM3 instruction. */ if (SHIFT_COUNT_TRUNCATED) arg1 = (unsigned HOST_WIDE_INT) arg1 % width; else if (arg1 < 0 || arg1 >= GET_MODE_BITSIZE (mode)) return 0; There was a strong feeling (from me and others) that wide-int.h should not depend on tm.h. I don't think wi::lshift itself should know about SHIFT_COUNT_TRUNCATED. Especially since (and I think we agree on this :-)) it's a terrible interface. It would be better if it took a mode as an argument and (perhaps) returned a mask as a result. But that would make it even harder for wide-int.h to do the right thing, because it doesn't know about modes. Having the callers do it seems like the right thing to me, both now and after any future tweaks to SHIFT_COUNT_TRUNCATED. If your objection is that it's too easy to forget, then I don't think that's a problem at the RTL level. simplify_const_binary_operation is the only place that does constant RTX shifts. Sorry, I don't have time to replace SHIFT_COUNT_TRUNCATED right now. Thanks, Richard
On Mon, 4 Nov 2013, Richard Sandiford wrote: > Richard Biener <rguenther@suse.de> writes: > > On Fri, 1 Nov 2013, Richard Sandiford wrote: > >> I'm building one target for each supported CPU and comparing the wide-int > >> assembly output of gcc.c-torture, gcc.dg and g++.dg with the corresponding > >> output from the merge point. This patch removes all the differences I saw > >> for alpha-linux-gnu in gcc.c-torture. > >> > >> Hunk 1: Preserve the current trunk behaviour in which the shift count > >> is truncated if SHIFT_COUNT_TRUNCATED and not otherwise. This was by > >> inspection after hunk 5. > >> > >> Hunks 2 and 3: Two cases where force_fit_to_type could extend to wider > >> types and where we weren't extending according to the sign of the source. > >> We should probably assert that the input is at least as wide as the type... > >> > >> Hunk 4: The "&" in: > >> > >> if ((TREE_INT_CST_HIGH (arg1) & mask_hi) == mask_hi > >> && (TREE_INT_CST_LOW (arg1) & mask_lo) == mask_lo) > >> > >> had got dropped during the conversion. > >> > >> Hunk 5: The old code was: > >> > >> if (shift > 0) > >> { > >> *mask = r1mask.llshift (shift, TYPE_PRECISION (type)); > >> *val = r1val.llshift (shift, TYPE_PRECISION (type)); > >> } > >> else if (shift < 0) > >> { > >> shift = -shift; > >> *mask = r1mask.rshift (shift, TYPE_PRECISION (type), !uns); > >> *val = r1val.rshift (shift, TYPE_PRECISION (type), !uns); > >> } > >> > >> and these precision arguments had two purposes: to control which > >> bits of the first argument were shifted, and to control the truncation > >> mask for SHIFT_TRUNCATED. We need to pass a width to the shift functions > >> for the second. > >> > >> (BTW, I'm running the comparisons with CONST_WIDE_INT locally moved to the > >> end of the !GENERATOR_FILE list in rtl.def, since the current position caused > >> some spurious differences. 
The "problem" AFAICT is that hash_rtx hashes > >> on code, RTL PRE creates registers in the hash order of the associated > >> expressions, RA uses register numbers as a tie-breaker during ordering, > >> and so the order of rtx_code can indirectly influence register allocation. > >> First time I'd realised that could happen, so just thought I'd mention it. > >> I think we should keep rtl.def in the current (logical) order though.) > >> > >> Tested on x86_64-linux-gnu and powerpc64-linux-gnu. OK for wide-int? > > > > Bah - can you instead try removing the use of SHIFT_COUNT_TRUNCATED > > from double-int.c on the trunk? If not then putting this at the > > callers of wi::rshift and friends is clearly asking for future > > mistakes. > > > > Oh, and I think that honoring SHIFT_COUNT_TRUNCATED anywhere else > > than in machine instruction patterns was a mistake in the past. > > Sure, I can try, but what counts as success? For better or worse: > > unsigned int i = 1 << 32; > > has traditionally honoured target/SHIFT_COUNT_TRUNCATED behaviour, > so (e.g.) i is 0 for x86_64 and 1 for MIPS. So if the idea is to > get rid of SHIFT_COUNT_TRUNCATED at the tree level, it will be a > visible change (but presumably only for undefined code). Sure - but the above is undefined (as you say). Generally constant folding should follow language standards, not machine dependent behavior. > > [btw, please use diff -p to get us at least some context if you > > are not writing changelogs ...] > > Yeah, sorry, this whole "no changelog" thing has messed up my workflow :-) > The scripts I normally use did that for me. ;) Richard.
On Mon, 4 Nov 2013, Richard Sandiford wrote: > Richard Biener <rguenther@suse.de> writes: > > On Mon, 4 Nov 2013, Richard Biener wrote: > > > >> On Fri, 1 Nov 2013, Richard Sandiford wrote: > >> > >> > I'm building one target for each supported CPU and comparing the wide-int > >> > assembly output of gcc.c-torture, gcc.dg and g++.dg with the corresponding > >> > output from the merge point. This patch removes all the differences I saw > >> > for alpha-linux-gnu in gcc.c-torture. > >> > > >> > Hunk 1: Preserve the current trunk behaviour in which the shift count > >> > is truncated if SHIFT_COUNT_TRUNCATED and not otherwise. This was by > >> > inspection after hunk 5. > >> > > >> > Hunks 2 and 3: Two cases where force_fit_to_type could extend to wider > >> > types and where we weren't extending according to the sign of the source. > >> > We should probably assert that the input is at least as wide as the type... > >> > > >> > Hunk 4: The "&" in: > >> > > >> > if ((TREE_INT_CST_HIGH (arg1) & mask_hi) == mask_hi > >> > && (TREE_INT_CST_LOW (arg1) & mask_lo) == mask_lo) > >> > > >> > had got dropped during the conversion. > >> > > >> > Hunk 5: The old code was: > >> > > >> > if (shift > 0) > >> > { > >> > *mask = r1mask.llshift (shift, TYPE_PRECISION (type)); > >> > *val = r1val.llshift (shift, TYPE_PRECISION (type)); > >> > } > >> > else if (shift < 0) > >> > { > >> > shift = -shift; > >> > *mask = r1mask.rshift (shift, TYPE_PRECISION (type), !uns); > >> > *val = r1val.rshift (shift, TYPE_PRECISION (type), !uns); > >> > } > >> > > >> > and these precision arguments had two purposes: to control which > >> > bits of the first argument were shifted, and to control the truncation > >> > mask for SHIFT_TRUNCATED. We need to pass a width to the shift functions > >> > for the second. 
> >> > > >> > (BTW, I'm running the comparisons with CONST_WIDE_INT locally moved to the > >> > end of the !GENERATOR_FILE list in rtl.def, since the current position caused > >> > some spurious differences. The "problem" AFAICT is that hash_rtx hashes > >> > on code, RTL PRE creates registers in the hash order of the associated > >> > expressions, RA uses register numbers as a tie-breaker during ordering, > >> > and so the order of rtx_code can indirectly influence register allocation. > >> > First time I'd realised that could happen, so just thought I'd mention it. > >> > I think we should keep rtl.def in the current (logical) order though.) > >> > > >> > Tested on x86_64-linux-gnu and powerpc64-linux-gnu. OK for wide-int? > >> > >> Bah - can you instead try removing the use of SHIFT_COUNT_TRUNCATED > >> from double-int.c on the trunk? If not then putting this at the > >> callers of wi::rshift and friends is clearly asking for future > >> mistakes. > >> > >> Oh, and I think that honoring SHIFT_COUNT_TRUNCATED anywhere else > >> than in machine instruction patterns was a mistake in the past. > > > > Oh, and my understanding is that it's maybe required for correctness on > > the RTL side if middle-end code removes masking operations. > > Constant propagation results later may not change the result because > > of that. I'm not sure this is the way it happens, but if we have > > > > (lshift:si (reg:si 1) (and:qi (reg:qi 2) 0x1f)) > > > > and we'd transform it to (based on SHIFT_COUNT_TRUNCATED) > > > > (lshift:si (reg:si 1) (reg:qi 2)) > > > > and later constant propagate 77 to reg:qi 2. > > > > That isn't the way SHIFT_COUNT_TRUNCATED should work, of course, > > but instead the backends should have a define_insn which matches > > the 'and' and combine should try to match that. 
> > > > You can see that various architectures may have truncating shifts > > but not for all patterns which results in stuff like > > > > config/aarch64/aarch64.h:#define SHIFT_COUNT_TRUNCATED !TARGET_SIMD > > > > which clearly means that it's not implemented in terms of providing > > shift insns which can match a shift operand that is masked. > > > > That is, either try to clean that up (hooray!), or preserve > > the double-int.[ch] behavior (how does CONST_INT RTL constant > > folding honor it?). > > The CONST_INT code does the modulus explicitly: > > /* Truncate the shift if SHIFT_COUNT_TRUNCATED, otherwise make sure > the value is in range. We can't return any old value for > out-of-range arguments because either the middle-end (via > shift_truncation_mask) or the back-end might be relying on > target-specific knowledge. Nor can we rely on > shift_truncation_mask, since the shift might not be part of an > ashlM3, lshrM3 or ashrM3 instruction. */ > if (SHIFT_COUNT_TRUNCATED) > arg1 = (unsigned HOST_WIDE_INT) arg1 % width; > else if (arg1 < 0 || arg1 >= GET_MODE_BITSIZE (mode)) > return 0; > > There was a strong feeling (from me and others) that wide-int.h > should not depend on tm.h. I don't think wi::lshift itself should > know about SHIFT_COUNT_TRUNCATED. Especially since (and I think we agree > on this :-)) it's a terrible interface. It would be better if it took > a mode as an argument and (perhaps) returned a mask as a result. But that > would make it even harder for wide-int.h to do the right thing, because > it doesn't know about modes. Having the callers do it seems like the > right thing to me, both now and after any future tweaks to > SHIFT_COUNT_TRUNCATED. Hmm, yeah. Oh well. > If your objection is that it's too easy to forget, then I don't think > that's a problem at the RTL level. simplify_const_binary_operation > is the only place that does constant RTX shifts. > > Sorry, I don't have time to replace SHIFT_COUNT_TRUNCATED right now. 
Personally I'd be happy with a global #define SHIFT_COUNT_TRUNCATED 0 (thus basically do a source code transform based on that). Any target we regress on is left to target maintainers to decide if and how to fix. Of course picking a single popular one and restoring behavior there would be nice (you may pick AARCH64 ...). Most targets have weird comments like /* Immediate shift counts are truncated by the output routines (or was it the assembler?). Shift counts in a register are truncated by ARM. Note that the native compiler puts too large (> 32) immediate shift counts into a register and shifts by the register, letting the ARM decide what to do instead of doing that itself. */ /* This is all wrong. Defining SHIFT_COUNT_TRUNCATED tells combine that code like (X << (Y % 32)) for register X, Y is equivalent to (X << Y). On the arm, Y in a register is used modulo 256 for the shift. Only for rotates is modulo 32 used. */ /* #define SHIFT_COUNT_TRUNCATED 1 */ and thus they end up erroring on the safe side and using 0 for SHIFT_COUNT_TRUNCATED anyway... A quick grep shows that targets with SHIFT_COUNT_TRUNCATED 1 are aarch64 (for !TARGET_SIMD), alpha, epiphany, iq2000, lm32, m32r, mep, microblaze, mips (for !TARGET_LOONGSON_VECTORS), mn10300, nds32, pa, picochip, score, sparc, stormy16, tilepro, v850 and xtensa. I'm sure one or the other uses it with wrong assumptions about how it works. AFAICS SHIFT_COUNT_TRUNCATED provides the opportunity to remove certain explicit mask operations for shift operands. Thanks, Richard.
On 11/04/2013 04:12 AM, Richard Biener wrote: > On Fri, 1 Nov 2013, Richard Sandiford wrote: > >> I'm building one target for each supported CPU and comparing the wide-int >> assembly output of gcc.c-torture, gcc.dg and g++.dg with the corresponding >> output from the merge point. This patch removes all the differences I saw >> for alpha-linux-gnu in gcc.c-torture. >> >> Hunk 1: Preserve the current trunk behaviour in which the shift count >> is truncated if SHIFT_COUNT_TRUNCATED and not otherwise. This was by >> inspection after hunk 5. >> >> Hunks 2 and 3: Two cases where force_fit_to_type could extend to wider >> types and where we weren't extending according to the sign of the source. >> We should probably assert that the input is at least as wide as the type... >> >> Hunk 4: The "&" in: >> >> if ((TREE_INT_CST_HIGH (arg1) & mask_hi) == mask_hi >> && (TREE_INT_CST_LOW (arg1) & mask_lo) == mask_lo) >> >> had got dropped during the conversion. >> >> Hunk 5: The old code was: >> >> if (shift > 0) >> { >> *mask = r1mask.llshift (shift, TYPE_PRECISION (type)); >> *val = r1val.llshift (shift, TYPE_PRECISION (type)); >> } >> else if (shift < 0) >> { >> shift = -shift; >> *mask = r1mask.rshift (shift, TYPE_PRECISION (type), !uns); >> *val = r1val.rshift (shift, TYPE_PRECISION (type), !uns); >> } >> >> and these precision arguments had two purposes: to control which >> bits of the first argument were shifted, and to control the truncation >> mask for SHIFT_TRUNCATED. We need to pass a width to the shift functions >> for the second. >> >> (BTW, I'm running the comparisons with CONST_WIDE_INT locally moved to the >> end of the !GENERATOR_FILE list in rtl.def, since the current position caused >> some spurious differences. 
The "problem" AFAICT is that hash_rtx hashes >> on code, RTL PRE creates registers in the hash order of the associated >> expressions, RA uses register numbers as a tie-breaker during ordering, >> and so the order of rtx_code can indirectly influence register allocation. >> First time I'd realised that could happen, so just thought I'd mention it. >> I think we should keep rtl.def in the current (logical) order though.) >> >> Tested on x86_64-linux-gnu and powerpc64-linux-gnu. OK for wide-int? > Bah - can you instead try removing the use of SHIFT_COUNT_TRUNCATED > from double-int.c on the trunk? If not then putting this at the > callers of wi::rshift and friends is clearly asking for future > mistakes. > > Oh, and I think that honoring SHIFT_COUNT_TRUNCATED anywhere else > than in machine instruction patterns was a mistake in the past. I disagree. The programmer has the right to expect that the optimizer transforms the program in a manner that is consistent, but faster, with the underlying machine. Saying you are going to do it in the patterns is not realistic. I should point out that i had originally put the shift count truncated inside wide-int and you (richi) told me to push it back out to the callers. I personally think that it should be returned to wide-int to assure consistent behavior. However, i would also say that it would be nice if were actually done right. There are (as far as i know) actually 3 ways handling shift amounts that have been used in the past: (1) mask the shift amount by (bitsize - 1). This is the most popular and is what is done if you set SHIFT_COUNT_TRUNCATED. (2) mask the shift amount by ((2*bitsize) -1). There are a couple of architectures that do this including the ppc. However, gcc has never done it correctly for these platforms. (3) assume that the shift amount can be a big number. The only machine that i know of that ever did this was the Knuth's mmix, and AFAIK, it has never "known" silicon. 
(it turns out that this is almost impossible to build efficiently). And yet, we support this if you do not set SHIFT_COUNT_TRUNCATED. I will, if you like, fix this properly. But i want to see a plan that at least richi and richard are happy with first. My gut feeling is to provide a target hook that takes a bitsize and a shift value as wide-int (or possibly a hwi) and returns a machine specific shift amount. Then i would put that inside of the shift routines of wide-int. kenny
Kenneth Zadeck <zadeck@naturalbridge.com> writes: > However, i would also say that it would be nice if were actually done > right. There are (as far as i know) actually 3 ways handling shift > amounts that have been used in the past: > > (1) mask the shift amount by (bitsize - 1). This is the most popular > and is what is done if you set SHIFT_COUNT_TRUNCATED. > (2) mask the shift amount by ((2*bitsize) -1). There are a couple of > architectures that do this including the ppc. However, gcc has never > done it correctly for these platforms. There's TARGET_SHIFT_TRUNCATION_MASK for these cases. It applies only to the shift instruction patterns though, not to any random rtl shift expression. Part of the reason for this mess is that the things represented as rtl shifts didn't necessarily start out as shifts. E.g. during combine, extractions are rewritten as shifts, optimised, then converted back. So since SHIFT_COUNT_TRUNCATED applies to all rtl shift expressions, it also has to apply to all bitfield operations, since bitfield operations temporarily appear as shifts (see tm.texi). TARGET_SHIFT_TRUNCATION_MASK deliberately limits the scope to the shift instruction patterns because that's easier to contain. E.g. knowing that shifts operate to a certain mask allows you to generate more efficient doubleword shift sequences, which was why TARGET_SHIFT_TRUNCATION_MASK was added in the first place. > (3) assume that the shift amount can be a big number. The only > machine that i know of that ever did this was the Knuth's mmix, and > AFAIK, it has never "known" silicon. (it turns out that this is almost > impossible to build efficiently). And yet, we support this if you do > not set SHIFT_COUNT_TRUNCATED. Well, we support it in the sense that we make no assumptions about what happens, at least at the rtl level. We just punt on any out-of-range shift and leave it to be evaluated at run time. Thanks, Richard
Index: gcc/fold-const.c =================================================================== --- gcc/fold-const.c (revision 204247) +++ gcc/fold-const.c (working copy) @@ -1008,9 +1008,12 @@ The following code ignores overflow; perhaps a C standard interpretation ruling is needed. */ res = wi::rshift (arg1, arg2, sign, - GET_MODE_BITSIZE (TYPE_MODE (type))); + SHIFT_COUNT_TRUNCATED + ? GET_MODE_BITSIZE (TYPE_MODE (type)) : 0); else - res = wi::lshift (arg1, arg2, GET_MODE_BITSIZE (TYPE_MODE (type))); + res = wi::lshift (arg1, arg2, + SHIFT_COUNT_TRUNCATED + ? GET_MODE_BITSIZE (TYPE_MODE (type)) : 0); break; case RROTATE_EXPR: @@ -6870,7 +6873,8 @@ return NULL_TREE; if (TREE_CODE (arg1) == INTEGER_CST) - arg1 = force_fit_type (inner_type, arg1, 0, TREE_OVERFLOW (arg1)); + arg1 = force_fit_type (inner_type, wi::to_widest (arg1), 0, + TREE_OVERFLOW (arg1)); else arg1 = fold_convert_loc (loc, inner_type, arg1); @@ -8081,7 +8085,8 @@ } if (change) { - tem = force_fit_type (type, and1, 0, TREE_OVERFLOW (and1)); + tem = force_fit_type (type, wi::to_widest (and1), 0, + TREE_OVERFLOW (and1)); return fold_build2_loc (loc, BIT_AND_EXPR, type, fold_convert_loc (loc, type, and0), tem); } @@ -14098,12 +14103,13 @@ (inner_width, outer_width - inner_width, false, TYPE_PRECISION (TREE_TYPE (arg1))); - if (mask == arg1) + wide_int common = mask & arg1; + if (common == mask) { tem_type = signed_type_for (TREE_TYPE (tem)); tem = fold_convert_loc (loc, tem_type, tem); } - else if ((mask & arg1) == 0) + else if (common == 0) { tem_type = unsigned_type_for (TREE_TYPE (tem)); tem = fold_convert_loc (loc, tem_type, tem); Index: gcc/tree-ssa-ccp.c =================================================================== --- gcc/tree-ssa-ccp.c (revision 204247) +++ gcc/tree-ssa-ccp.c (working copy) @@ -1238,15 +1238,20 @@ else code = RSHIFT_EXPR; } + int shift_precision = SHIFT_COUNT_TRUNCATED ? 
width : 0; if (code == RSHIFT_EXPR) { - *mask = wi::rshift (wi::ext (r1mask, width, sgn), shift, sgn); - *val = wi::rshift (wi::ext (r1val, width, sgn), shift, sgn); + *mask = wi::rshift (wi::ext (r1mask, width, sgn), + shift, sgn, shift_precision); + *val = wi::rshift (wi::ext (r1val, width, sgn), + shift, sgn, shift_precision); } else { - *mask = wi::sext (wi::lshift (r1mask, shift), width); - *val = wi::sext (wi::lshift (r1val, shift), width); + *mask = wi::ext (wi::lshift (r1mask, shift, shift_precision), + width, sgn); + *val = wi::ext (wi::lshift (r1val, shift, shift_precision), + width, sgn); } } }