Message ID: 874n82rcpn.fsf@talisman.default
State: New
On Sun, 27 Oct 2013, Richard Sandiford wrote:
> This patch adds some more optimisations to the wi:: comparison functions.
> It uses the:
>
>   #define CONSTANT(X) (__builtin_constant_p (X) && (X))
>
> idiom that was mentioned before, except that I thought CONSTANT would be
> too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
> Better names welcome.

STATIC_CONSTANT_P similar to static_assert?

> The changes are:
>
> - Add a fast path to eq_p for when one of the inputs isn't sign-extended.
>   This includes code to handle compile-time 0 specially.
>
> - Add the opposite optimisation to Mike's lts_p change, if we can tell at
>   compile time that it applies.

I think the cases Mike added should only be enabled when we can figure
them out at compile-time, too.

Ok with that change.

Thanks,
Richard.

> - Add fast paths to ltu_p for constants.
>
> E.g.:
>
> bool
> f1 (const_tree x)
> {
>   return wi::eq_p (x, 0);
> }
>
> now gives:
>
>         xorl    %eax, %eax
>         cmpw    $1, 4(%rdi)
>         je      .L5
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L5:
>         cmpq    $0, 16(%rdi)
>         sete    %al
>         ret
>
> bool
> f2 (const_tree x, HOST_WIDE_INT y)
> {
>   return wi::eq_p (x, y);
> }
>
> gives:
>
>         movq    8(%rdi), %rax
>         movzwl  52(%rax), %edx
>         xorl    %eax, %eax
>         andw    $1023, %dx
>         cmpw    $1, 4(%rdi)
>         je      .L10
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L10:
>         xorq    16(%rdi), %rsi
>         movzwl  %dx, %edx
>         movl    $64, %ecx
>         subl    %edx, %ecx
>         movq    %rsi, %rax
>         salq    %cl, %rax
>         testl   %ecx, %ecx
>         cmovg   %rax, %rsi
>         testq   %rsi, %rsi
>         sete    %al
>         ret
>
> bool
> f3 (HOST_WIDE_INT x, const_tree y)
> {
>   return wi::lts_p (x, y);
> }
>
> is similarly ugly because of the way that it ignores TYPE_SIGN and so
> has to explicitly sign-extend "small-prec" cases:
>
>         movq    8(%rsi), %rax
>         movzwl  4(%rsi), %ecx
>         movzwl  52(%rax), %edx
>         andl    $1023, %edx
>         cmpl    $1, %ecx
>         je      .L16
>         leal    -1(%rcx), %eax
>         sall    $6, %ecx
>         subl    %edx, %ecx
>         movq    16(%rsi,%rax,8), %rax
>         movq    %rax, %rdx
>         salq    %cl, %rdx
>         testl   %ecx, %ecx
>         cmovg   %rdx, %rax
>         sarq    $63, %rax
>         addl    $1, %eax
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L16:
>         cmpl    $63, %edx
>         movq    16(%rsi), %rax
>         ja      .L13
>         movb    $64, %cl
>         subl    %edx, %ecx
>         salq    %cl, %rax
>         sarq    %cl, %rax
> .L13:
>         cmpq    %rdi, %rax
>         setg    %al
>         ret
>
> but:
>
> bool
> f4 (HOST_WIDE_INT x, const_tree y)
> {
>   return wi::lts_p (x, wi::to_widest (y));
> }
>
> is a bit more respectable:
>
>         movzwl  6(%rsi), %eax
>         cmpl    $1, %eax
>         je      .L20
>         subl    $1, %eax
>         movq    16(%rsi,%rax,8), %rax
>         sarq    $63, %rax
>         addl    $1, %eax
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L20:
>         cmpq    %rdi, 16(%rsi)
>         setg    %al
>         ret
>
> For similar reasons:
>
> bool
> f5 (const_tree x)
> {
>   return wi::ltu_p (x, 100);
> }
>
> gives:
>
>         movq    8(%rdi), %rax
>         movzwl  52(%rax), %ecx
>         xorl    %eax, %eax
>         andw    $1023, %cx
>         cmpw    $1, 4(%rdi)
>         je      .L26
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L26:
>         cmpw    $63, %cx
>         ja      .L23
>         movl    $1, %eax
>         salq    %cl, %rax
>         subq    $1, %rax
>         andq    16(%rdi), %rax
> .L24:
>         cmpq    $99, %rax
>         setbe   %al
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L23:
>         movq    16(%rdi), %rax
>         jmp     .L24
>
> but:
>
> bool
> f6 (const_tree x)
> {
>   return wi::ltu_p (wi::to_widest (x), 100);
> }
>
> gives:
>
>         xorl    %eax, %eax
>         cmpw    $1, 6(%rdi)
>         je      .L30
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L30:
>         cmpq    $99, 16(%rdi)
>         setbe   %al
>         ret
>
> Tested on powerpc64-linux-gnu and x86_64-linux-gnu.  OK for wide-int?
>
> Thanks,
> Richard
>
>
> Index: gcc/system.h
> ===================================================================
> --- gcc/system.h	2013-10-27 14:25:19.144723977 +0000
> +++ gcc/system.h	2013-10-27 14:25:20.716738045 +0000
> @@ -711,6 +711,12 @@ #define gcc_unreachable() __builtin_unre
>  #define gcc_unreachable() (fancy_abort (__FILE__, __LINE__, __FUNCTION__))
>  #endif
>  
> +#if GCC_VERSION >= 3001
> +#define CAN_TELL(X) (__builtin_constant_p (X) && (X))
> +#else
> +#define CAN_TELL(X) (false && (X))
> +#endif
> +
>  /* Until we can use STATIC_ASSERT.  */
>  #define STATIC_ASSERT(X) \
>    typedef int assertion1[(X) ? 1 : -1] ATTRIBUTE_UNUSED
> Index: gcc/wide-int.h
> ===================================================================
> --- gcc/wide-int.h	2013-10-27 14:25:19.144723977 +0000
> +++ gcc/wide-int.h	2013-10-27 14:37:34.834443832 +0000
> @@ -1495,6 +1495,7 @@ wi::eq_p (const T1 &x, const T2 &y)
>    WIDE_INT_REF_FOR (T2) yi (y, precision);
>    if (xi.is_sign_extended && yi.is_sign_extended)
>      {
> +      /* This case reduces to array equality.  */
>        if (xi.len != yi.len)
>  	return false;
>        unsigned int i = 0;
> @@ -1504,10 +1505,21 @@ wi::eq_p (const T1 &x, const T2 &y)
>        while (++i != xi.len);
>        return true;
>      }
> -  if (precision <= HOST_BITS_PER_WIDE_INT)
> +  if (yi.len == 1)
>      {
> -      unsigned HOST_WIDE_INT diff = xi.ulow () ^ yi.ulow ();
> -      return (diff << (-precision % HOST_BITS_PER_WIDE_INT)) == 0;
> +      /* XI is only equal to YI if it too has a single HWI.  */
> +      if (xi.len != 1)
> +	return false;
> +      /* Excess bits in xi.val[0] will be signs or zeros, so comparisons
> +	 with 0 are simple.  */
> +      if (CAN_TELL (yi.val[0] == 0))
> +	return xi.val[0] == 0;
> +      /* Otherwise flush out any excess bits first.  */
> +      unsigned HOST_WIDE_INT diff = xi.val[0] ^ yi.val[0];
> +      int excess = HOST_BITS_PER_WIDE_INT - precision;
> +      if (excess > 0)
> +	diff <<= excess;
> +      return diff == 0;
>      }
>    return eq_p_large (xi.val, xi.len, yi.val, yi.len, precision);
>  }
> @@ -1528,20 +1540,28 @@ wi::lts_p (const T1 &x, const T2 &y)
>    unsigned int precision = get_binary_precision (x, y);
>    WIDE_INT_REF_FOR (T1) xi (x, precision);
>    WIDE_INT_REF_FOR (T2) yi (y, precision);
> -  // We optimize x < y, where y is 64 or fewer bits.
> +  /* We optimize x < y, where y is 64 or fewer bits.  */
>    if (wi::fits_shwi_p (yi))
>      {
> -      // If x fits directly into a shwi, we can compare directly.
> +      /* Make lts_p (x, 0) as efficient as wi::neg_p (x).  */
> +      if (CAN_TELL (yi.val[0] == 0))
> +	return neg_p (xi);
> +      /* If x fits directly into a shwi, we can compare directly.  */
>        if (wi::fits_shwi_p (xi))
>  	return xi.to_shwi () < yi.to_shwi ();
> -      // If x doesn't fit and is negative, then it must be more
> -      // negative than any value in y, and hence smaller than y.
> -      if (neg_p (xi, SIGNED))
> +      /* If x doesn't fit and is negative, then it must be more
> +	 negative than any value in y, and hence smaller than y.  */
> +      if (neg_p (xi))
>  	return true;
> -      // If x is positive, then it must be larger than any value in y,
> -      // and hence greater than y.
> +      /* If x is positive, then it must be larger than any value in y,
> +	 and hence greater than y.  */
>        return false;
>      }
> +  /* Optimize the opposite case, if it can be detected at compile time.  */
> +  if (CAN_TELL (xi.len == 1))
> +    /* If YI is negative it is lower than the least HWI.
> +       If YI is positive it is greater than the greatest HWI.  */
> +    return !neg_p (yi);
>    return lts_p_large (xi.val, xi.len, precision, yi.val, yi.len);
>  }
>  
> @@ -1553,6 +1573,12 @@ wi::ltu_p (const T1 &x, const T2 &y)
>    unsigned int precision = get_binary_precision (x, y);
>    WIDE_INT_REF_FOR (T1) xi (x, precision);
>    WIDE_INT_REF_FOR (T2) yi (y, precision);
> +  /* Optimize comparisons with constants and with sub-HWI unsigned
> +     integers.  */
> +  if (CAN_TELL (yi.len == 1 && yi.val[0] >= 0))
> +    return xi.len == 1 && xi.to_uhwi () < (unsigned HOST_WIDE_INT) yi.val[0];
> +  if (CAN_TELL (xi.len == 1 && xi.val[0] >= 0))
> +    return yi.len != 1 || yi.to_uhwi () > (unsigned HOST_WIDE_INT) xi.val[0];
>    if (precision <= HOST_BITS_PER_WIDE_INT)
>      {
>        unsigned HOST_WIDE_INT xl = xi.to_uhwi ();
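As a standalone illustration of the __builtin_constant_p idiom being named
here, a minimal sketch with invented names (eq and eq_large are not from
the patch; only the macro body matches it):

#if defined (__GNUC__)
#define STATIC_CONSTANT_P(X) (__builtin_constant_p (X) && (X))
#else
#define STATIC_CONSTANT_P(X) (false && (X))
#endif

/* Hypothetical out-of-line general case.  */
extern bool eq_large (long x, long y);

inline bool
eq (long x, long y)
{
  /* For a call such as eq (x, 0), GCC proves y == 0 constant, the guard
     folds to true, and only the cheap test survives.  For a variable y,
     __builtin_constant_p yields 0, the guard folds to false, and this
     branch disappears, costing nothing at run time.  */
  if (STATIC_CONSTANT_P (y == 0))
    return x == 0;
  return eq_large (x, y);
}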
Richard Biener <rguenther@suse.de> writes:
> On Sun, 27 Oct 2013, Richard Sandiford wrote:
>> This patch adds some more optimisations to the wi:: comparison functions.
>> It uses the:
>>
>>   #define CONSTANT(X) (__builtin_constant_p (X) && (X))
>>
>> idiom that was mentioned before, except that I thought CONSTANT would be
>> too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
>> Better names welcome.
>
> STATIC_CONSTANT_P similar to static_assert?

Sounds good.

>> The changes are:
>>
>> - Add a fast path to eq_p for when one of the inputs isn't sign-extended.
>>   This includes code to handle compile-time 0 specially.
>>
>> - Add the opposite optimisation to Mike's lts_p change, if we can tell at
>>   compile time that it applies.
>
> I think the cases Mike added should only be enabled when we can figure
> them out at compile-time, too.

Well, the idea with most of these functions was to handle the simple
"everything is one HWI" cases inline.  For operations like addition this
needs to be based on the precision, since that's the only cheap way of
knowing whether the result is also a single HWI.  But for logic operations
like AND and OR, it can be based on whether the inputs are single HWIs,
since single-HWI inputs give a single-HWI output.  This catches more cases.

lts_p is a bit like AND and OR in this respect.  fits_shwi_p is the
same as "len == 1", which is simple to check and is often true.  If we
restrict the optimisation to STATIC_CONSTANT_P we'll end up using the
out-of-line comparison for all TREE_INT_CST_LTs, even the int-int ones
that we currently handle inline.

Thanks,
Richard
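To make the "everything is one HWI" pattern concrete, here is a condensed
sketch under stated assumptions: wide_value is a hypothetical stand-in for
the real wide-int reference types, and lts/lts_large are invented names.

#include <cstdint>

/* Hypothetical stand-in for a wide-int reference: LEN is the number of
   significant host wide ints (HWIs), VAL the little-endian array.  */
struct wide_value
{
  unsigned int len;
  const int64_t *val;
};

/* Out-of-line general case; a sketch of its body appears further down
   the thread.  */
bool lts_large (const wide_value &x, const wide_value &y);

inline bool
lts (const wide_value &x, const wide_value &y)
{
  /* fits_shwi_p amounts to len == 1, so the dominant dynamic case is a
     plain signed comparison that stays inline; everything else falls
     through to the out-of-line routine.  */
  if (x.len == 1 && y.len == 1)
    return x.val[0] < y.val[0];
  return lts_large (x, y);
}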
On 10/29/2013 08:43 AM, Richard Sandiford wrote:
> Well, the idea with most of these functions was to handle the simple
> "everything is one HWI" cases inline.  [...]  If we restrict the
> optimisation to STATIC_CONSTANT_P we'll end up using the out-of-line
> comparison for all TREE_INT_CST_LTs, even the int-int ones that we
> currently handle inline.

This has always been my way of looking at it.

kenny
On 10/29/2013 08:43 AM, Richard Sandiford wrote:
> [...]
> Thanks,
> Richard

yes, this is a very boring and mechanical patch and i found nothing to
talk about.

kenny
On Oct 29, 2013, at 5:43 AM, Richard Sandiford <rsandifo@linux.vnet.ibm.com> wrote:
> Richard Biener <rguenther@suse.de> writes:
>>
>> I think the cases Mike added should only be enabled when we can figure
>> them out at compile-time, too.
>
> Well, the idea with most of these functions was to handle the simple
> "everything is one HWI" cases inline.

Yes.  The idea is that 99.99% of all cases are 1 HWI or less,
dynamically.  Inlining and handling those cases inline, provided the
code is relatively small, seemed like a win.  This win comes at the cost
of duplicative code in the wider path, as it checks the precision/length
as well, but slowing down those cases seemed reasonable given how
infrequent we expect them to be.  Now, if the slow-path code is the same
speed as the inline code would have been, certainly the duplication just
hurts.  This is why I was posting the code fragments for the fast-path
case.  I was aiming for the fast path to be really nice to help ensure
that we don't slow down compiles on narrow code at all.
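To illustrate the duplication cost described here, a rough sketch of what
the out-of-line routine might look like, continuing the hypothetical
wide_value/lts_large names from the earlier sketch (the struct is repeated
so this fragment stands alone):

#include <cstdint>

struct wide_value
{
  unsigned int len;
  const int64_t *val;
};

/* Hypothetical out-of-line comparison.  Values are assumed sign-extended;
   ">> 63" relies on the arithmetic right shift GCC performs on signed
   values.  */
bool
lts_large (const wide_value &x, const wide_value &y)
{
  /* The one-HWI test recurs here even though the inline caller already
     filtered it; this is the kind of duplication in the wider path
     described above.  */
  if (x.len == 1 && y.len == 1)
    return x.val[0] < y.val[0];

  /* General case: compare from the most significant HWI downwards,
     sign-extending the shorter operand.  The top HWI compares signed,
     the remaining HWIs compare unsigned.  */
  unsigned int len = x.len > y.len ? x.len : y.len;
  for (unsigned int i = len; i-- > 0;)
    {
      int64_t xw = i < x.len ? x.val[i] : x.val[x.len - 1] >> 63;
      int64_t yw = i < y.len ? y.val[i] : y.val[y.len - 1] >> 63;
      if (xw != yw)
	return i == len - 1 ? xw < yw : (uint64_t) xw < (uint64_t) yw;
    }
  return false;
}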
On Tue, 29 Oct 2013, Mike Stump wrote:

> On Oct 29, 2013, at 5:43 AM, Richard Sandiford <rsandifo@linux.vnet.ibm.com> wrote:
> > Richard Biener <rguenther@suse.de> writes:
> >> I think the cases Mike added should only be enabled when we can figure
> >> them out at compile-time, too.
> >
> > Well, the idea with most of these functions was to handle the simple
> > "everything is one HWI" cases inline.
>
> Yes.  The idea is that 99.99% of all cases are 1 HWI or less,
> dynamically.  [...]

Handling the len == 1 case inline is fine; just don't add too many fast
special cases (mixed len == 1, len > 1, special known-extended cases,
etc.) that are not decidable at compile time, as branchy code is both
hard to optimize and puts load on the I-cache and the branch predictor.

Richard.
Index: gcc/system.h
===================================================================
--- gcc/system.h	2013-10-27 14:25:19.144723977 +0000
+++ gcc/system.h	2013-10-27 14:25:20.716738045 +0000
@@ -711,6 +711,12 @@ #define gcc_unreachable() __builtin_unre
 #define gcc_unreachable() (fancy_abort (__FILE__, __LINE__, __FUNCTION__))
 #endif
 
+#if GCC_VERSION >= 3001
+#define CAN_TELL(X) (__builtin_constant_p (X) && (X))
+#else
+#define CAN_TELL(X) (false && (X))
+#endif
+
 /* Until we can use STATIC_ASSERT.  */
 #define STATIC_ASSERT(X) \
   typedef int assertion1[(X) ? 1 : -1] ATTRIBUTE_UNUSED
Index: gcc/wide-int.h
===================================================================
--- gcc/wide-int.h	2013-10-27 14:25:19.144723977 +0000
+++ gcc/wide-int.h	2013-10-27 14:37:34.834443832 +0000
@@ -1495,6 +1495,7 @@ wi::eq_p (const T1 &x, const T2 &y)
   WIDE_INT_REF_FOR (T2) yi (y, precision);
   if (xi.is_sign_extended && yi.is_sign_extended)
     {
+      /* This case reduces to array equality.  */
       if (xi.len != yi.len)
 	return false;
       unsigned int i = 0;
@@ -1504,10 +1505,21 @@ wi::eq_p (const T1 &x, const T2 &y)
       while (++i != xi.len);
       return true;
     }
-  if (precision <= HOST_BITS_PER_WIDE_INT)
+  if (yi.len == 1)
     {
-      unsigned HOST_WIDE_INT diff = xi.ulow () ^ yi.ulow ();
-      return (diff << (-precision % HOST_BITS_PER_WIDE_INT)) == 0;
+      /* XI is only equal to YI if it too has a single HWI.  */
+      if (xi.len != 1)
+	return false;
+      /* Excess bits in xi.val[0] will be signs or zeros, so comparisons
+	 with 0 are simple.  */
+      if (CAN_TELL (yi.val[0] == 0))
+	return xi.val[0] == 0;
+      /* Otherwise flush out any excess bits first.  */
+      unsigned HOST_WIDE_INT diff = xi.val[0] ^ yi.val[0];
+      int excess = HOST_BITS_PER_WIDE_INT - precision;
+      if (excess > 0)
+	diff <<= excess;
+      return diff == 0;
     }
   return eq_p_large (xi.val, xi.len, yi.val, yi.len, precision);
 }
@@ -1528,20 +1540,28 @@ wi::lts_p (const T1 &x, const T2 &y)
   unsigned int precision = get_binary_precision (x, y);
   WIDE_INT_REF_FOR (T1) xi (x, precision);
   WIDE_INT_REF_FOR (T2) yi (y, precision);
-  // We optimize x < y, where y is 64 or fewer bits.
+  /* We optimize x < y, where y is 64 or fewer bits.  */
   if (wi::fits_shwi_p (yi))
     {
-      // If x fits directly into a shwi, we can compare directly.
+      /* Make lts_p (x, 0) as efficient as wi::neg_p (x).  */
+      if (CAN_TELL (yi.val[0] == 0))
+	return neg_p (xi);
+      /* If x fits directly into a shwi, we can compare directly.  */
       if (wi::fits_shwi_p (xi))
 	return xi.to_shwi () < yi.to_shwi ();
-      // If x doesn't fit and is negative, then it must be more
-      // negative than any value in y, and hence smaller than y.
-      if (neg_p (xi, SIGNED))
+      /* If x doesn't fit and is negative, then it must be more
+	 negative than any value in y, and hence smaller than y.  */
+      if (neg_p (xi))
 	return true;
-      // If x is positive, then it must be larger than any value in y,
-      // and hence greater than y.
+      /* If x is positive, then it must be larger than any value in y,
+	 and hence greater than y.  */
      return false;
     }
+  /* Optimize the opposite case, if it can be detected at compile time.  */
+  if (CAN_TELL (xi.len == 1))
+    /* If YI is negative it is lower than the least HWI.
+       If YI is positive it is greater than the greatest HWI.  */
+    return !neg_p (yi);
   return lts_p_large (xi.val, xi.len, precision, yi.val, yi.len);
 }
 
@@ -1553,6 +1573,12 @@ wi::ltu_p (const T1 &x, const T2 &y)
   unsigned int precision = get_binary_precision (x, y);
   WIDE_INT_REF_FOR (T1) xi (x, precision);
   WIDE_INT_REF_FOR (T2) yi (y, precision);
+  /* Optimize comparisons with constants and with sub-HWI unsigned
+     integers.  */
+  if (CAN_TELL (yi.len == 1 && yi.val[0] >= 0))
+    return xi.len == 1 && xi.to_uhwi () < (unsigned HOST_WIDE_INT) yi.val[0];
+  if (CAN_TELL (xi.len == 1 && xi.val[0] >= 0))
+    return yi.len != 1 || yi.to_uhwi () > (unsigned HOST_WIDE_INT) xi.val[0];
   if (precision <= HOST_BITS_PER_WIDE_INT)
     {
       unsigned HOST_WIDE_INT xl = xi.to_uhwi ();