
[wide-int] More optimisations

Message ID 874n82rcpn.fsf@talisman.default
State New

Commit Message

Richard Sandiford Oct. 27, 2013, 2:49 p.m. UTC
This patch adds some more optimisations to the wi:: comparison functions.
It uses the:

  #define CONSTANT(X) (__builtin_constant_p (X) && (X))

idiom that was mentioned before, except that I thought CONSTANT would be
too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
Better names welcome.
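
To be clear about why the "&& (X)" part is safe: __builtin_constant_p
only returns nonzero when GCC can fold its argument to a constant, so
for run-time values the whole test folds to false and the guarded code
simply disappears.  A made-up illustration (not part of the patch):

  #define CAN_TELL(X) (__builtin_constant_p (X) && (X))

  int
  example (int runtime_value)
  {
    /* sizeof is a compile-time constant, so this condition folds to a
       constant (true on hosts where int is 4 bytes).  */
    if (CAN_TELL (sizeof (int) == 4))
      return 1;
    /* runtime_value isn't known at compile time here, so
       __builtin_constant_p is 0 and the condition folds to false.  */
    if (CAN_TELL (runtime_value == 0))
      return 2;
    return 3;
  }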

The changes are:

- Add a fast path to eq_p for when one of the inputs isn't sign-extended.
  This includes code to handle compile-time 0 specially.

- Add the opposite optimisation to Mike's lts_p change, if we can tell at
  compile time that it applies.

- Add fast paths to ltu_p for constants.

E.g.:

  bool
  f1 (const_tree x)
  {
    return wi::eq_p (x, 0);
  }

now gives:

        xorl    %eax, %eax
        cmpw    $1, 4(%rdi)
        je      .L5
        rep ret
        .p2align 4,,10
        .p2align 3
.L5:
        cmpq    $0, 16(%rdi)
        sete    %al
        ret

  bool
  f2 (const_tree x, HOST_WIDE_INT y)
  {
    return wi::eq_p (x, y);
  }

gives:

        movq    8(%rdi), %rax
        movzwl  52(%rax), %edx
        xorl    %eax, %eax
        andw    $1023, %dx
        cmpw    $1, 4(%rdi)
        je      .L10
        rep ret
        .p2align 4,,10
        .p2align 3
.L10:
        xorq    16(%rdi), %rsi
        movzwl  %dx, %edx
        movl    $64, %ecx
        subl    %edx, %ecx
        movq    %rsi, %rax
        salq    %cl, %rax
        testl   %ecx, %ecx
        cmovg   %rax, %rsi
        testq   %rsi, %rsi
        sete    %al
        ret

  bool
  f3 (HOST_WIDE_INT x, const_tree y)
  {
    return wi::lts_p (x, y);
  }

is similarly ugly because of the way that it ignores TYPE_SIGN and so has to
explicitly sign-extend "small-prec" cases:

        movq    8(%rsi), %rax
        movzwl  4(%rsi), %ecx
        movzwl  52(%rax), %edx
        andl    $1023, %edx
        cmpl    $1, %ecx
        je      .L16
        leal    -1(%rcx), %eax
        sall    $6, %ecx
        subl    %edx, %ecx
        movq    16(%rsi,%rax,8), %rax
        movq    %rax, %rdx
        salq    %cl, %rdx
        testl   %ecx, %ecx
        cmovg   %rdx, %rax
        sarq    $63, %rax
        addl    $1, %eax
        ret
        .p2align 4,,10
        .p2align 3
.L16:
        cmpl    $63, %edx
        movq    16(%rsi), %rax
        ja      .L13
        movb    $64, %cl
        subl    %edx, %ecx
        salq    %cl, %rax
        sarq    %cl, %rax
.L13:
        cmpq    %rdi, %rax
        setg    %al
        ret

but:

  bool
  f4 (HOST_WIDE_INT x, const_tree y)
  {
    return wi::lts_p (x, wi::to_widest (y));
  }

is a bit more respectable:

        movzwl  6(%rsi), %eax
        cmpl    $1, %eax
        je      .L20
        subl    $1, %eax
        movq    16(%rsi,%rax,8), %rax
        sarq    $63, %rax
        addl    $1, %eax
        ret
        .p2align 4,,10
        .p2align 3
.L20:
        cmpq    %rdi, 16(%rsi)
        setg    %al
        ret

For similar reasons:

  bool
  f5 (const_tree x)
  {
    return wi::ltu_p (x, 100);
  }

gives:

        movq    8(%rdi), %rax
        movzwl  52(%rax), %ecx
        xorl    %eax, %eax
        andw    $1023, %cx
        cmpw    $1, 4(%rdi)
        je      .L26
        rep ret
        .p2align 4,,10
        .p2align 3
.L26:
        cmpw    $63, %cx
        ja      .L23
        movl    $1, %eax
        salq    %cl, %rax
        subq    $1, %rax
        andq    16(%rdi), %rax
.L24:
        cmpq    $99, %rax
        setbe   %al
        ret
        .p2align 4,,10
        .p2align 3
.L23:
        movq    16(%rdi), %rax
        jmp     .L24

but:

  bool
  f6 (const_tree x)
  {
    return wi::ltu_p (wi::to_widest (x), 100);
  }

gives:

        xorl    %eax, %eax
        cmpw    $1, 6(%rdi)
        je      .L30
        rep ret
        .p2align 4,,10
        .p2align 3
.L30:
        cmpq    $99, 16(%rdi)
        setbe   %al
        ret

Tested on powerpc64-linux-gnu and x86_64-linux-gnu.  OK for wide-int?

Thanks,
Richard

Comments

Richard Biener Oct. 29, 2013, 12:20 p.m. UTC | #1
On Sun, 27 Oct 2013, Richard Sandiford wrote:

> This patch adds some more optimisations to the wi:: comparison functions.
> It uses the:
> 
>   #define CONSTANT(X) (__builtin_constant_p (X) && (X))
> 
> idiom that was mentioned before, except that I thought CONSTANT would be
> too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
> Better names welcome.

STATIC_CONSTANT_P similar to static_assert?

> The changes are:
> 
> - Add a fast path to eq_p for when one of the inputs isn't sign-extended.
>   This includes code to handle compile-time 0 specially.
> 
> - Add the opposite optimisation to Mike's lts_p change, if we can tell at
>   compile time that it applies.

I think the cases Mike added should only be enabled when we can figure
them out at compile-time, too.

Ok with that change.

Thanks,
Richard.

> - Add fast paths to ltu_p for constants.
> 
> E.g.:
> 
>   bool
>   f1 (const_tree x)
>   {
>     return wi::eq_p (x, 0);
>   }
> 
> now gives:
> 
>         xorl    %eax, %eax
>         cmpw    $1, 4(%rdi)
>         je      .L5
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L5:
>         cmpq    $0, 16(%rdi)
>         sete    %al
>         ret
> 
>   bool
>   f2 (const_tree x, HOST_WIDE_INT y)
>   {
>     return wi::eq_p (x, y);
>   }
> 
> gives:
> 
>         movq    8(%rdi), %rax
>         movzwl  52(%rax), %edx
>         xorl    %eax, %eax
>         andw    $1023, %dx
>         cmpw    $1, 4(%rdi)
>         je      .L10
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L10:
>         xorq    16(%rdi), %rsi
>         movzwl  %dx, %edx
>         movl    $64, %ecx
>         subl    %edx, %ecx
>         movq    %rsi, %rax
>         salq    %cl, %rax
>         testl   %ecx, %ecx
>         cmovg   %rax, %rsi
>         testq   %rsi, %rsi
>         sete    %al
>         ret
> 
>   bool
>   f3 (HOST_WIDE_INT x, const_tree y)
>   {
>     return wi::lts_p (x, y);
>   }
> 
> is similarly ugly because of the way that it ignores TYPE_SIGN and so has to
> explicitly sign-extend "small-prec" cases:
> 
>         movq    8(%rsi), %rax
>         movzwl  4(%rsi), %ecx
>         movzwl  52(%rax), %edx
>         andl    $1023, %edx
>         cmpl    $1, %ecx
>         je      .L16
>         leal    -1(%rcx), %eax
>         sall    $6, %ecx
>         subl    %edx, %ecx
>         movq    16(%rsi,%rax,8), %rax
>         movq    %rax, %rdx
>         salq    %cl, %rdx
>         testl   %ecx, %ecx
>         cmovg   %rdx, %rax
>         sarq    $63, %rax
>         addl    $1, %eax
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L16:
>         cmpl    $63, %edx
>         movq    16(%rsi), %rax
>         ja      .L13
>         movb    $64, %cl
>         subl    %edx, %ecx
>         salq    %cl, %rax
>         sarq    %cl, %rax
> .L13:
>         cmpq    %rdi, %rax
>         setg    %al
>         ret
> 
> but:
> 
>   bool
>   f4 (HOST_WIDE_INT x, const_tree y)
>   {
>     return wi::lts_p (x, wi::to_widest (y));
>   }
> 
> is a bit more respectable:
> 
>         movzwl  6(%rsi), %eax
>         cmpl    $1, %eax
>         je      .L20
>         subl    $1, %eax
>         movq    16(%rsi,%rax,8), %rax
>         sarq    $63, %rax
>         addl    $1, %eax
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L20:
>         cmpq    %rdi, 16(%rsi)
>         setg    %al
>         ret
> 
> For similar reasons:
> 
>   bool
>   f5 (const_tree x)
>   {
>     return wi::ltu_p (x, 100);
>   }
> 
> gives:
> 
>         movq    8(%rdi), %rax
>         movzwl  52(%rax), %ecx
>         xorl    %eax, %eax
>         andw    $1023, %cx
>         cmpw    $1, 4(%rdi)
>         je      .L26
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L26:
>         cmpw    $63, %cx
>         ja      .L23
>         movl    $1, %eax
>         salq    %cl, %rax
>         subq    $1, %rax
>         andq    16(%rdi), %rax
> .L24:
>         cmpq    $99, %rax
>         setbe   %al
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L23:
>         movq    16(%rdi), %rax
>         jmp     .L24
> 
> but:
> 
>   bool
>   f6 (const_tree x)
>   {
>     return wi::ltu_p (wi::to_widest (x), 100);
>   }
> 
> gives:
> 
>         xorl    %eax, %eax
>         cmpw    $1, 6(%rdi)
>         je      .L30
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L30:
>         cmpq    $99, 16(%rdi)
>         setbe   %al
>         ret
> 
> Tested on powerpc64-linux-gnu and x86_64-linux-gnu.  OK for wide-int?
> 
> Thanks,
> Richard
> 
> 
> Index: gcc/system.h
> ===================================================================
> --- gcc/system.h	2013-10-27 14:25:19.144723977 +0000
> +++ gcc/system.h	2013-10-27 14:25:20.716738045 +0000
> @@ -711,6 +711,12 @@ #define gcc_unreachable() __builtin_unre
>  #define gcc_unreachable() (fancy_abort (__FILE__, __LINE__, __FUNCTION__))
>  #endif
>  
> +#if GCC_VERSION >= 3001
> +#define CAN_TELL(X) (__builtin_constant_p (X) && (X))
> +#else
> +#define CAN_TELL(X) (false && (X))
> +#endif
> +
>  /* Until we can use STATIC_ASSERT.  */
>  #define STATIC_ASSERT(X) \
>    typedef int assertion1[(X) ? 1 : -1] ATTRIBUTE_UNUSED
> Index: gcc/wide-int.h
> ===================================================================
> --- gcc/wide-int.h	2013-10-27 14:25:19.144723977 +0000
> +++ gcc/wide-int.h	2013-10-27 14:37:34.834443832 +0000
> @@ -1495,6 +1495,7 @@ wi::eq_p (const T1 &x, const T2 &y)
>    WIDE_INT_REF_FOR (T2) yi (y, precision);
>    if (xi.is_sign_extended && yi.is_sign_extended)
>      {
> +      /* This case reduces to array equality.  */
>        if (xi.len != yi.len)
>  	return false;
>        unsigned int i = 0;
> @@ -1504,10 +1505,21 @@ wi::eq_p (const T1 &x, const T2 &y)
>        while (++i != xi.len);
>        return true;
>      }
> -  if (precision <= HOST_BITS_PER_WIDE_INT)
> +  if (yi.len == 1)
>      {
> -      unsigned HOST_WIDE_INT diff = xi.ulow () ^ yi.ulow ();
> -      return (diff << (-precision % HOST_BITS_PER_WIDE_INT)) == 0;
> +      /* XI is only equal to YI if it too has a single HWI.  */
> +      if (xi.len != 1)
> +	return false;
> +      /* Excess bits in xi.val[0] will be signs or zeros, so comparisons
> +	 with 0 are simple.  */
> +      if (CAN_TELL (yi.val[0] == 0))
> +	return xi.val[0] == 0;
> +      /* Otherwise flush out any excess bits first.  */
> +      unsigned HOST_WIDE_INT diff = xi.val[0] ^ yi.val[0];
> +      int excess = HOST_BITS_PER_WIDE_INT - precision;
> +      if (excess > 0)
> +	diff <<= excess;
> +      return diff == 0;
>      }
>    return eq_p_large (xi.val, xi.len, yi.val, yi.len, precision);
>  }
> @@ -1528,20 +1540,28 @@ wi::lts_p (const T1 &x, const T2 &y)
>    unsigned int precision = get_binary_precision (x, y);
>    WIDE_INT_REF_FOR (T1) xi (x, precision);
>    WIDE_INT_REF_FOR (T2) yi (y, precision);
> -  // We optimize x < y, where y is 64 or fewer bits.
> +  /* We optimize x < y, where y is 64 or fewer bits.  */
>    if (wi::fits_shwi_p (yi))
>      {
> -      // If x fits directly into a shwi, we can compare directly.
> +      /* Make lts_p (x, 0) as efficient as wi::neg_p (x).  */
> +      if (CAN_TELL (yi.val[0] == 0))
> +	return neg_p (xi);
> +      /* If x fits directly into a shwi, we can compare directly.  */
>        if (wi::fits_shwi_p (xi))
>  	return xi.to_shwi () < yi.to_shwi ();
> -      // If x doesn't fit and is negative, then it must be more
> -      // negative than any value in y, and hence smaller than y.
> -      if (neg_p (xi, SIGNED))
> +      /* If x doesn't fit and is negative, then it must be more
> +	 negative than any value in y, and hence smaller than y.  */
> +      if (neg_p (xi))
>  	return true;
> -      // If x is positive, then it must be larger than any value in y,
> -      // and hence greater than y.
> +      /* If x is positive, then it must be larger than any value in y,
> +	 and hence greater than y.  */
>        return false;
>      }
> +  /* Optimize the opposite case, if it can be detected at compile time.  */
> +  if (CAN_TELL (xi.len == 1))
> +    /* If YI is negative it is lower than the least HWI.
> +       If YI is positive it is greater than the greatest HWI.  */
> +    return !neg_p (yi);
>    return lts_p_large (xi.val, xi.len, precision, yi.val, yi.len);
>  }
>  
> @@ -1553,6 +1573,12 @@ wi::ltu_p (const T1 &x, const T2 &y)
>    unsigned int precision = get_binary_precision (x, y);
>    WIDE_INT_REF_FOR (T1) xi (x, precision);
>    WIDE_INT_REF_FOR (T2) yi (y, precision);
> +  /* Optimize comparisons with constants and with sub-HWI unsigned
> +     integers.  */
> +  if (CAN_TELL (yi.len == 1 && yi.val[0] >= 0))
> +    return xi.len == 1 && xi.to_uhwi () < (unsigned HOST_WIDE_INT) yi.val[0];
> +  if (CAN_TELL (xi.len == 1 && xi.val[0] >= 0))
> +    return yi.len != 1 || yi.to_uhwi () > (unsigned HOST_WIDE_INT) xi.val[0];
>    if (precision <= HOST_BITS_PER_WIDE_INT)
>      {
>        unsigned HOST_WIDE_INT xl = xi.to_uhwi ();
> 
>
Richard Sandiford Oct. 29, 2013, 12:43 p.m. UTC | #2
Richard Biener <rguenther@suse.de> writes:
> On Sun, 27 Oct 2013, Richard Sandiford wrote:
>> This patch adds some more optimisations to the wi:: comparison functions.
>> It uses the:
>> 
>>   #define CONSTANT(X) (__builtin_constant_p (X) && (X))
>> 
>> idiom that was mentioned before, except that I thought CONSTANT would be
>> too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
>> Better names welcome.
>
> STATIC_CONSTANT_P similar to static_assert?

Sounds good.
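
So the system.h change would become just a rename, something like
(i.e. the definition from the patch with the new name):

  #if GCC_VERSION >= 3001
  #define STATIC_CONSTANT_P(X) (__builtin_constant_p (X) && (X))
  #else
  #define STATIC_CONSTANT_P(X) (false && (X))
  #endif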

>> The changes are:
>> 
>> - Add a fast path to eq_p for when one of the inputs isn't sign-extended.
>>   This includes code to handle compile-time 0 specially.
>> 
>> - Add the opposite optimisation to Mike's lts_p change, if we can tell at
>>   compile time that it applies.
>
> I think the cases Mike added should only be enabled when we can figure
> them out at compile-time, too.

Well, the idea with most of these functions was to handle the simple
"everything is one HWI" cases inline.  For operations like addition this
needs to be based on the precision, since that's the only cheap way of
knowing whether the result is also a single HWI.  But for logic operations
like AND and OR, it can be based on whether the inputs are single HWIs,
since single-HWI inputs give a single-HWI output.  This catches more cases.

lts_p is a bit like AND and OR in this respect.  fits_shwi_p is the
same as "len == 1", which is simple to check and is often true.  If we
restrict the optimisation to STATIC_CONSTANT_P we'll end up using the
out-of-line comparison for all TREE_INT_CST_LTs, even the int-int ones
that we currently handle inline.
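
Roughly, with completely made-up standalone types rather than the real
wide-int classes (lts_p_sketch, lts_p_large_sketch and struct wide_val
are just for illustration), the shape of the trade-off is:

  #include <stdbool.h>

  typedef long long hwi;   /* stand-in for HOST_WIDE_INT */

  /* Values are stored sign-extended, LEN HWIs at a time.  */
  struct wide_val { unsigned int len; hwi val[4]; };

  extern bool lts_p_large_sketch (const struct wide_val *,
                                  const struct wide_val *);

  static inline bool
  lts_p_sketch (const struct wide_val *x, const struct wide_val *y)
  {
    /* len == 1 is a cheap run-time test and is almost always true,
       so the common single-HWI comparison stays inline.  Gating this
       on STATIC_CONSTANT_P would instead send every comparison of
       two run-time values out of line.  */
    if (x->len == 1 && y->len == 1)
      return x->val[0] < y->val[0];
    return lts_p_large_sketch (x, y);
  }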

Thanks,
Richard
Kenneth Zadeck Oct. 29, 2013, 3:21 p.m. UTC | #3
On 10/29/2013 08:43 AM, Richard Sandiford wrote:
> Richard Biener <rguenther@suse.de> writes:
>> On Sun, 27 Oct 2013, Richard Sandiford wrote:
>>> This patch adds some more optimisations to the wi:: comparison functions.
>>> It uses the:
>>>
>>>    #define CONSTANT(X) (__builtin_constant_p (X) && (X))
>>>
>>> idiom that was mentioned before, except that I thought CONSTANT would be
>>> too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
>>> Better names welcome.
>> STATIC_CONSTANT_P similar to static_assert?
> Sounds good.
>
>>> The changes are:
>>>
>>> - Add a fast path to eq_p for when one of the inputs isn't sign-extended.
>>>    This includes code to handle compile-time 0 specially.
>>>
>>> - Add the opposite optimisation to Mike's lts_p change, if we can tell at
>>>    compile time that it applies.
>> I think the cases Mike added should only be enabled when we can figure
>> them out at compile-time, too.
> Well, the idea with most of these functions was to handle the simple
> "everything is one HWI" cases inline.  For operations like addition this
> needs to be based on the precision, since that's the only cheap way of
> knowing whether the result is also a single HWI.  But for logic operations
> like AND and OR, it can be based on whether the inputs are single HWIs,
> since single-HWI inputs give a single-HWI output.  This catches more cases.
>
> lts_p is a bit like AND and OR in this respect.  fits_shwi_p is the
> same as "len == 1", which is simple to check and is often true.  If we
> restrict the optimisation to STATIC_CONSTANT_P we'll end up using the
> out-of-line comparison for all TREE_INT_CST_LTs, even the int-int ones
> that we currently handle inline.

This has always been my way of looking at it.

kenny
> Thanks,
> Richard
>
Kenneth Zadeck Oct. 30, 2013, 12:17 a.m. UTC | #4
On 10/29/2013 08:43 AM, Richard Sandiford wrote:
> Richard Biener <rguenther@suse.de> writes:
>> On Sun, 27 Oct 2013, Richard Sandiford wrote:
>>> This patch adds some more optimisations to the wi:: comparison functions.
>>> It uses the:
>>>
>>>    #define CONSTANT(X) (__builtin_constant_p (X) && (X))
>>>
>>> idiom that was mentioned before, except that I thought CONSTANT would be
>>> too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
>>> Better names welcome.
>> STATIC_CONSTANT_P similar to static_assert?
> Sounds good.
>
>>> The changes are:
>>>
>>> - Add a fast path to eq_p for when one of the inputs isn't sign-extended.
>>>    This includes code to handle compile-time 0 specially.
>>>
>>> - Add the opposite optimisation to Mike's lts_p change, if we can tell at
>>>    compile time that it applies.
>> I think the cases Mike added should only be enabled when we can figure
>> them out at compile-time, too.
> Well, the idea with most of these functions was to handle the simple
> "everything is one HWI" cases inline.  For operations like addition this
> needs to be based on the precision, since that's the only cheap way of
> knowing whether the result is also a single HWI.  But for logic operations
> like AND and OR, it can be based on whether the inputs are single HWIs,
> since single-HWI inputs give a single-HWI output.  This catches more cases.
>
> lts_p is a bit like AND and OR in this respect.  fits_shwi_p is the
> same as "len == 1", which is simple to check and is often true.  If we
> restrict the optimisation to STATIC_CONSTANT_P we'll end up using the
> out-of-line comparison for all TREE_INT_CST_LTs, even the int-int ones
> that we currently handle inline.
>
> Thanks,
> Richard
>
Yes, this is a very boring and mechanical patch and I found nothing to
talk about.

kenny
Mike Stump Oct. 30, 2013, 1:22 a.m. UTC | #5
On Oct 29, 2013, at 5:43 AM, Richard Sandiford <rsandifo@linux.vnet.ibm.com> wrote:
> Richard Biener <rguenther@suse.de> writes:
>> 
>> I think the cases Mike added should only be enabled when we can figure
>> them out at compile-time, too.
> 
> Well, the idea with most of these functions was to handle the simple
> "everything is one HWI" cases inline.

Yes.  The idea is that 99.99% of all cases are 1 HWI or less, dynamically.  Handling those cases inline, provided the code is relatively small, seemed like a win.  This win comes at the cost of duplicative code in the wider path, as it checks the precision/length as well, but slowing down those cases seemed reasonable with respect to how infrequent we expect them to be.  Now, if the slow path code is the same speed as the inline code would have been, certainly the duplication just hurts.  This is why I was posting the code fragments for the fast path case.  I was aiming for the fast path to be really nice to help ensure that we don't slow down compiles on narrow code any.
Richard Biener Oct. 30, 2013, 9:59 a.m. UTC | #6
On Tue, 29 Oct 2013, Mike Stump wrote:

> On Oct 29, 2013, at 5:43 AM, Richard Sandiford <rsandifo@linux.vnet.ibm.com> wrote:
> > Richard Biener <rguenther@suse.de> writes:
> >> 
> >> I think the cases Mike added should only be enabled when we can figure
> >> them out at compile-time, too.
> > 
> > Well, the idea with most of these functions was to handle the simple
> > "everything is one HWI" cases inline.
> 
> Yes.  The idea is that 99.99% of all cases are 1 HWI or less,
> dynamically.  Handling those cases inline, provided the code is
> relatively small, seemed like a win.  This win comes at the cost of
> duplicative code in the wider path, as it checks the precision/length
> as well, but slowing down those cases seemed reasonable with respect to
> how infrequent we expect them to be.  Now, if the slow path code is the
> same speed as the inline code would have been, certainly the duplication
> just hurts.  This is why I was posting the code fragments for the fast
> path case.  I was aiming for the fast path to be really nice to help
> ensure that we don't slow down compiles on narrow code any.

Handling the len == 1 case inline is fine; just don't add too many
fast special cases (mixed len == 1, len > 1, special known extended
cases, etc.) that are not decidable at compile time, as branchy code
is both hard to optimize and puts load on the I-cache and the branch
predictor.

Richard.

Patch

Index: gcc/system.h
===================================================================
--- gcc/system.h	2013-10-27 14:25:19.144723977 +0000
+++ gcc/system.h	2013-10-27 14:25:20.716738045 +0000
@@ -711,6 +711,12 @@  #define gcc_unreachable() __builtin_unre
 #define gcc_unreachable() (fancy_abort (__FILE__, __LINE__, __FUNCTION__))
 #endif
 
+#if GCC_VERSION >= 3001
+#define CAN_TELL(X) (__builtin_constant_p (X) && (X))
+#else
+#define CAN_TELL(X) (false && (X))
+#endif
+
 /* Until we can use STATIC_ASSERT.  */
 #define STATIC_ASSERT(X) \
   typedef int assertion1[(X) ? 1 : -1] ATTRIBUTE_UNUSED
Index: gcc/wide-int.h
===================================================================
--- gcc/wide-int.h	2013-10-27 14:25:19.144723977 +0000
+++ gcc/wide-int.h	2013-10-27 14:37:34.834443832 +0000
@@ -1495,6 +1495,7 @@  wi::eq_p (const T1 &x, const T2 &y)
   WIDE_INT_REF_FOR (T2) yi (y, precision);
   if (xi.is_sign_extended && yi.is_sign_extended)
     {
+      /* This case reduces to array equality.  */
       if (xi.len != yi.len)
 	return false;
       unsigned int i = 0;
@@ -1504,10 +1505,21 @@  wi::eq_p (const T1 &x, const T2 &y)
       while (++i != xi.len);
       return true;
     }
-  if (precision <= HOST_BITS_PER_WIDE_INT)
+  if (yi.len == 1)
     {
-      unsigned HOST_WIDE_INT diff = xi.ulow () ^ yi.ulow ();
-      return (diff << (-precision % HOST_BITS_PER_WIDE_INT)) == 0;
+      /* XI is only equal to YI if it too has a single HWI.  */
+      if (xi.len != 1)
+	return false;
+      /* Excess bits in xi.val[0] will be signs or zeros, so comparisons
+	 with 0 are simple.  */
+      if (CAN_TELL (yi.val[0] == 0))
+	return xi.val[0] == 0;
+      /* Otherwise flush out any excess bits first.  */
+      unsigned HOST_WIDE_INT diff = xi.val[0] ^ yi.val[0];
+      int excess = HOST_BITS_PER_WIDE_INT - precision;
+      if (excess > 0)
+	diff <<= excess;
+      return diff == 0;
     }
   return eq_p_large (xi.val, xi.len, yi.val, yi.len, precision);
 }
@@ -1528,20 +1540,28 @@  wi::lts_p (const T1 &x, const T2 &y)
   unsigned int precision = get_binary_precision (x, y);
   WIDE_INT_REF_FOR (T1) xi (x, precision);
   WIDE_INT_REF_FOR (T2) yi (y, precision);
-  // We optimize x < y, where y is 64 or fewer bits.
+  /* We optimize x < y, where y is 64 or fewer bits.  */
   if (wi::fits_shwi_p (yi))
     {
-      // If x fits directly into a shwi, we can compare directly.
+      /* Make lts_p (x, 0) as efficient as wi::neg_p (x).  */
+      if (CAN_TELL (yi.val[0] == 0))
+	return neg_p (xi);
+      /* If x fits directly into a shwi, we can compare directly.  */
       if (wi::fits_shwi_p (xi))
 	return xi.to_shwi () < yi.to_shwi ();
-      // If x doesn't fit and is negative, then it must be more
-      // negative than any value in y, and hence smaller than y.
-      if (neg_p (xi, SIGNED))
+      /* If x doesn't fit and is negative, then it must be more
+	 negative than any value in y, and hence smaller than y.  */
+      if (neg_p (xi))
 	return true;
-      // If x is positive, then it must be larger than any value in y,
-      // and hence greater than y.
+      /* If x is positive, then it must be larger than any value in y,
+	 and hence greater than y.  */
       return false;
     }
+  /* Optimize the opposite case, if it can be detected at compile time.  */
+  if (CAN_TELL (xi.len == 1))
+    /* If YI is negative it is lower than the least HWI.
+       If YI is positive it is greater than the greatest HWI.  */
+    return !neg_p (yi);
   return lts_p_large (xi.val, xi.len, precision, yi.val, yi.len);
 }
 
@@ -1553,6 +1573,12 @@  wi::ltu_p (const T1 &x, const T2 &y)
   unsigned int precision = get_binary_precision (x, y);
   WIDE_INT_REF_FOR (T1) xi (x, precision);
   WIDE_INT_REF_FOR (T2) yi (y, precision);
+  /* Optimize comparisons with constants and with sub-HWI unsigned
+     integers.  */
+  if (CAN_TELL (yi.len == 1 && yi.val[0] >= 0))
+    return xi.len == 1 && xi.to_uhwi () < (unsigned HOST_WIDE_INT) yi.val[0];
+  if (CAN_TELL (xi.len == 1 && xi.val[0] >= 0))
+    return yi.len != 1 || yi.to_uhwi () > (unsigned HOST_WIDE_INT) xi.val[0];
   if (precision <= HOST_BITS_PER_WIDE_INT)
     {
       unsigned HOST_WIDE_INT xl = xi.to_uhwi ();