Message ID: 874n82rcpn.fsf@talisman.default
State: New
On Sun, 27 Oct 2013, Richard Sandiford wrote:
> This patch adds some more optimisations to the wi:: comparison functions.
> It uses the:
>
>   #define CONSTANT(X) (__builtin_constant_p (X) && (X))
>
> idiom that was mentioned before, except that I thought CONSTANT would be
> too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
> Better names welcome.

STATIC_CONSTANT_P similar to static_assert?

> The changes are:
>
> - Add a fast path to eq_p for when one of the inputs isn't sign-extended.
>   This includes code to handle compile-time 0 specially.
>
> - Add the opposite optimisation to Mike's lts_p change, if we can tell at
>   compile time that it applies.

I think the cases Mike added should only be enabled when we can figure
them out at compile-time, too.

Ok with that change.

Thanks,
Richard.

> - Add fast paths to ltu_p for constants.
>
> E.g.:
>
> bool
> f1 (const_tree x)
> {
>   return wi::eq_p (x, 0);
> }
>
> now gives:
>
>         xorl    %eax, %eax
>         cmpw    $1, 4(%rdi)
>         je      .L5
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L5:
>         cmpq    $0, 16(%rdi)
>         sete    %al
>         ret
>
> bool
> f2 (const_tree x, HOST_WIDE_INT y)
> {
>   return wi::eq_p (x, y);
> }
>
> gives:
>
>         movq    8(%rdi), %rax
>         movzwl  52(%rax), %edx
>         xorl    %eax, %eax
>         andw    $1023, %dx
>         cmpw    $1, 4(%rdi)
>         je      .L10
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L10:
>         xorq    16(%rdi), %rsi
>         movzwl  %dx, %edx
>         movl    $64, %ecx
>         subl    %edx, %ecx
>         movq    %rsi, %rax
>         salq    %cl, %rax
>         testl   %ecx, %ecx
>         cmovg   %rax, %rsi
>         testq   %rsi, %rsi
>         sete    %al
>         ret
>
> bool
> f3 (HOST_WIDE_INT x, const_tree y)
> {
>   return wi::lts_p (x, y);
> }
>
> is similarly ugly because of the way that it ignores TYPE_SIGN and so
> has to explicitly sign-extend "small-prec" cases:
>
>         movq    8(%rsi), %rax
>         movzwl  4(%rsi), %ecx
>         movzwl  52(%rax), %edx
>         andl    $1023, %edx
>         cmpl    $1, %ecx
>         je      .L16
>         leal    -1(%rcx), %eax
>         sall    $6, %ecx
>         subl    %edx, %ecx
>         movq    16(%rsi,%rax,8), %rax
>         movq    %rax, %rdx
>         salq    %cl, %rdx
>         testl   %ecx, %ecx
>         cmovg   %rdx, %rax
>         sarq    $63, %rax
>         addl    $1, %eax
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L16:
>         cmpl    $63, %edx
>         movq    16(%rsi), %rax
>         ja      .L13
>         movb    $64, %cl
>         subl    %edx, %ecx
>         salq    %cl, %rax
>         sarq    %cl, %rax
> .L13:
>         cmpq    %rdi, %rax
>         setg    %al
>         ret
>
> but:
>
> bool
> f4 (HOST_WIDE_INT x, const_tree y)
> {
>   return wi::lts_p (x, wi::to_widest (y));
> }
>
> is a bit more respectable:
>
>         movzwl  6(%rsi), %eax
>         cmpl    $1, %eax
>         je      .L20
>         subl    $1, %eax
>         movq    16(%rsi,%rax,8), %rax
>         sarq    $63, %rax
>         addl    $1, %eax
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L20:
>         cmpq    %rdi, 16(%rsi)
>         setg    %al
>         ret
>
> For similar reasons:
>
> bool
> f5 (const_tree x)
> {
>   return wi::ltu_p (x, 100);
> }
>
> gives:
>
>         movq    8(%rdi), %rax
>         movzwl  52(%rax), %ecx
>         xorl    %eax, %eax
>         andw    $1023, %cx
>         cmpw    $1, 4(%rdi)
>         je      .L26
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L26:
>         cmpw    $63, %cx
>         ja      .L23
>         movl    $1, %eax
>         salq    %cl, %rax
>         subq    $1, %rax
>         andq    16(%rdi), %rax
> .L24:
>         cmpq    $99, %rax
>         setbe   %al
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L23:
>         movq    16(%rdi), %rax
>         jmp     .L24
>
> but:
>
> bool
> f6 (const_tree x)
> {
>   return wi::ltu_p (wi::to_widest (x), 100);
> }
>
> gives:
>
>         xorl    %eax, %eax
>         cmpw    $1, 6(%rdi)
>         je      .L30
>         rep ret
>         .p2align 4,,10
>         .p2align 3
> .L30:
>         cmpq    $99, 16(%rdi)
>         setbe   %al
>         ret
>
> Tested on powerpc64-linux-gnu and x86_64-linux-gnu.  OK for wide-int?
>
> Thanks,
> Richard
>
>
> Index: gcc/system.h
> ===================================================================
> --- gcc/system.h	2013-10-27 14:25:19.144723977 +0000
> +++ gcc/system.h	2013-10-27 14:25:20.716738045 +0000
> @@ -711,6 +711,12 @@ #define gcc_unreachable() __builtin_unre
>  #define gcc_unreachable() (fancy_abort (__FILE__, __LINE__, __FUNCTION__))
>  #endif
>  
> +#if GCC_VERSION >= 3001
> +#define CAN_TELL(X) (__builtin_constant_p (X) && (X))
> +#else
> +#define CAN_TELL(X) (false && (X))
> +#endif
> +
>  /* Until we can use STATIC_ASSERT.  */
>  #define STATIC_ASSERT(X) \
>    typedef int assertion1[(X) ? 1 : -1] ATTRIBUTE_UNUSED
> Index: gcc/wide-int.h
> ===================================================================
> --- gcc/wide-int.h	2013-10-27 14:25:19.144723977 +0000
> +++ gcc/wide-int.h	2013-10-27 14:37:34.834443832 +0000
> @@ -1495,6 +1495,7 @@ wi::eq_p (const T1 &x, const T2 &y)
>    WIDE_INT_REF_FOR (T2) yi (y, precision);
>    if (xi.is_sign_extended && yi.is_sign_extended)
>      {
> +      /* This case reduces to array equality.  */
>        if (xi.len != yi.len)
>  	return false;
>        unsigned int i = 0;
> @@ -1504,10 +1505,21 @@ wi::eq_p (const T1 &x, const T2 &y)
>        while (++i != xi.len);
>        return true;
>      }
> -  if (precision <= HOST_BITS_PER_WIDE_INT)
> +  if (yi.len == 1)
>      {
> -      unsigned HOST_WIDE_INT diff = xi.ulow () ^ yi.ulow ();
> -      return (diff << (-precision % HOST_BITS_PER_WIDE_INT)) == 0;
> +      /* XI is only equal to YI if it too has a single HWI.  */
> +      if (xi.len != 1)
> +	return false;
> +      /* Excess bits in xi.val[0] will be signs or zeros, so comparisons
> +	 with 0 are simple.  */
> +      if (CAN_TELL (yi.val[0] == 0))
> +	return xi.val[0] == 0;
> +      /* Otherwise flush out any excess bits first.  */
> +      unsigned HOST_WIDE_INT diff = xi.val[0] ^ yi.val[0];
> +      int excess = HOST_BITS_PER_WIDE_INT - precision;
> +      if (excess > 0)
> +	diff <<= excess;
> +      return diff == 0;
>      }
>    return eq_p_large (xi.val, xi.len, yi.val, yi.len, precision);
>  }
> @@ -1528,20 +1540,28 @@ wi::lts_p (const T1 &x, const T2 &y)
>    unsigned int precision = get_binary_precision (x, y);
>    WIDE_INT_REF_FOR (T1) xi (x, precision);
>    WIDE_INT_REF_FOR (T2) yi (y, precision);
> -  // We optimize x < y, where y is 64 or fewer bits.
> +  /* We optimize x < y, where y is 64 or fewer bits.  */
>    if (wi::fits_shwi_p (yi))
>      {
> -      // If x fits directly into a shwi, we can compare directly.
> +      /* Make lts_p (x, 0) as efficient as wi::neg_p (x).  */
> +      if (CAN_TELL (yi.val[0] == 0))
> +	return neg_p (xi);
> +      /* If x fits directly into a shwi, we can compare directly.  */
>        if (wi::fits_shwi_p (xi))
>  	return xi.to_shwi () < yi.to_shwi ();
> -      // If x doesn't fit and is negative, then it must be more
> -      // negative than any value in y, and hence smaller than y.
> -      if (neg_p (xi, SIGNED))
> +      /* If x doesn't fit and is negative, then it must be more
> +	 negative than any value in y, and hence smaller than y.  */
> +      if (neg_p (xi))
>  	return true;
> -      // If x is positive, then it must be larger than any value in y,
> -      // and hence greater than y.
> +      /* If x is positive, then it must be larger than any value in y,
> +	 and hence greater than y.  */
>        return false;
>      }
> +  /* Optimize the opposite case, if it can be detected at compile time.  */
> +  if (CAN_TELL (xi.len == 1))
> +    /* If YI is negative it is lower than the least HWI.
> +       If YI is positive it is greater than the greatest HWI.  */
> +    return !neg_p (yi);
>    return lts_p_large (xi.val, xi.len, precision, yi.val, yi.len);
>  }
>  
> @@ -1553,6 +1573,12 @@ wi::ltu_p (const T1 &x, const T2 &y)
>    unsigned int precision = get_binary_precision (x, y);
>    WIDE_INT_REF_FOR (T1) xi (x, precision);
>    WIDE_INT_REF_FOR (T2) yi (y, precision);
> +  /* Optimize comparisons with constants and with sub-HWI unsigned
> +     integers.  */
> +  if (CAN_TELL (yi.len == 1 && yi.val[0] >= 0))
> +    return xi.len == 1 && xi.to_uhwi () < (unsigned HOST_WIDE_INT) yi.val[0];
> +  if (CAN_TELL (xi.len == 1 && xi.val[0] >= 0))
> +    return yi.len != 1 || yi.to_uhwi () > (unsigned HOST_WIDE_INT) xi.val[0];
>    if (precision <= HOST_BITS_PER_WIDE_INT)
>      {
>        unsigned HOST_WIDE_INT xl = xi.to_uhwi ();
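As a standalone illustration of the __builtin_constant_p idiom being named
here, a minimal sketch with invented names (eq and eq_large are not from
the patch; only the macro body matches it):

#if defined (__GNUC__)
#define STATIC_CONSTANT_P(X) (__builtin_constant_p (X) && (X))
#else
#define STATIC_CONSTANT_P(X) (false && (X))
#endif

/* Hypothetical out-of-line general case.  */
extern bool eq_large (long x, long y);

inline bool
eq (long x, long y)
{
  /* For a call such as eq (x, 0), GCC proves y == 0 constant, the guard
     folds to true, and only the cheap test survives.  For a variable y,
     __builtin_constant_p yields 0, the guard folds to false, and this
     branch disappears, costing nothing at run time.  */
  if (STATIC_CONSTANT_P (y == 0))
    return x == 0;
  return eq_large (x, y);
}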
Richard Biener <rguenther@suse.de> writes:
> On Sun, 27 Oct 2013, Richard Sandiford wrote:
>> This patch adds some more optimisations to the wi:: comparison functions.
>> It uses the:
>>
>>   #define CONSTANT(X) (__builtin_constant_p (X) && (X))
>>
>> idiom that was mentioned before, except that I thought CONSTANT would be
>> too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
>> Better names welcome.
>
> STATIC_CONSTANT_P similar to static_assert?

Sounds good.

>> The changes are:
>>
>> - Add a fast path to eq_p for when one of the inputs isn't sign-extended.
>>   This includes code to handle compile-time 0 specially.
>>
>> - Add the opposite optimisation to Mike's lts_p change, if we can tell at
>>   compile time that it applies.
>
> I think the cases Mike added should only be enabled when we can figure
> them out at compile-time, too.

Well, the idea with most of these functions was to handle the simple
"everything is one HWI" cases inline.  For operations like addition this
needs to be based on the precision, since that's the only cheap way of
knowing whether the result is also a single HWI.  But for logic operations
like AND and OR, it can be based on whether the inputs are single HWIs,
since single-HWI inputs give a single-HWI output.  This catches more cases.

lts_p is a bit like AND and OR in this respect.  fits_shwi_p is the
same as "len == 1", which is simple to check and is often true.  If we
restrict the optimisation to STATIC_CONSTANT_P we'll end up using the
out-of-line comparison for all TREE_INT_CST_LTs, even the int-int ones
that we currently handle inline.

Thanks,
Richard
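To make the "everything is one HWI" pattern concrete, here is a condensed
sketch under stated assumptions: wide_value is a hypothetical stand-in for
the real wide-int reference types, and lts/lts_large are invented names.

#include <cstdint>

/* Hypothetical stand-in for a wide-int reference: LEN is the number of
   significant host wide ints (HWIs), VAL the little-endian array.  */
struct wide_value
{
  unsigned int len;
  const int64_t *val;
};

/* Out-of-line general case; a sketch of its body appears further down
   the thread.  */
bool lts_large (const wide_value &x, const wide_value &y);

inline bool
lts (const wide_value &x, const wide_value &y)
{
  /* fits_shwi_p amounts to len == 1, so the dominant dynamic case is a
     plain signed comparison that stays inline; everything else falls
     through to the out-of-line routine.  */
  if (x.len == 1 && y.len == 1)
    return x.val[0] < y.val[0];
  return lts_large (x, y);
}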
On 10/29/2013 08:43 AM, Richard Sandiford wrote:
> Well, the idea with most of these functions was to handle the simple
> "everything is one HWI" cases inline.  [...]  If we restrict the
> optimisation to STATIC_CONSTANT_P we'll end up using the out-of-line
> comparison for all TREE_INT_CST_LTs, even the int-int ones that we
> currently handle inline.

This has always been my way of looking at it.

kenny
On 10/29/2013 08:43 AM, Richard Sandiford wrote:
> [...]
> Thanks,
> Richard

yes, this is a very boring and mechanical patch and i found nothing to
talk about.

kenny
On Oct 29, 2013, at 5:43 AM, Richard Sandiford <rsandifo@linux.vnet.ibm.com> wrote:
> Richard Biener <rguenther@suse.de> writes:
>>
>> I think the cases Mike added should only be enabled when we can figure
>> them out at compile-time, too.
>
> Well, the idea with most of these functions was to handle the simple
> "everything is one HWI" cases inline.

Yes.  The idea is that 99.99% of all cases are 1 HWI or less,
dynamically.  Inlining and handling those cases inline, provided the
code is relatively small, seemed like a win.  This win comes at the cost
of duplicative code in the wider path, as it checks the precision/length
as well, but slowing down those cases seemed reasonable given how
infrequent we expect them to be.  Now, if the slow-path code is the same
speed as the inline code would have been, certainly the duplication just
hurts.  This is why I was posting the code fragments for the fast-path
case.  I was aiming for the fast path to be really nice to help ensure
that we don't slow down compiles on narrow code at all.
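To illustrate the duplication cost described here, a rough sketch of what
the out-of-line routine might look like, continuing the hypothetical
wide_value/lts_large names from the earlier sketch (the struct is repeated
so this fragment stands alone):

#include <cstdint>

struct wide_value
{
  unsigned int len;
  const int64_t *val;
};

/* Hypothetical out-of-line comparison.  Values are assumed sign-extended;
   ">> 63" relies on the arithmetic right shift GCC performs on signed
   values.  */
bool
lts_large (const wide_value &x, const wide_value &y)
{
  /* The one-HWI test recurs here even though the inline caller already
     filtered it; this is the kind of duplication in the wider path
     described above.  */
  if (x.len == 1 && y.len == 1)
    return x.val[0] < y.val[0];

  /* General case: compare from the most significant HWI downwards,
     sign-extending the shorter operand.  The top HWI compares signed,
     the remaining HWIs compare unsigned.  */
  unsigned int len = x.len > y.len ? x.len : y.len;
  for (unsigned int i = len; i-- > 0;)
    {
      int64_t xw = i < x.len ? x.val[i] : x.val[x.len - 1] >> 63;
      int64_t yw = i < y.len ? y.val[i] : y.val[y.len - 1] >> 63;
      if (xw != yw)
	return i == len - 1 ? xw < yw : (uint64_t) xw < (uint64_t) yw;
    }
  return false;
}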
On Tue, 29 Oct 2013, Mike Stump wrote:

> On Oct 29, 2013, at 5:43 AM, Richard Sandiford <rsandifo@linux.vnet.ibm.com> wrote:
> > Richard Biener <rguenther@suse.de> writes:
> >> I think the cases Mike added should only be enabled when we can figure
> >> them out at compile-time, too.
> >
> > Well, the idea with most of these functions was to handle the simple
> > "everything is one HWI" cases inline.
>
> Yes.  The idea is that 99.99% of all cases are 1 HWI or less,
> dynamically.  [...]

Handling the len == 1 case inline is fine; just don't add too many fast
special cases (mixed len == 1, len > 1, special known-extended cases,
etc.) that are not decidable at compile time, as branchy code is both
hard to optimize and puts load on the I-cache and the branch predictor.

Richard.
Index: gcc/system.h
===================================================================
--- gcc/system.h	2013-10-27 14:25:19.144723977 +0000
+++ gcc/system.h	2013-10-27 14:25:20.716738045 +0000
@@ -711,6 +711,12 @@ #define gcc_unreachable() __builtin_unre
 #define gcc_unreachable() (fancy_abort (__FILE__, __LINE__, __FUNCTION__))
 #endif
 
+#if GCC_VERSION >= 3001
+#define CAN_TELL(X) (__builtin_constant_p (X) && (X))
+#else
+#define CAN_TELL(X) (false && (X))
+#endif
+
 /* Until we can use STATIC_ASSERT.  */
 #define STATIC_ASSERT(X) \
   typedef int assertion1[(X) ? 1 : -1] ATTRIBUTE_UNUSED
Index: gcc/wide-int.h
===================================================================
--- gcc/wide-int.h	2013-10-27 14:25:19.144723977 +0000
+++ gcc/wide-int.h	2013-10-27 14:37:34.834443832 +0000
@@ -1495,6 +1495,7 @@ wi::eq_p (const T1 &x, const T2 &y)
   WIDE_INT_REF_FOR (T2) yi (y, precision);
   if (xi.is_sign_extended && yi.is_sign_extended)
     {
+      /* This case reduces to array equality.  */
       if (xi.len != yi.len)
 	return false;
       unsigned int i = 0;
@@ -1504,10 +1505,21 @@ wi::eq_p (const T1 &x, const T2 &y)
       while (++i != xi.len);
       return true;
     }
-  if (precision <= HOST_BITS_PER_WIDE_INT)
+  if (yi.len == 1)
     {
-      unsigned HOST_WIDE_INT diff = xi.ulow () ^ yi.ulow ();
-      return (diff << (-precision % HOST_BITS_PER_WIDE_INT)) == 0;
+      /* XI is only equal to YI if it too has a single HWI.  */
+      if (xi.len != 1)
+	return false;
+      /* Excess bits in xi.val[0] will be signs or zeros, so comparisons
+	 with 0 are simple.  */
+      if (CAN_TELL (yi.val[0] == 0))
+	return xi.val[0] == 0;
+      /* Otherwise flush out any excess bits first.  */
+      unsigned HOST_WIDE_INT diff = xi.val[0] ^ yi.val[0];
+      int excess = HOST_BITS_PER_WIDE_INT - precision;
+      if (excess > 0)
+	diff <<= excess;
+      return diff == 0;
     }
   return eq_p_large (xi.val, xi.len, yi.val, yi.len, precision);
 }
@@ -1528,20 +1540,28 @@ wi::lts_p (const T1 &x, const T2 &y)
   unsigned int precision = get_binary_precision (x, y);
   WIDE_INT_REF_FOR (T1) xi (x, precision);
   WIDE_INT_REF_FOR (T2) yi (y, precision);
-  // We optimize x < y, where y is 64 or fewer bits.
+  /* We optimize x < y, where y is 64 or fewer bits.  */
   if (wi::fits_shwi_p (yi))
     {
-      // If x fits directly into a shwi, we can compare directly.
+      /* Make lts_p (x, 0) as efficient as wi::neg_p (x).  */
+      if (CAN_TELL (yi.val[0] == 0))
+	return neg_p (xi);
+      /* If x fits directly into a shwi, we can compare directly.  */
       if (wi::fits_shwi_p (xi))
 	return xi.to_shwi () < yi.to_shwi ();
-      // If x doesn't fit and is negative, then it must be more
-      // negative than any value in y, and hence smaller than y.
-      if (neg_p (xi, SIGNED))
+      /* If x doesn't fit and is negative, then it must be more
+	 negative than any value in y, and hence smaller than y.  */
+      if (neg_p (xi))
 	return true;
-      // If x is positive, then it must be larger than any value in y,
-      // and hence greater than y.
+      /* If x is positive, then it must be larger than any value in y,
+	 and hence greater than y.  */
      return false;
     }
+  /* Optimize the opposite case, if it can be detected at compile time.  */
+  if (CAN_TELL (xi.len == 1))
+    /* If YI is negative it is lower than the least HWI.
+       If YI is positive it is greater than the greatest HWI.  */
+    return !neg_p (yi);
   return lts_p_large (xi.val, xi.len, precision, yi.val, yi.len);
 }
 
@@ -1553,6 +1573,12 @@ wi::ltu_p (const T1 &x, const T2 &y)
   unsigned int precision = get_binary_precision (x, y);
   WIDE_INT_REF_FOR (T1) xi (x, precision);
   WIDE_INT_REF_FOR (T2) yi (y, precision);
+  /* Optimize comparisons with constants and with sub-HWI unsigned
+     integers.  */
+  if (CAN_TELL (yi.len == 1 && yi.val[0] >= 0))
+    return xi.len == 1 && xi.to_uhwi () < (unsigned HOST_WIDE_INT) yi.val[0];
+  if (CAN_TELL (xi.len == 1 && xi.val[0] >= 0))
+    return yi.len != 1 || yi.to_uhwi () > (unsigned HOST_WIDE_INT) xi.val[0];
   if (precision <= HOST_BITS_PER_WIDE_INT)
     {
       unsigned HOST_WIDE_INT xl = xi.to_uhwi ();