
[RFC] Use of host vector operations in host helper functions

Message ID 53FF4D6B.2050202@twiddle.net
State New

Commit Message

Richard Henderson Aug. 28, 2014, 3:40 p.m. UTC
Most of the time, guest vector operations are rare enough that it doesn't
really matter that we implement them with a loop around integer operations.

But for target-alpha, there's one vector comparison operation that appears in
every guest string operation, and is used heavily enough that it's in the top
10 functions in the profile: cmpbge (compare bytes greater or equal).

I did some experiments where I rewrote the function using gcc's "generic"
vector types and builtin operations.  Irritatingly, gcc won't use a wider
vector insn to implement a narrower operation, so I needed to widen by hand in
order to get vectorization for SSE2.  The patch (shown at the bottom of this
page) allows very good optimization on x86_64:

0000000000000120 <helper_cmpbge>:
 120:   48 89 7c 24 e8          mov    %rdi,-0x18(%rsp)
 125:   48 b8 01 01 01 01 01    movabs $0x101010101010101,%rax
 12c:   01 01 01
 12f:   f3 0f 7e 5c 24 e8       movq   -0x18(%rsp),%xmm3
 135:   48 89 74 24 e8          mov    %rsi,-0x18(%rsp)
 13a:   f3 0f 7e 64 24 e8       movq   -0x18(%rsp),%xmm4
 140:   f3 0f 7e c3             movq   %xmm3,%xmm0
 144:   f3 0f 7e cc             movq   %xmm4,%xmm1
 148:   66 0f 6f d1             movdqa %xmm1,%xmm2
 14c:   66 0f d8 d0             psubusb %xmm0,%xmm2
 150:   66 0f ef c0             pxor   %xmm0,%xmm0
 154:   66 0f 74 c2             pcmpeqb %xmm2,%xmm0
 158:   66 0f 7f 44 24 e8       movdqa %xmm0,-0x18(%rsp)
 15e:   48 8b 54 24 e8          mov    -0x18(%rsp),%rdx
 163:   48 21 c2                and    %rax,%rdx
 166:   48 89 d0                mov    %rdx,%rax
 169:   48 c1 e8 07             shr    $0x7,%rax
 16d:   48 09 d0                or     %rdx,%rax
 170:   48 89 c2                mov    %rax,%rdx
 173:   48 c1 ea 0e             shr    $0xe,%rdx
 177:   48 09 c2                or     %rax,%rdx
 17a:   48 89 d0                mov    %rdx,%rax
 17d:   48 c1 e8 1c             shr    $0x1c,%rax
 181:   48 09 d0                or     %rdx,%rax
 184:   0f b6 c0                movzbl %al,%eax
 187:   c3                      retq

which is just about as good as you could hope for (modulo two extra movq insns).

Profiling a (guest) compilation of glibc, helper_cmpbge is reduced from 3% to
0.8% of emulation time, and from 7th to 11th in the ranking.

GCC doesn't do a half-bad job on other hosts either:

aarch64:
  b4:   4f000400        movi    v0.4s, #0x0
  b8:   4ea01c01        mov     v1.16b, v0.16b
  bc:   4e081c00        mov     v0.d[0], x0
  c0:   4e081c21        mov     v1.d[0], x1
  c4:   6e213c00        cmhs    v0.16b, v0.16b, v1.16b
  c8:   4e083c00        mov     x0, v0.d[0]
  cc:   9200c000        and     x0, x0, #0x101010101010101
  d0:   aa401c00        orr     x0, x0, x0, lsr #7
  d4:   aa403800        orr     x0, x0, x0, lsr #14
  d8:   aa407000        orr     x0, x0, x0, lsr #28
  dc:   53001c00        uxtb    w0, w0
  e0:   d65f03c0        ret

Of course aarch64 *does* have an 8-byte vector size that gcc knows how to use.
If I adjust the patch to use it, only the first two insns are eliminated --
surely not a measurable difference.
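
For reference, a minimal sketch of that 8-byte-vector adjustment, assuming
gcc's generic vectors as in the patch at the bottom (my own untested sketch,
not part of the posted patch):

    #include <stdint.h>
    #include <string.h>

    typedef uint8_t B8 __attribute__((vector_size(8)));

    uint64_t cmpbge_v8(uint64_t op1, uint64_t op2)
    {
        B8 a, b, c;
        uint64_t r;

        memcpy(&a, &op1, 8);
        memcpy(&b, &op2, 8);
        c = (B8)(a >= b);              /* 0xff where op1 byte >= op2 byte */
        memcpy(&r, &c, 8);

        r &= 0x0101010101010101ull;    /* keep one bit per byte */
        r |= r >> (8 - 1);             /* fold the bits toward the bottom byte */
        r |= r >> (16 - 2);
        r |= r >> (32 - 4);
        return r & 0xff;
    }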

power7:
  ...
  vcmpgtub 13,0,1
  vcmpequb 0,0,1
  xxlor 32,45,32
  ...


But I guess the larger question here is: how much of this should we accept?

(0) Ignore this and do nothing?

(1) No general infrastructure.  Special case this one insn with #ifdef __SSE2__
and ignore anything else.

(2) Put in just enough infrastructure to know if compiler support for general
vectors is available, and then use it ad hoc when such functions are shown to
be high on the profile?

(3) Put in more infrastructure and allow it to be used to implement most guest
vector operations, possibly tidying their implementations?



r~

Comments

Alex Bennée Sept. 13, 2014, 4:02 p.m. UTC | #1
Richard Henderson writes:

> Most of the time, guest vector operations are rare enough that it doesn't
> really matter that we implement them with a loop around integer operations.
>
> But for target-alpha, there's one vector comparison operation that appears in
> every guest string operation, and is used heavily enough that it's in the top
> 10 functions in the profile: cmpbge (compare bytes greater or equal).

For a helper function to top the profile is pretty impressive. I wonder
how it compares when you break it down by basic blocks?

> I did some experiments, where I rewrote the function using gcc's "generic"
> vector types and builtin operations.
>
> <snip>
>
> GCC doesn't do a half-bad job on other hosts either:
>
> aarch64:
>   b4:   4f000400        movi    v0.4s, #0x0
>   b8:   4ea01c01        mov     v1.16b, v0.16b
>   bc:   4e081c00        mov     v0.d[0], x0
>   c0:   4e081c21        mov     v1.d[0], x1
>   c4:   6e213c00        cmhs    v0.16b, v0.16b, v1.16b
>   c8:   4e083c00        mov     x0, v0.d[0]
>   cc:   9200c000        and     x0, x0, #0x101010101010101
>   d0:   aa401c00        orr     x0, x0, x0, lsr #7
>   d4:   aa403800        orr     x0, x0, x0, lsr #14
>   d8:   aa407000        orr     x0, x0, x0, lsr #28
>   dc:   53001c00        uxtb    w0, w0
>   e0:   d65f03c0        ret
>
> Of course aarch64 *does* have an 8-byte vector size that gcc knows how to use.
> If I adjust the patch to use it, only the first two insns are eliminated --
> surely not a measurable difference.
>
> power7:
>   ...
>   vcmpgtub 13,0,1
>   vcmpequb 0,0,1
>   xxlor 32,45,32
>   ...
>
>
> But I guess the larger question here is: how much of this should we accept?
>
> (0) Ignore this and do nothing?
>
> (1) No general infrastructure.  Special case this one insn with #ifdef __SSE2__
> and ignore anything else.

Not a big fan of special cases that are arch dependent.

> (2) Put in just enough infrastructure to know if compiler support for general
> vectors is available, and then use it ad hoc when such functions are shown to
> be high on the profile?
>
> (3) Put in more infrastructure and allow it to be used to implement most guest
> vector operations, possibly tidying their implementations?
<snip>

(4) Consider supporting generic vector operations in the TCG?

While making helper functions faster is good, I've wondered if there is
enough genericism across the various SIMD/vector operations that we could
add TCG ops to translate them? The ops could fall back to generic helper
functions using the GCC intrinsics if we know there is no decent
back-end support for them?
Kirill Batuzov Oct. 16, 2014, 8:56 a.m. UTC | #2
> (4) Consider supporting generic vector operations in the TCG?

I gave it a go and was quite happy with the result. I have implemented the add_i32x4
opcode which is addition of 128-bit vectors composed of four 32-bit integers
and used it to translate NEON vadd.i32 to SSE paddd instruction. I used ARM for
my guest because I'm familiar with this architecture and it is different from
my host.

I got a 3x speedup on my testcase:

    mov			r0, #0xb0000000
loop:
    vadd.i32    q0, q0, q1
    vadd.i32    q0, q0, q1
    vadd.i32    q0, q0, q1
    vadd.i32    q0, q0, q1
    subs        r0, r0, #1
    bne         loop

Evaluation results:

master: 25.398s
patched: 7.704s

Generated code:

IN: 
0x00008298:  f2200842      vadd.i32	q0, q0, q1
0x0000829c:  f2200842      vadd.i32	q0, q0, q1
0x000082a0:  f2200842      vadd.i32	q0, q0, q1
0x000082a4:  f2200842      vadd.i32	q0, q0, q1
<...>

OP after optimization and liveness analysis:
 ld_i32 tmp5,env,$0xfffffffffffffffc
 movi_i32 tmp6,$0x0
 brcond_i32 tmp5,tmp6,ne,$0x0
 ---- 0x8298
 add_i32x4 q0,q0,q1

 ---- 0x829c
 add_i32x4 q0,q0,q1

 ---- 0x82a0
 add_i32x4 q0,q0,q1

 ---- 0x82a4
 add_i32x4 q0,q0,q1
<...>

OUT: [size=196]
0x60442450:  mov    -0x4(%r14),%ebp
0x60442454:  test   %ebp,%ebp
0x60442456:  jne    0x60442505
0x6044245c:  movdqu 0x658(%r14),%xmm0
0x60442465:  movdqu 0x668(%r14),%xmm1
0x6044246e:  paddd  %xmm1,%xmm0
0x60442472:  paddd  %xmm1,%xmm0
0x60442476:  paddd  %xmm1,%xmm0
0x6044247a:  paddd  %xmm1,%xmm0
0x6044247e:  movdqu %xmm0,0x658(%r14)
<...>

> But for target-alpha, there's one vector comparison operation that appears in
> every guest string operation, and is used heavily enough that it's in the top
> 10 functions in the profile: cmpbge (compare bytes greater or equal).

cmpbge can be translated as follows:

cmpge_i8x8      tmp0, arg1, arg2
select_msb_i8x8 res, tmp0

where cmpge is "compare greater or equal" with the following semantics:
res[i] = <111...11> if arg1[i] >= arg2[i]
res[i] = <000...00> if arg1[i] <  arg2[i]
There is such an operation in NEON. In SSE we can emulate it with PCMPEQB,
PCMPGTB and POR.

select_msb is "select most significant bit"; in SSE this is the PMOVMSKB
instruction.
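
As a concrete illustration (my own sketch, not from any posted patch), the
whole cmpbge lowering could look like this on an SSE2 host; note that PCMPGTB
is a signed compare, so this version derives the unsigned ">=" from a
saturating subtraction instead:

    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stdint.h>

    /* Illustrative only: unsigned per-byte ">=" via (b -sat a) == 0,
       then PMOVMSKB collects the most significant bit of each byte.  */
    static uint8_t cmpbge_sse2(uint64_t op1, uint64_t op2)
    {
        __m128i a = _mm_cvtsi64_si128((int64_t)op1);
        __m128i b = _mm_cvtsi64_si128((int64_t)op2);

        /* cmpge_i8x8: 0xff where op1 byte >= op2 byte (unsigned).  */
        __m128i ge = _mm_cmpeq_epi8(_mm_subs_epu8(b, a), _mm_setzero_si128());

        /* select_msb_i8x8: gather the top bit of every byte.  */
        return _mm_movemask_epi8(ge) & 0xff;
    }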

> While making helper functions faster is good, I've wondered if there is
> enough genericism across the various SIMD/vector operations that we could
> add TCG ops to translate them? The ops could fall back to generic helper
> functions using the GCC intrinsics if we know there is no decent
> back-end support for them?

From the Valgrind experience there is enough genericism. Valgrind can translate
SSE, AltiVec and NEON instructions to vector opcodes. Most of the opcodes are
reused between instruction sets.

But keep in mind - there are a lot of vector opcodes. Much much more than
scalar ones. You can see full list in Valgrind sources (VEX/pub/libvex_ir.h).

We can reduce the amount of opcodes by converting vector element size from part
of an opcode to a constant argument. But we will lose some flexibility offered
by the TARGET_HAS_opcode macro when target has support for some sizes but not for
others. For example SSE has vector minimum for sizes i8x16, i16x8, i32x4 but
does not have one for size i64x2. 

Some implementation details and concerns.

The most problematic issue was the fact that with vector registers we have one
entity that can be accessed as both global variable and memory location. I
solved it by introducing the sync_temp opcode that instructs register allocator to
save global variable to its memory location if it is on the register. If a
variable is not on a register or memory is already coherent - no store is issued,
so performance penalty for it is minimal. Still this approach has a serious
drawback: we need to generate sync_temp explicitly. But I do not know any better
way to achieve consistency.

Note that as of this RFC I have not finished conversion of ARM guest so mixing
NEON with VFP code can cause a miscompile.

The second problem is that a backend may or may not support vector operations. We
do not want each frontend to check it on every operation. I created a wrapper that
generates vector opcode if it is supported or generates emulation code.

For add_i32x4 emulation code is generated inline. I tried to make it a helper
but got a very significant performance loss (5x slowdown). I'm not sure about
the cause but I suspect that memory was a bottleneck and extra stores needed
by calling conventions mattered a lot.
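
(For reference, the out-of-line variant amounts to something like the sketch
below -- the name and signature are hypothetical, not taken from the series.
Each such call means the guest vector has to be written back to env memory and
reloaded afterwards, which fits the suspicion above about extra stores.)

    #include <stdint.h>

    /* Hypothetical out-of-line helper for add_i32x4; illustrative only.  */
    void helper_add_i32x4(uint32_t *d, const uint32_t *a, const uint32_t *b)
    {
        int i;
        for (i = 0; i < 4; i++) {
            d[i] = a[i] + b[i];
        }
    }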

The existing constraints are good enough to express that vector registers and
general purpose registers are different and can not be used instead of each
other.

One unsolved problem is global aliasing. With general purpose registers we have
no aliasing between globals. The only example I know where registers can alias
is the x86 ah/ax/eax/rax case. They are handled as one global. With vector
registers we have NEON where an 128-bit Q register consists of two 64-bit
D registers each consisting of two 32-bit S registers. I think I'll need
to add alias list to each global listing every other global it can clobber and
then iterate over it in the optimizer. Fortunately this list will be static and not
very long.

Why I think all this is worth doing:

(1) Performance. 200% speedup is a lot. My test was specifically crafted and real
    life applications may not have that much vector operations on average, but
    there is a specific class of applications where it will matter a lot - media
    processing applications like ffmpeg.

(2) Some unification of common operations. Right now every target reimplements
    common vector operations (like vector add/sub/mul/min/compare etc.). We can
    do it once in the common TCG code.

Still there are some cons I mentioned earlier. The need to support a lot of
opcodes is the most significant in the long run I think. So before I commit my
time to conversion of more operations I'd like to hear your opinions if this
approach is acceptable and worth spending efforts.

Kirill Batuzov (7):
  tcg: add support for 128bit vector type
  tcg: store ENV global in TCGContext
  tcg: add sync_temp opcode
  tcg: add add_i32x4 opcode
  target-arm: support access to 128-bit guest registers as globals
  target-arm: use add_i32x4 opcode to handle vadd.i32 instruction
  tcg/i386: add support for vector opcodes

 target-arm/translate.c |   30 ++++++++++-
 tcg/i386/tcg-target.c  |  103 ++++++++++++++++++++++++++++++++---
 tcg/i386/tcg-target.h  |   24 ++++++++-
 tcg/tcg-op.h           |  141 ++++++++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg-opc.h          |   13 +++++
 tcg/tcg.c              |   36 +++++++++++++
 tcg/tcg.h              |   34 ++++++++++++
 7 files changed, 371 insertions(+), 10 deletions(-)
Alex Bennée Oct. 16, 2014, 10:03 a.m. UTC | #3
Kirill Batuzov <batuzovk@ispras.ru> writes:

>> (4) Consider supporting generic vector operations in the TCG?
>
> I gave it a go and was quite happy with the result. I have implemented the add_i32x4
> opcode which is addition of 128-bit vectors composed of four 32-bit integers
> and used it to translate NEON vadd.i32 to SSE paddd instruction. I used ARM for
> my guest because I'm familiar with this architecture and it is different from
> my host.
>
> I got a 3x speedup on my testcase:
<snip>
> OUT: [size=196]
> 0x60442450:  mov    -0x4(%r14),%ebp
> 0x60442454:  test   %ebp,%ebp
> 0x60442456:  jne    0x60442505
> 0x6044245c:  movdqu 0x658(%r14),%xmm0
> 0x60442465:  movdqu 0x668(%r14),%xmm1
> 0x6044246e:  paddd  %xmm1,%xmm0
> 0x60442472:  paddd  %xmm1,%xmm0
> 0x60442476:  paddd  %xmm1,%xmm0
> 0x6044247a:  paddd  %xmm1,%xmm0
> 0x6044247e:  movdqu %xmm0,0x658(%r14)
> <...>

It certainly looks promising although as I suspect you know add is a
pretty easy target ;-)

>
>> But for target-alpha, there's one vector comparison operation that appears in
>> every guest string operation, and is used heavily enough that it's in the top
>> 10 functions in the profile: cmpbge (compare bytes greater or equal).
>
> cmpbge can be translated as follows:
>
> cmpge_i8x8      tmp0, arg1, arg2
> select_msb_i8x8 res, tmp0
>
> where cmpge is "compare greater or equal" with the following semantics:
> res[i] = <111...11> if arg1[i] >= arg2[i]
> res[i] = <000...00> if arg1[i] <  arg2[i]
> There is such an operation in NEON. In SSE we can emulate it with PCMPEQB,
> PCMPGTB and POR.
>
> select_msb is "select most significant bit"; in SSE this is the PMOVMSKB
> instruction.
>
>> While making helper functions faster is good, I've wondered if there is
>> enough genericism across the various SIMD/vector operations that we could
>> add TCG ops to translate them? The ops could fall back to generic helper
>> functions using the GCC intrinsics if we know there is no decent
>> back-end support for them?
>
> From the Valgrind experience there is enough genericism. Valgrind can translate
> SSE, AltiVec and NEON instructions to vector opcodes. Most of the opcodes are
> reused between instruction sets.

Doesn't Valgrind have the advantage of same-arch->same-arch (I've not
looked at its generated code in detail though).

> But keep in mind - there are a lot of vector opcodes. Much much more than
> scalar ones. You can see full list in Valgrind sources
> (VEX/pub/libvex_ir.h).

I think we could only approach this in a piecemeal way, guided by
performance bottlenecks when we find them.

> We can reduce the amount of opcodes by converting vector element size from part
> of an opcode to a constant argument. But we will lose some flexibility offered
> by the TARGET_HAS_opcode macro when target has support for some sizes but not for
> others. For example SSE has vector minimum for sizes i8x16, i16x8, i32x4 but
> does not have one for size i64x2. 
>
> Some implementation details and concerns.
>
> The most problematic issue was the fact that with vector registers we have one
> entity that can be accessed as both global variable and memory location. I
> solved it by introducing the sync_temp opcode that instructs register allocator to
> save global variable to its memory location if it is on the register. If a
> variable is not on a register or memory is already coherent - no store is issued,
> so performance penalty for it is minimal. Still this approach has a serious
> drawback: we need to generate sync_temp explicitly. But I do not know any better
> way to achieve consistency.

I'm not sure I follow. I thought we only needed the memory access when
the backend can't support the vector-width operations, so we shouldn't have
stuff in the vector registers?

> Note that as of this RFC I have not finished conversion of ARM guest so mixing
> NEON with VFP code can cause a miscompile.
>
> The second problem is that a backend may or may not support vector operations. We
> do not want each frontend to check it on every operation. I created a wrapper that
> generates vector opcode if it is supported or generates emulation code.
>
> For add_i32x4 emulation code is generated inline. I tried to make it a helper
> but got a very significant performance loss (5x slowdown). I'm not sure about
> the cause but I suspect that memory was a bottleneck and extra stores needed
> by calling conventions mattered a lot.

So the generic helper was more API heavy than the existing NEON helpers?
>
> The existing constraints are good enough to express that vector registers and
> general purpose registers are different and can not be used instead of each
> other.
>
> One unsolved problem is global aliasing. With general purpose registers we have
> no aliasing between globals. The only example I know where registers can alias
> is the x86 ah/ax/eax/rax case. They are handled as one global. With vector
> registers we have NEON where an 128-bit Q register consists of two 64-bit
> D registers each consisting of two 32-bit S registers. I think I'll need
> to add alias list to each global listing every other global it can clobber and
> then iterate over it in the optimizer. Fortunately this list will be static and not
> very long.
>
> Why I think all this is worth doing:
>
> (1) Performance. 200% speedup is a lot. My test was specifically crafted and real
>     life applications may not have that much vector operations on average, but
>     there is a specific class of applications where it will matter a lot - media
>     processing applications like ffmpeg.
>
> (2) Some unification of common operations. Right now every target reimplements
>     common vector operations (like vector add/sub/mul/min/compare etc.). We can
>     do it once in the common TCG code.
>
> Still there are some cons I mentioned earlier. The need to support a lot of
> opcodes is the most significant in the long run I think. So before I commit my
> time to conversion of more operations I'd like to hear your opinions if this
> approach is acceptable and worth spending efforts.

Overall I'm pretty keen to explore this further. If we can get the
backend interface right and make it an easier proposition to TCG-up
various vector operations when bottlenecks arise, it will be a big win.

A lot will depend on where those bottlenecks are though. If, for example,
the media codecs all use very arch-specific special-sauce instructions,
we might never claw back that much.

I'll have a look through the patches and comment there when I've gotten
my head round the back-end issues.

Thanks for coding this up ;-)
Kirill Batuzov Oct. 16, 2014, 11:07 a.m. UTC | #4
On Thu, 16 Oct 2014, Alex Bennée wrote:

> >
> > From the Valgrind experience there is enough genericism. Valgrind can translate
> > SSE, AltiVec and NEON instructions to vector opcodes. Most of the opcodes are
> > reused between instruction sets.
> 
> Doesn't Valgrind have the advantage of same-arch->same-arch (I've not
> looked at its generated code in detail though).
>

Yes, they have this advantage, but Valgrind tools look at intermediate
code in an architecture-independent way. For the tools to work they need
to preserve an opcode's semantics across different architectures. For
example Iop_QAdd16Sx4 (addition with saturation) must have the same
meaning on ARM (the vqadd.s16 instruction) and on x86 (the paddsw instruction).
So in most cases where Valgrind uses the same opcode for different
instructions from different architectures, QEMU can do the same.

> > But keep in mind - there are a lot of vector opcodes. Much much more than
> > scalar ones. You can see full list in Valgrind sources
> > (VEX/pub/libvex_ir.h).
> 
> I think we could only approach this in a piecemeal way, guided by
> performance bottlenecks when we find them.
> 

I'm not sure this will work. In my example the larger part of the speedup
comes from the fact that I could keep values in registers and did not need
to save and reload them for each vadd.i32 instruction. To be able to
do that on a real-life application we need to support as large a fraction
of its vector instructions as possible. In short: the speedup does not
come from faster emulation of a single instruction but from the interaction
between sequential guest instructions.

> > We can reduce the amount of opcodes by converting vector element size from part
> > of an opcode to a constant argument. But we will lose some flexibility offered
> > by the TARGET_HAS_opcode macro when target has support for some sizes but not for
> > others. For example SSE has vector minimum for sizes i8x16, i16x8, i32x4 but
> > does not have one for size i64x2. 
> >
> > Some implementation details and concerns.
> >
> > The most problematic issue was the fact that with vector registers we have one
> > entity that can be accessed as both global variable and memory location. I
> > solved it by introducing the sync_temp opcode that instructs register allocator to
> > save global variable to its memory location if it is on the register. If a
> > variable is not on a register or memory is already coherent - no store is issued,
> > so performance penalty for it is minimal. Still this approach has a serious
> > drawback: we need to generate sync_temp explicitly. But I do not know any better
> > way to achieve consistency.
> 
> I'm not sure I follow. I thought we only needed the memory access when
> the backend can't support the vector-width operations, so we shouldn't have
> stuff in the vector registers?
> 

Target support for vector operations is not binary ("support all" or
"support none"). In most cases a backend will support some large subset,
but some guest vector operations will still need to be emulated. In that
case we'll need to access guest vector registers as memory locations.

Scalar operations which are not supported as opcodes are very uncommon,
so a helper with a large performance overhead is a reasonable option. I'd
like to avoid such heavy helpers for vector operations because
unsupported opcodes will be more common there.

Another reason is the transition from existing code to vector opcodes.
During the transition we'll have a mix of old code (accessing registers as
memory) and new code (accessing them as globals). Doing the transition in
one go is unrealistic.

> > Note that as of this RFC I have not finished conversion of ARM guest so mixing
> > NEON with VFP code can cause a miscompile.
> >
> > The second problem is that a backend may or may not support vector operations. We
> > do not want each frontend to check it on every operation. I created a wrapper that
> > generates vector opcode if it is supported or generates emulation code.
> >
> > For add_i32x4 emulation code is generated inline. I tried to make it a helper
> > but got a very significant performance loss (5x slowdown). I'm not sure about
> > the cause but I suspect that memory was a bottleneck and extra stores needed
> > by calling conventions mattered a lot.
> 
> So the generic helper was more API heavy than the existing NEON helpers?

The existing NEON implementation generates emulation code inline too. That
is how I found that my helper was slow.
Kirill Batuzov Nov. 11, 2014, 11:58 a.m. UTC | #5
On Thu, 16 Oct 2014, Kirill Batuzov wrote:

> > (4) Consider supporting generic vector operations in the TCG?
> 
> I gave it a go and was quite happy with the result. I have implemented the add_i32x4
> opcode which is addition of 128-bit vectors composed of four 32-bit integers
> and used it to translate NEON vadd.i32 to SSE paddd instruction. 

<snip>

> 
> Why I think all this is worth doing:
> 
> (1) Performance. 200% speedup is a lot. My test was specifically crafted and real
>     life applications may not have that much vector operations on average, but
>     there is a specific class of applications where it will matter a lot - media
>     processing applications like ffmpeg.
> 
> (2) Some unification of common operations. Right now every target reimplements
>     common vector operations (like vector add/sub/mul/min/compare etc.). We can
>     do it once in the common TCG code.
> 
> Still there are some cons I mentioned earlier. The need to support a lot of
> opcodes is the most significant in the long run I think. So before I commit my
> time to conversion of more operations I'd like to hear your opinions if this
> approach is acceptable and worth spending efforts.
> 
> Kirill Batuzov (7):
>   tcg: add support for 128bit vector type
>   tcg: store ENV global in TCGContext
>   tcg: add sync_temp opcode
>   tcg: add add_i32x4 opcode
>   target-arm: support access to 128-bit guest registers as globals
>   target-arm: use add_i32x4 opcode to handle vadd.i32 instruction
>   tcg/i386: add support for vector opcodes
> 
>  target-arm/translate.c |   30 ++++++++++-
>  tcg/i386/tcg-target.c  |  103 ++++++++++++++++++++++++++++++++---
>  tcg/i386/tcg-target.h  |   24 ++++++++-
>  tcg/tcg-op.h           |  141 ++++++++++++++++++++++++++++++++++++++++++++++++
>  tcg/tcg-opc.h          |   13 +++++
>  tcg/tcg.c              |   36 +++++++++++++
>  tcg/tcg.h              |   34 ++++++++++++
>  7 files changed, 371 insertions(+), 10 deletions(-)
> 
> 

Ping? Any more comments?

Patch

diff --git a/target-alpha/int_helper.c b/target-alpha/int_helper.c
index c023fa1..ec71c17 100644
--- a/target-alpha/int_helper.c
+++ b/target-alpha/int_helper.c
@@ -60,6 +60,42 @@  uint64_t helper_zap(uint64_t val, uint64_t mask)

 uint64_t helper_cmpbge(uint64_t op1, uint64_t op2)
 {
+#if 1
+    uint64_t r;
+
+    /* The cmpbge instruction is heavily used in the implementation of
+       every string function on Alpha.  We can do much better than either
+       the default loop below, or even an unrolled version by using the
+       native vector support.  */
+    {
+        typedef uint64_t Q __attribute__((vector_size(16)));
+        typedef uint8_t B __attribute__((vector_size(16)));
+
+        Q q1 = (Q){ op1, 0 };
+        Q q2 = (Q){ op2, 0 };
+
+        q1 = (Q)((B)q1 >= (B)q2);
+
+        r = q1[0];
+    }
+
+    /* Select only one bit from each byte.  */
+    r &= 0x0101010101010101;
+
+    /* Collect the bits into the bottom byte.  */
+    /* .......A.......B.......C.......D.......E.......F.......G.......H */
+    r |= r >> (8 - 1);
+
+    /* .......A......AB......BC......CD......DE......EF......FG......GH */
+    r |= r >> (16 - 2);
+
+    /* .......A......AB.....ABC....ABCD....BCDE....CDEF....DEFG....EFGH */
+    r |= r >> (32 - 4);
+
+    /* .......A......AB.....ABC....ABCD...ABCDE..ABCDEF.ABCDEFGABCDEFGH */
+    /* Return only the low 8 bits.  */
+    return r & 0xff;
+#else
     uint8_t opa, opb, res;
     int i;

@@ -72,6 +108,7 @@  uint64_t helper_cmpbge(uint64_t op1, uint64_t op2)
         }
     }
     return res;
+#endif
 }

 uint64_t helper_minub8(uint64_t op1, uint64_t op2)