Message ID | 20190515203112.506-3-david@redhat.com |
---|---|
State | New |
Headers | show |
Series | s390x/tcg: Vector Instruction Support Part 3 | expand |
On 5/15/19 1:31 PM, David Hildenbrand wrote:
> +#define DEF_VFEE(BITS)
Same comment wrt inline functions applies.
Here, because there's one result, writing to byte 7, I wonder if it isn't
clearer to write the loop
first_equal = n;
first_zero = n;
for (i = n - 1; i >= 0; --i) {
if (data1 == data2) {
first_equal = i;
}
if (data1 == 0) {
first_zero = i;
}
}
// As an aside, there are bit tricks for the above,
// but let's stay simple(r) for now.
if (zs) {
if (first_equal < first_zero) {
cc = (first_zero < n ? 2 : 1);
} else {
first_equal = first_zero;
cc = (first_zero < n ? 0 : 3);
}
} else {
cc = (first_equal < n ? 1 : 3);
}
s390_vec_write_element64(v1, 0, first_equal);
s390_vec_write_element64(v1, 1, 0);
Note that you don't need S390Vector tmp, since the result is written after all
of the inputs are consumed.
r~
On 5/17/19 9:47 AM, Richard Henderson wrote: > first_equal = n; > first_zero = n; > for (i = n - 1; i >= 0; --i) { > if (data1 == data2) { > first_equal = i; > } > if (data1 == 0) { > first_zero = i; > } > } > > // As an aside, there are bit tricks for the above, > // but let's stay simple(r) for now. What the hell, it's not /that/ tricky. /* * Returns a bit set in the MSB of each element that is zero, * as defined by the mask M. */ static inline uint64_t zero_search(uint64_t a, uint64_t m) { return ~(((a & m) + m) | a | m); } /* * Returns the byte offset for the first match, or 16 for no match. */ static inline int match_index(uint64_t c0, uint64_t c1) { return (c0 ? clz64(c0) : clz64(c1) + 64) >> 3; } Use dup_const(MO_8, 0x7f) dup_const(MO_16, 0x7fff) dup_const(MO_32, 0x7fffffff) for the M parameter for the different element sizes. uint64_t a0, a1, b0, b1, e0, e1, z0, z1; a0 = s390_vec_read_element64(v2, 0); a1 = s390_vec_read_element64(v2, 1); b0 = s390_vec_read_element64(v3, 0); b1 = s390_vec_read_element64(v3, 1); e0 = zero_search(a0 ^ b0, m); e1 = zero_search(a1 ^ b1, m); first_equal = match_index(e0, e1); if (zs) { z0 = zero_search(a0, m); z1 = zero_search(a1, m); first_zero = match_index(z0, z1); ... r~
On 17.05.19 19:42, Richard Henderson wrote: > On 5/17/19 9:47 AM, Richard Henderson wrote: >> first_equal = n; >> first_zero = n; >> for (i = n - 1; i >= 0; --i) { >> if (data1 == data2) { >> first_equal = i; >> } >> if (data1 == 0) { >> first_zero = i; >> } >> } >> >> // As an aside, there are bit tricks for the above, >> // but let's stay simple(r) for now. > > What the hell, it's not /that/ tricky. > > > /* > * Returns a bit set in the MSB of each element that is zero, > * as defined by the mask M. > */ > static inline uint64_t zero_search(uint64_t a, uint64_t m) > { > return ~(((a & m) + m) | a | m); > } > > /* > * Returns the byte offset for the first match, or 16 for no match. > */ > static inline int match_index(uint64_t c0, uint64_t c1) > { > return (c0 ? clz64(c0) : clz64(c1) + 64) >> 3; > } > > Use > > dup_const(MO_8, 0x7f) > dup_const(MO_16, 0x7fff) > dup_const(MO_32, 0x7fffffff) > > for the M parameter for the different element sizes. > > uint64_t a0, a1, b0, b1, e0, e1, z0, z1; > > a0 = s390_vec_read_element64(v2, 0); > a1 = s390_vec_read_element64(v2, 1); > b0 = s390_vec_read_element64(v3, 0); > b1 = s390_vec_read_element64(v3, 1); > e0 = zero_search(a0 ^ b0, m); > e1 = zero_search(a1 ^ b1, m); > first_equal = match_index(e0, e1); > > if (zs) { > z0 = zero_search(a0, m); > z1 = zero_search(a1, m); > first_zero = match_index(z0, z1); > ... 
> > > r~ > Crazy stuff, seems to work (not that I am surprised :D ) I now have: +static int vfee(void *v1, const void *v2, const void *v3, bool zs, uint8_t es) +{ + const uint64_t mask = dup_const(es, -1ull >> (65 - (1 << es) * 8)); + uint64_t a0, a1, b0, b1, e0, e1, z0, z1; + uint64_t first_zero = 16; + uint64_t first_equal; + + a0 = s390_vec_read_element64(v2, 0); + a1 = s390_vec_read_element64(v2, 1); + b0 = s390_vec_read_element64(v3, 0); + b1 = s390_vec_read_element64(v3, 1); + e0 = zero_search(a0 ^ b0, mask); + e1 = zero_search(a1 ^ b1, mask); + first_equal = match_index(e0, e1); + + if (zs) { + z0 = zero_search(a0, mask); + z1 = zero_search(a1, mask); + first_zero = match_index(z0, z1); + } + + /* zero out the destination vector */ + s390_vec_write_element64(v1, 0, 0); + s390_vec_write_element64(v1, 1, 0); + + if (first_zero == 16 && first_equal == 16) { + s390_vec_write_element8(v1, 7, 16); + return 3; /* no match */ + } else if (first_zero == 16) { + s390_vec_write_element8(v1, 7, first_equal); + return 1; /* matching elements, no match for zero */ + } else if (first_equal < first_zero) { + s390_vec_write_element8(v1, 7, first_equal); + return 2; /* matching elements before match for zero */ + } + s390_vec_write_element8(v1, 7, first_zero); + return 0; /* match for zero */ +}
diff --git a/target/s390x/helper.h b/target/s390x/helper.h index c45328cf73..a1b169b666 100644 --- a/target/s390x/helper.h +++ b/target/s390x/helper.h @@ -218,6 +218,12 @@ DEF_HELPER_FLAGS_4(gvec_vfae32, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) DEF_HELPER_5(gvec_vfae_cc8, void, ptr, cptr, cptr, env, i32) DEF_HELPER_5(gvec_vfae_cc16, void, ptr, cptr, cptr, env, i32) DEF_HELPER_5(gvec_vfae_cc32, void, ptr, cptr, cptr, env, i32) +DEF_HELPER_FLAGS_4(gvec_vfee8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) +DEF_HELPER_FLAGS_4(gvec_vfee16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) +DEF_HELPER_FLAGS_4(gvec_vfee32, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) +DEF_HELPER_5(gvec_vfee_cc8, void, ptr, cptr, cptr, env, i32) +DEF_HELPER_5(gvec_vfee_cc16, void, ptr, cptr, cptr, env, i32) +DEF_HELPER_5(gvec_vfee_cc32, void, ptr, cptr, cptr, env, i32) #ifndef CONFIG_USER_ONLY DEF_HELPER_3(servc, i32, env, i64, i64) diff --git a/target/s390x/insn-data.def b/target/s390x/insn-data.def index 070ce2a471..d8907ef6a5 100644 --- a/target/s390x/insn-data.def +++ b/target/s390x/insn-data.def @@ -1195,6 +1195,8 @@ /* VECTOR FIND ANY ELEMENT EQUAL */ F(0xe782, VFAE, VRR_b, V, 0, 0, 0, 0, vfae, 0, IF_VEC) +/* VECTOR FIND ELEMENT EQUAL */ + F(0xe780, VFEE, VRR_b, V, 0, 0, 0, 0, vfee, 0, IF_VEC) #ifndef CONFIG_USER_ONLY /* COMPARE AND SWAP AND PURGE */ diff --git a/target/s390x/translate_vx.inc.c b/target/s390x/translate_vx.inc.c index 022990dda3..848f6d7163 100644 --- a/target/s390x/translate_vx.inc.c +++ b/target/s390x/translate_vx.inc.c @@ -2384,3 +2384,34 @@ static DisasJumpType op_vfae(DisasContext *s, DisasOps *o) } return DISAS_NEXT; } + +static DisasJumpType op_vfee(DisasContext *s, DisasOps *o) +{ + const uint8_t es = get_field(s->fields, m4); + const uint8_t m5 = get_field(s->fields, m5); + static gen_helper_gvec_3_ptr * const cc[3] = { + gen_helper_gvec_vfee_cc8, + gen_helper_gvec_vfee_cc16, + gen_helper_gvec_vfee_cc32, + }; + static gen_helper_gvec_3 * const nocc[3] = { + 
gen_helper_gvec_vfee8, + gen_helper_gvec_vfee16, + gen_helper_gvec_vfee32, + }; + + if (es > ES_32 || m5 & ~0x3) { + gen_program_exception(s, PGM_SPECIFICATION); + return DISAS_NORETURN; + } + + if (m5 & 1) { + gen_gvec_3_ptr(get_field(s->fields, v1), get_field(s->fields, v2), + get_field(s->fields, v3), cpu_env, m5, cc[es]); + set_cc_static(s); + } else { + gen_gvec_3_ool(get_field(s->fields, v1), get_field(s->fields, v2), + get_field(s->fields, v3), m5, nocc[es]); + } + return DISAS_NEXT; +} diff --git a/target/s390x/vec_string_helper.c b/target/s390x/vec_string_helper.c index 8a4e65b70f..6a5d05271c 100644 --- a/target/s390x/vec_string_helper.c +++ b/target/s390x/vec_string_helper.c @@ -95,3 +95,62 @@ void HELPER(gvec_vfae_cc##BITS)(void *v1, const void *v2, const void *v3, \ DEF_VFAE_CC_HELPER(8) DEF_VFAE_CC_HELPER(16) DEF_VFAE_CC_HELPER(32) + +#define DEF_VFEE(BITS) \ +static int vfee##BITS(void *v1, const void *v2, const void *v3, uint8_t m5) \ +{ \ + const bool zs = extract32(m5, 1, 1); \ + S390Vector tmp = {}; \ + int first_byte = 16; \ + int cc = 3; /* no match */ \ + int i; \ + \ + for (i = 0; i < (128 / BITS); i++) { \ + const uint##BITS##_t data1 = s390_vec_read_element##BITS(v2, i); \ + const uint##BITS##_t data2 = s390_vec_read_element##BITS(v3, i); \ + \ + if (zs && !data1) { \ + if (cc == 3) { \ + first_byte = i * (BITS / 8); \ + cc = 0; /* match for zero */ \ + } else { \ + cc = 2; /* matching elements before match for zero */ \ + } \ + break; \ + } \ + \ + if (cc == 3 && data1 == data2) { \ + first_byte = i * (BITS / 8); \ + cc = 1; /* matching elements, no match for zero */ \ + if (!zs) { \ + break; \ + } \ + } \ + } \ + s390_vec_write_element8(&tmp, 7, first_byte); \ + *(S390Vector *)v1 = tmp; \ + return cc; \ +} +DEF_VFEE(8) +DEF_VFEE(16) +DEF_VFEE(32) + +#define DEF_VFEE_HELPER(BITS) \ +void HELPER(gvec_vfee##BITS)(void *v1, const void *v2, const void *v3, \ + uint32_t desc) \ +{ \ + vfee##BITS(v1, v2, v3, simd_data(desc)); \ +} 
+DEF_VFEE_HELPER(8) +DEF_VFEE_HELPER(16) +DEF_VFEE_HELPER(32) + +#define DEF_VFEE_CC_HELPER(BITS) \ +void HELPER(gvec_vfee_cc##BITS)(void *v1, const void *v2, const void *v3, \ + CPUS390XState *env, uint32_t desc) \ +{ \ + env->cc_op = vfee##BITS(v1, v2, v3, simd_data(desc)); \ +} +DEF_VFEE_CC_HELPER(8) +DEF_VFEE_CC_HELPER(16) +DEF_VFEE_CC_HELPER(32)
Implement it similar to VECTOR FIND ANY ELEMENT EQUAL. The zero-check seems to have precedence in case we have "data1 == data2 == 0". The description in the PoP is a little bit confusing. Signed-off-by: David Hildenbrand <david@redhat.com> --- target/s390x/helper.h | 6 ++++ target/s390x/insn-data.def | 2 ++ target/s390x/translate_vx.inc.c | 31 +++++++++++++++++ target/s390x/vec_string_helper.c | 59 ++++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+)