[v1,1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL

Message ID 20190515203112.506-2-david@redhat.com
State New
Series s390x/tcg: Vector Instruction Support Part 3

Commit Message

David Hildenbrand May 15, 2019, 8:31 p.m. UTC
Complicated stuff. Provide two variants, one for the CC and one without
the CC. The CC is returned via cpu_env.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 target/s390x/Makefile.objs       |  2 +-
 target/s390x/helper.h            |  8 +++
 target/s390x/insn-data.def       |  5 ++
 target/s390x/translate_vx.inc.c  | 31 ++++++++++
 target/s390x/vec_string_helper.c | 97 ++++++++++++++++++++++++++++++++
 5 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 target/s390x/vec_string_helper.c

Comments

Richard Henderson May 17, 2019, 4:16 p.m. UTC | #1
On 5/15/19 1:31 PM, David Hildenbrand wrote:
> +#define DEF_VFAE(BITS)                                                         \
> +static int vfae##BITS(void *v1, const void *v2, const void *v3, uint8_t m5)    


First, because this *is* complicated stuff, can we find a way to use inline
functions instead of an undebuggable macro for this?  Perhaps a different set
of wrappers than s390_vec_read_element##BITS, which always return uint32_t, so
that they have a constant signature?

> +        if (zs && !data) {
> +            if (cc == 3) {
> +                first_byte = i * (BITS / 8);
> +                cc = 0; /* match for zero */
> +            } else if (cc != 0) {
> +                cc = 2; /* matching elements before match for zero */
> +            }
> +            if (!rt) {
> +                break;
> +            }
> +        }    

So here we are computing the second intermediate result.

> +        /* try to match with any other element from the other vector */
> +        for (j = 0; j < (128 / BITS); j++) {
> +            if (data == s390_vec_read_element##BITS(v3, j)) {
> +                any_equal = true;
> +                break;
> +            }
> +        }

And here the first intermediate result,

> +        /* invert the result if requested */
> +        any_equal = in ^ any_equal;

... inverted, if requested,

> +        if (cc == 3 && any_equal) {
> +            first_byte = i * (BITS / 8);
> +            cc = 1; /* matching elements, no match for zero */
> +            if (!zs && !rt) {
> +                break;
> +            }
> +        }

> +        /* indicate bit vector if requested */
> +        if (rt && any_equal) {
> +            s390_vec_write_element##BITS(&tmp, i, (uint##BITS##_t)-1ull);
> +        }

... writing out (some of) the results of the first intermediate result.

> +    }
> +    if (!rt) {
> +        s390_vec_write_element8(&tmp, 7, first_byte);
> +    }

... writing out the rest of the first intermediate result.

I wonder if it wouldn't be clearer, within the loop, to do

	if (any_equal) {
	    if (cc == 3) {
		first_byte = ...;
		cc = 1;
	    }
	    if (rt) {
		write element -1;
	    } else if (!zs) {
		break;
	    }
	}
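
Filled in against the macro body, that would be roughly (untested sketch,
continuation backslashes omitted):

	if (any_equal) {
	    if (cc == 3) {
	        /* remember the first matching element, once */
	        first_byte = i * (BITS / 8);
	        cc = 1; /* matching elements, no match for zero */
	    }
	    if (rt) {
	        /* flag the matching element in the result bit vector */
	        s390_vec_write_element##BITS(&tmp, i, (uint##BITS##_t)-1ull);
	    } else if (!zs) {
	        /* no bit vector and no zero search requested: done */
	        break;
	    }
	}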

I also think that, if we create a bunch more of these wrappers:

> +DEF_VFAE_HELPER(8)
> +DEF_VFAE_HELPER(16)
> +DEF_VFAE_HELPER(32)

then RT and ZS can be passed in as constant parameters to the above, and then
the compiler will fold away all of the stuff that's not needed for each
different case.  Which, I think, is significant.  These are practically
different instructions with the different modifiers.


r~
David Hildenbrand May 20, 2019, 9:51 a.m. UTC | #2
On 17.05.19 18:16, Richard Henderson wrote:
> On 5/15/19 1:31 PM, David Hildenbrand wrote:
>> +#define DEF_VFAE(BITS)                                                         \
>> +static int vfae##BITS(void *v1, const void *v2, const void *v3, uint8_t m5)    
> 
> 
> First, because this *is* complicated stuff, can we find a way to use inline
> functions instead of an undebuggable macro for this?  Perhaps a different set
> of wrappers than s390_vec_read_element##BITS, which always return uint32_t, so
> that they have a constant signature?

For vfene I have for now

+static inline uint64_t s390_vec_read_element(const S390Vector *v, uint8_t enr,
+                                             uint8_t es)
+{
+    switch (es) {
+    case MO_8:
+        return s390_vec_read_element8(v, enr);
+    case MO_16:
+        return s390_vec_read_element16(v, enr);
+    case MO_32:
+        return s390_vec_read_element32(v, enr);
+    case MO_64:
+        return s390_vec_read_element64(v, enr);
+    default:
+        g_assert_not_reached();
+    }
+}
+

Which we could reuse here.

I'll try to look into using an inline function instead, passing in the
element size and other flags, so the compiler can specialize.
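
Something like the following shape (only a rough sketch, untested; the actual
zero/equality searches are elided, and DEF_VFAE_HELPER is reworked to also take
the element size):

static int vfae(void *v1, const void *v2, const void *v3, bool in,
                bool rt, bool zs, uint8_t es)
{
    const int elements = 16 >> es; /* 16, 8 or 4 elements per vector */
    int i;

    for (i = 0; i < elements; i++) {
        /*
         * s390_vec_read_element(v2, i, es) replaces the
         * s390_vec_read_element##BITS() calls; zero search, equality
         * search against v3 and the cc handling go here.
         */
    }
    return 3; /* placeholder: no match */
}

#define DEF_VFAE_HELPER(BITS, ES)                                             \
void HELPER(gvec_vfae##BITS)(void *v1, const void *v2, const void *v3,        \
                             uint32_t desc)                                   \
{                                                                             \
    const uint8_t m5 = simd_data(desc);                                       \
                                                                              \
    /* a constant ES lets the compiler specialize each instantiation */       \
    vfae(v1, v2, v3, extract32(m5, 3, 1), extract32(m5, 2, 1),                \
         extract32(m5, 1, 1), ES);                                            \
}
DEF_VFAE_HELPER(8, MO_8)
DEF_VFAE_HELPER(16, MO_16)
DEF_VFAE_HELPER(32, MO_32)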

Thanks!

> 
>> +        if (zs && !data) {
>> +            if (cc == 3) {
>> +                first_byte = i * (BITS / 8);
>> +                cc = 0; /* match for zero */
>> +            } else if (cc != 0) {
>> +                cc = 2; /* matching elements before match for zero */
>> +            }
>> +            if (!rt) {
>> +                break;
>> +            }
>> +        }    
> 
> So here we are computing the second intermediate result.
> 
>> +        /* try to match with any other element from the other vector */
>> +        for (j = 0; j < (128 / BITS); j++) {
>> +            if (data == s390_vec_read_element##BITS(v3, j)) {
>> +                any_equal = true;
>> +                break;
>> +            }
>> +        }
> 
> And here the first intermediate result,
> 
>> +        /* invert the result if requested */
>> +        any_equal = in ^ any_equal;
> 
> ... inverted, if requested,
> 
>> +        if (cc == 3 && any_equal) {
>> +            first_byte = i * (BITS / 8);
>> +            cc = 1; /* matching elements, no match for zero */
>> +            if (!zs && !rt) {
>> +                break;
>> +            }
>> +        }
> 
>> +        /* indicate bit vector if requested */
>> +        if (rt && any_equal) {
>> +            s390_vec_write_element##BITS(&tmp, i, (uint##BITS##_t)-1ull);
>> +        }
> 
> ... writing out (some of) the results of the first intermediate result.
> 
>> +    }
>> +    if (!rt) {
>> +        s390_vec_write_element8(&tmp, 7, first_byte);
>> +    }
> 
> ... writing out the rest of the first intermediate result.
> 
> I wonder if it wouldn't be clearer, within the loop, to do
> 
> 	if (any_equal) {
> 	    if (cc == 3) {
> 		first_byte = ...;
> 		cc = 1;
> 	    }
> 	    if (rt) {
> 		write element -1;
> 	    } else if (!zs) {
> 		break;
> 	    }
> 	}
> 
> I also think that, if we create a bunch more of these wrappers:
> 
>> +DEF_VFAE_HELPER(8)
>> +DEF_VFAE_HELPER(16)
>> +DEF_VFAE_HELPER(32)
> 
> then RT and ZS can be passed in as constant parameters to the above, and then
> the compiler will fold away all of the stuff that's not needed for each
> different case.  Which, I think, is significant.  These are practically
> different instructions with the different modifiers.
> 
> 
> r~
>
David Hildenbrand May 22, 2019, 11:01 a.m. UTC | #3
> I also think that, if we create a bunch more of these wrappers:
> 
>> +DEF_VFAE_HELPER(8)
>> +DEF_VFAE_HELPER(16)
>> +DEF_VFAE_HELPER(32)
> 
> then RT and ZS can be passed in as constant parameters to the above, and then
> the compiler will fold away all of the stuff that's not needed for each
> different case.  Which, I think, is significant.  These are practically
> different instructions with the different modifiers.
> 

So, we have 4 flags, resulting in 16 variants. Times 3 element sizes ...
48 helpers in total. Do we really want to go down that path?

I can also go ahead and try to identify the most frequent users (in
Linux) and only specialize that one.
Richard Henderson May 22, 2019, 11:09 a.m. UTC | #4
On 5/22/19 7:01 AM, David Hildenbrand wrote:
> 
>> I also think that, if we create a bunch more of these wrappers:
>>
>>> +DEF_VFAE_HELPER(8)
>>> +DEF_VFAE_HELPER(16)
>>> +DEF_VFAE_HELPER(32)
>>
>> then RT and ZS can be passed in as constant parameters to the above, and then
>> the compiler will fold away all of the stuff that's not needed for each
>> different case.  Which, I think, is significant.  These are practically
>> different instructions with the different modifiers.
>>
> 
> So, we have 4 flags, resulting in 16 variants. Times 3 element sizes ...
> 48 helpers in total. Do we really want to go down that path?

Maybe?

> I can also go ahead any try to identify the most frequent users (in
> Linux) and only specialize that one.

Also plausible.  I guess it would be good to know, anyway.

I think RT probably makes the largest difference to the layout of the function,
so maybe that's the one we pick.  We could also leave our options open and make
the 3 non-CC flags be parameters to the inline function, just extract them from
the M5 parameter at the one higher level.


r~
David Hildenbrand May 22, 2019, 11:16 a.m. UTC | #5
On 22.05.19 13:09, Richard Henderson wrote:
> On 5/22/19 7:01 AM, David Hildenbrand wrote:
>>
>>> I also think that, if we create a bunch more of these wrappers:
>>>
>>>> +DEF_VFAE_HELPER(8)
>>>> +DEF_VFAE_HELPER(16)
>>>> +DEF_VFAE_HELPER(32)
>>>
>>> then RT and ZS can be passed in as constant parameters to the above, and then
>>> the compiler will fold away all of the stuff that's not needed for each
>>> different case.  Which, I think, is significant.  These are practically
>>> different instructions with the different modifiers.
>>>
>>
>> So, we have 4 flags, resulting in 16 variants. Times 3 element sizes ...
>> 48 helpers in total. Do we really want to go down that path?
> 
> Maybe?

Hope my fingers won't bleed from all the copy-pasting ;)

> 
>> I can also go ahead any try to identify the most frequent users (in
>> Linux) and only specialize that one.
> 
> Also plausible.  I guess it would be good to know, anyway.

I'll dump the parameters when booting Linux. My gut feeling is that the
cc option is basically never used ...

> 
> I think RT probably makes the largest difference to the layout of the function,

Yes. I think the RT and ZS make the biggest difference. IN - not really
that heavy.

Maybe use different variants for RT and ZS for the !CC case only.

> so maybe that's the one we pick.  We could also leave our options open and make
> the 3 non-CC flags be parameters to the inline function, just extract them from
> the M5 parameter at the one higher level.

That one, I have already done :)
Richard Henderson May 22, 2019, 3:59 p.m. UTC | #6
On Wed, 22 May 2019 at 07:16, David Hildenbrand <david@redhat.com> wrote:
> > Also plausible.  I guess it would be good to know, anyway.
>
> I'll dump the parameters when booting Linux. My gut feeling is that the
> cc option is basically never used ...

It looks like our intuition is wrong about that.

rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r vfaezbs * | wc -l
15

These set cc, use zs, and do not use rt.

rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r 'vfaeb' * | wc -l
3

These do not set cc, do not use zs, and do use rt.

Those are the only two VFAE forms used by glibc (note that the same
variants as 'f' are used by the wide-character strings).


r~
David Hildenbrand May 22, 2019, 6:16 p.m. UTC | #7
On 22.05.19 17:59, Richard Henderson wrote:
> On Wed, 22 May 2019 at 07:16, David Hildenbrand <david@redhat.com> wrote:
>>> Also plausible.  I guess it would be good to know, anyway.
>>
>> I'll dump the parameters when booting Linux. My gut feeling is that the
>> cc option is basically never used ...
> 
> It looks like our intuition is wrong about that.

Thanks for checking!

> 
> rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r vfaezbs * | wc -l
> 15
> 
> These set cc, use zs, and do not use rt.
> 
> rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r 'vfaeb' * | wc -l
> 3
> 
> These do not set cc, do not use zs, and do use rt.
> 
> Those are the only two VFAE forms used by glibc (note that the same
> variants as 'f' are used by the wide-character strings).
> 

I guess "rt" and "cc" make the biggest difference. Maybe special case
these two, resulting in 4 variants for each of the 3 element sizes?

> 
> r~
>
Richard Henderson May 22, 2019, 6:46 p.m. UTC | #8
On 5/22/19 2:16 PM, David Hildenbrand wrote:
> On 22.05.19 17:59, Richard Henderson wrote:
>> On Wed, 22 May 2019 at 07:16, David Hildenbrand <david@redhat.com> wrote:
>>>> Also plausible.  I guess it would be good to know, anyway.
>>>
>>> I'll dump the parameters when booting Linux. My gut feeling is that the
>>> cc option is basically never used ...
>>
>> It looks like our intuition is wrong about that.
> 
> Thanks for checking!
> 
>>
>> rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r vfaezbs * | wc -l
>> 15
>>
>> These set cc, use zs, and do not use rt.
>>
>> rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r 'vfaeb' * | wc -l
>> 3
>>
>> These do not set cc, do not use zs, and do use rt.
>>
>> Those are the only two VFAE forms used by glibc (note that the same
>> variants as 'f' are used by the wide-character strings).
>>
> 
> I guess "rt" and "cc" make the biggest difference. Maybe special case
> these two, result in 4 variants for each of the 3 element sizes?

Sounds good.


r~
David Hildenbrand May 23, 2019, 7:50 a.m. UTC | #9
On 22.05.19 20:46, Richard Henderson wrote:
> On 5/22/19 2:16 PM, David Hildenbrand wrote:
>> On 22.05.19 17:59, Richard Henderson wrote:
>>> On Wed, 22 May 2019 at 07:16, David Hildenbrand <david@redhat.com> wrote:
>>>>> Also plausible.  I guess it would be good to know, anyway.
>>>>
>>>> I'll dump the parameters when booting Linux. My gut feeling is that the
>>>> cc option is basically never used ...
>>>
>>> It looks like our intuition is wrong about that.
>>
>> Thanks for checking!
>>
>>>
>>> rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r vfaezbs * | wc -l
>>> 15
>>>
>>> These set cc, use zs, and do not use rt.
>>>
>>> rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r 'vfaeb' * | wc -l
>>> 3
>>>
>>> These do not set cc, do not use zs, and do use rt.
>>>
>>> Those are the only two VFAE forms used by glibc (note that the same
>>> variants as 'f' are used by the wide-character strings).
>>>
>>
>> I guess "rt" and "cc" make the biggest difference. Maybe special case
>> these two, result in 4 variants for each of the 3 element sizes?
> 
> Sounds good.
> 

So .... after all it might not be necessary, at least not for this
helper :) Using your crazy helper functions, I have this right now:

/*
 * Returns the number of bits composing one element.
 */
static uint8_t get_element_bits(uint8_t es)
{
    return (1 << es) * BITS_PER_BYTE;
}

/*
 * Returns the bitmask for a single element.
 */
static uint64_t get_single_element_mask(uint8_t es)
{
    return -1ull >> (64 - get_element_bits(es));
}

/*
 * Returns the bitmask for a single element (excluding the MSB).
 */
static uint64_t get_single_element_lsbs_mask(uint8_t es)
{
    return -1ull >> (65 - get_element_bits(es));
}

/*
 * Returns the bitmasks for multiple elements (excluding the MSBs).
 */
static uint64_t get_element_lsbs_mask(uint8_t es)
{
    return dup_const(es, get_single_element_lsbs_mask(es));
}

static int vfae(void *v1, const void *v2, const void *v3, bool in,
                bool rt, bool zs, uint8_t es)
{
    const uint64_t mask = get_element_lsbs_mask(es);
    const int bits = get_element_bits(es);
    uint64_t a0, a1, b0, b1, e0, e1, t0, t1, z0, z1;
    uint64_t first_zero = 16;
    uint64_t first_equal;
    int i;

    a0 = s390_vec_read_element64(v2, 0);
    a1 = s390_vec_read_element64(v2, 1);
    b0 = s390_vec_read_element64(v3, 0);
    b1 = s390_vec_read_element64(v3, 1);
    e0 = 0;
    e1 = 0;
    /* compare against equality with every other element */
    for (i = 0; i < 64; i += bits) {
        t0 = i ? rol64(b0, i) : b0;
        t1 = i ? rol64(b1, i) : b1;
        e0 |= zero_search(a0 ^ t0, mask);
        e0 |= zero_search(a0 ^ t1, mask);
        e1 |= zero_search(a1 ^ t0, mask);
        e1 |= zero_search(a1 ^ t1, mask);
    }
    /* invert the result if requested - invert only the MSBs */
    if (in) {
        e0 = ~e0 & ~mask;
        e1 = ~e1 & ~mask;
    }
    first_equal = match_index(e0, e1);

    if (zs) {
        z0 = zero_search(a0, mask);
        z1 = zero_search(a1, mask);
        first_zero = match_index(z0, z1);
    }

    if (rt) {
        e0 = (e0 >> (bits - 1)) * get_single_element_mask(es);
        e1 = (e1 >> (bits - 1)) * get_single_element_mask(es);
        s390_vec_write_element64(v1, 0, e0);
        s390_vec_write_element64(v1, 1, e1);
    } else {
        s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero));
        s390_vec_write_element64(v1, 1, 0);
    }

    if (first_zero == 16 && first_equal == 16) {
        return 3; /* no match */
    } else if (first_zero == 16) {
        return 1; /* matching elements, no match for zero */
    } else if (first_equal < first_zero) {
        return 2; /* matching elements before match for zero */
    }
    return 0; /* match for zero */
}
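
(For reference: zero_search() and match_index() aren't shown above. Presumably
they are something along the lines of the classic zero-detection bit trick -- a
sketch of the assumed bodies, with mask being the per-element LSBs mask from
above, and clz64(0) taken to be 64 as in host-utils.h:)

static inline uint64_t zero_search(uint64_t a, uint64_t mask)
{
    /* set the MSB of every element of a that is entirely zero */
    return ~(((a & mask) + mask) | a | mask);
}

static inline int match_index(uint64_t c0, uint64_t c1)
{
    /* byte offset of the leftmost flagged element, or 16 if none */
    return (c0 ? clz64(c0) : clz64(c1) + 64) >> 3;
}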


At least the kernel boots with it - am I missing something, or does this
indeed work?

Cheers!
Alex Bennée May 23, 2019, 10:58 a.m. UTC | #10
David Hildenbrand <david@redhat.com> writes:

> On 22.05.19 13:09, Richard Henderson wrote:
>> On 5/22/19 7:01 AM, David Hildenbrand wrote:
>>>
>>>> I also think that, if we create a bunch more of these wrappers:
>>>>
>>>>> +DEF_VFAE_HELPER(8)
>>>>> +DEF_VFAE_HELPER(16)
>>>>> +DEF_VFAE_HELPER(32)
>>>>
>>>> then RT and ZS can be passed in as constant parameters to the above, and then
>>>> the compiler will fold away all of the stuff that's not needed for each
>>>> different case.  Which, I think, is significant.  These are practically
>>>> different instructions with the different modifiers.
>>>>
>>>
>>> So, we have 4 flags, resulting in 16 variants. Times 3 element sizes ...
>>> 48 helpers in total. Do we really want to go down that path?
>>
>> Maybe?
>
> Hope my fingers won't bleed from all the copy-pasting ;)

An alternative is to generalise the code into a helper and then just use
macros to instantiate a series of calls to it (c.f. softfloat). The idea
is you can use flatten/inline to keep it efficient but you don't have a
bunch of logic obscured by macro stuff.


--
Alex Bennée
Richard Henderson May 23, 2019, 12:27 p.m. UTC | #11
On 5/23/19 3:50 AM, David Hildenbrand wrote:
> /*
>  * Returns the number of bits composing one element.
>  */
> static uint8_t get_element_bits(uint8_t es)
> {
>     return (1 << es) * BITS_PER_BYTE;
> }
> 
> /*
>  * Returns the bitmask for a single element.
>  */
> static uint64_t get_single_element_mask(uint8_t es)
> {
>     return -1ull >> (64 - get_element_bits(es));
> }
> 
> /*
>  * Returns the bitmask for a single element (excluding the MSB).
>  */
> static uint64_t get_single_element_lsbs_mask(uint8_t es)
> {
>     return -1ull >> (65 - get_element_bits(es));
> }
> 
> /*
>  * Returns the bitmasks for multiple elements (excluding the MSBs).
>  */
> static uint64_t get_element_lsbs_mask(uint8_t es)
> {
>     return dup_const(es, get_single_element_lsbs_mask(es));
> }
> 
> static int vfae(void *v1, const void *v2, const void *v3, bool in,
>                 bool rt, bool zs, uint8_t es)
> {
>     const uint64_t mask = get_element_lsbs_mask(es);
>     const int bits = get_element_bits(es);
>     uint64_t a0, a1, b0, b1, e0, e1, t0, t1, z0, z1;
>     uint64_t first_zero = 16;
>     uint64_t first_equal;
>     int i;
> 
>     a0 = s390_vec_read_element64(v2, 0);
>     a1 = s390_vec_read_element64(v2, 1);
>     b0 = s390_vec_read_element64(v3, 0);
>     b1 = s390_vec_read_element64(v3, 1);
>     e0 = 0;
>     e1 = 0;
>     /* compare against equality with every other element */
>     for (i = 0; i < 64; i += bits) {
>         t0 = i ? rol64(b0, i) : b0;
>         t1 = i ? rol64(b1, i) : b1;
>         e0 |= zero_search(a0 ^ t0, mask);
>         e0 |= zero_search(a0 ^ t1, mask);
>         e1 |= zero_search(a1 ^ t0, mask);
>         e1 |= zero_search(a1 ^ t1, mask);
>     }

I don't see that this is doing what you want.  You're shifting one element of B
down, but not broadcasting it so that it is compared against every element of A.

I'd expect something like

	t0 = dup_const(es, b0 >> i);
	t1 = dup_const(es, b1 >> i);

(I also don't see what rol is getting you that shift doesn't.)


>     /* invert the result if requested - invert only the MSBs */
>     if (in) {
>         e0 = ~e0 & ~mask;
>         e1 = ~e1 & ~mask;
>     }
>     first_equal = match_index(e0, e1);
> 
>     if (zs) {
>         z0 = zero_search(a0, mask);
>         z1 = zero_search(a1, mask);
>         first_zero = match_index(z0, z1);
>     }
> 
>     if (rt) {
>         e0 = (e0 >> (bits - 1)) * get_single_element_mask(es);
>         e1 = (e1 >> (bits - 1)) * get_single_element_mask(es);
>         s390_vec_write_element64(v1, 0, e0);
>         s390_vec_write_element64(v1, 1, e1);
>     } else {
>         s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero));
>         s390_vec_write_element64(v1, 1, 0);
>     }
> 
>     if (first_zero == 16 && first_equal == 16) {
>         return 3; /* no match */
>     } else if (first_zero == 16) {
>         return 1; /* matching elements, no match for zero */
>     } else if (first_equal < first_zero) {
>         return 2; /* matching elements before match for zero */
>     }
>     return 0; /* match for zero */
> }

The rest of this looks good.


r~
David Hildenbrand May 23, 2019, 12:34 p.m. UTC | #12
On 23.05.19 14:27, Richard Henderson wrote:
> On 5/23/19 3:50 AM, David Hildenbrand wrote:
>> /*
>>  * Returns the number of bits composing one element.
>>  */
>> static uint8_t get_element_bits(uint8_t es)
>> {
>>     return (1 << es) * BITS_PER_BYTE;
>> }
>>
>> /*
>>  * Returns the bitmask for a single element.
>>  */
>> static uint64_t get_single_element_mask(uint8_t es)
>> {
>>     return -1ull >> (64 - get_element_bits(es));
>> }
>>
>> /*
>>  * Returns the bitmask for a single element (excluding the MSB).
>>  */
>> static uint64_t get_single_element_lsbs_mask(uint8_t es)
>> {
>>     return -1ull >> (65 - get_element_bits(es));
>> }
>>
>> /*
>>  * Returns the bitmasks for multiple elements (excluding the MSBs).
>>  */
>> static uint64_t get_element_lsbs_mask(uint8_t es)
>> {
>>     return dup_const(es, get_single_element_lsbs_mask(es));
>> }
>>
>> static int vfae(void *v1, const void *v2, const void *v3, bool in,
>>                 bool rt, bool zs, uint8_t es)
>> {
>>     const uint64_t mask = get_element_lsbs_mask(es);
>>     const int bits = get_element_bits(es);
>>     uint64_t a0, a1, b0, b1, e0, e1, t0, t1, z0, z1;
>>     uint64_t first_zero = 16;
>>     uint64_t first_equal;
>>     int i;
>>
>>     a0 = s390_vec_read_element64(v2, 0);
>>     a1 = s390_vec_read_element64(v2, 1);
>>     b0 = s390_vec_read_element64(v3, 0);
>>     b1 = s390_vec_read_element64(v3, 1);
>>     e0 = 0;
>>     e1 = 0;
>>     /* compare against equality with every other element */
>>     for (i = 0; i < 64; i += bits) {
>>         t0 = i ? rol64(b0, i) : b0;
>>         t1 = i ? rol64(b1, i) : b1;
>>         e0 |= zero_search(a0 ^ t0, mask);
>>         e0 |= zero_search(a0 ^ t1, mask);
>>         e1 |= zero_search(a1 ^ t0, mask);
>>         e1 |= zero_search(a1 ^ t1, mask);
>>     }
> 
> I don't see that this is doing what you want.  You're shifting one element of B
> down, but not broadcasting it so that it is compared against every element of A.
> 
> I'd expect something like
> 
> 	t0 = dup_const(es, b0 >> i);
> 	t1 = dup_const(es, b1 >> i);
> 
> (I also don't see what rol is getting you that shift doesn't.)

Let's assume

a0 = [0, 1, 2, 3]
a1 = [4, 5, 6, 7]

b0 = [8, 8, 8, 4]
b1 = [8, 8, 8, 8]

What I would check is

First iteration

a0 == [8, 8, 8, 4] -> no match
a0 == [8, 8, 8, 8] -> no match
a1 == [8, 8, 8, 4] -> no match
a1 == [8, 8, 8, 8] -> no match

Second iteration

a0 == [8, 8, 4, 8] -> no match
a0 == [8, 8, 8, 8] -> no match
a1 == [8, 8, 4, 8]
a1 == [8, 8, 8, 8] -> no match

...

Last iteration

a0 == [4, 8, 8, 8] -> no match
a0 == [8, 8, 8, 8] -> no match
a1 == [4, 8, 8, 8] -> match in first element
a1 == [8, 8, 8, 8] -> no match

What am I missing?
David Hildenbrand May 23, 2019, 12:59 p.m. UTC | #13
On 23.05.19 14:34, David Hildenbrand wrote:
> On 23.05.19 14:27, Richard Henderson wrote:
>> On 5/23/19 3:50 AM, David Hildenbrand wrote:
>>> /*
>>>  * Returns the number of bits composing one element.
>>>  */
>>> static uint8_t get_element_bits(uint8_t es)
>>> {
>>>     return (1 << es) * BITS_PER_BYTE;
>>> }
>>>
>>> /*
>>>  * Returns the bitmask for a single element.
>>>  */
>>> static uint64_t get_single_element_mask(uint8_t es)
>>> {
>>>     return -1ull >> (64 - get_element_bits(es));
>>> }
>>>
>>> /*
>>>  * Returns the bitmask for a single element (excluding the MSB).
>>>  */
>>> static uint64_t get_single_element_lsbs_mask(uint8_t es)
>>> {
>>>     return -1ull >> (65 - get_element_bits(es));
>>> }
>>>
>>> /*
>>>  * Returns the bitmasks for multiple elements (excluding the MSBs).
>>>  */
>>> static uint64_t get_element_lsbs_mask(uint8_t es)
>>> {
>>>     return dup_const(es, get_single_element_lsbs_mask(es));
>>> }
>>>
>>> static int vfae(void *v1, const void *v2, const void *v3, bool in,
>>>                 bool rt, bool zs, uint8_t es)
>>> {
>>>     const uint64_t mask = get_element_lsbs_mask(es);
>>>     const int bits = get_element_bits(es);
>>>     uint64_t a0, a1, b0, b1, e0, e1, t0, t1, z0, z1;
>>>     uint64_t first_zero = 16;
>>>     uint64_t first_equal;
>>>     int i;
>>>
>>>     a0 = s390_vec_read_element64(v2, 0);
>>>     a1 = s390_vec_read_element64(v2, 1);
>>>     b0 = s390_vec_read_element64(v3, 0);
>>>     b1 = s390_vec_read_element64(v3, 1);
>>>     e0 = 0;
>>>     e1 = 0;
>>>     /* compare against equality with every other element */
>>>     for (i = 0; i < 64; i += bits) {
>>>         t0 = i ? rol64(b0, i) : b0;
>>>         t1 = i ? rol64(b1, i) : b1;
>>>         e0 |= zero_search(a0 ^ t0, mask);
>>>         e0 |= zero_search(a0 ^ t1, mask);
>>>         e1 |= zero_search(a1 ^ t0, mask);
>>>         e1 |= zero_search(a1 ^ t1, mask);
>>>     }
>>
>> I don't see that this is doing what you want.  You're shifting one element of B
>> down, but not broadcasting it so that it is compared against every element of A.
>>
>> I'd expect something like
>>
>> 	t0 = dup_const(es, b0 >> i);
>> 	t1 = dup_const(es, b1 >> i);
>>
>> (I also don't see what rol is getting you that shift doesn't.)
> 
> Let's assume
> 
> a0 = [0, 1, 2, 3]
> a1 = [4, 5, 6, 7]
> 
> b0 = [8, 8, 8, 4]
> b1 = [8, 8, 8, 8]
> 
> What I would check is
> 
> First iteration
> 
> a0 == [8, 8, 8, 4] -> no match
> a0 == [8, 8, 8, 8] -> no match
> a1 == [8, 8, 8, 4] -> no match
> a1 == [8, 8, 8, 8] -> no match
> 
> Second iteration
> 
> a0 == [8, 8, 4, 8] -> no match
> a0 == [8, 8, 8, 8] -> no match
> a1 == [8, 8, 4, 8]
> a1 == [8, 8, 8, 8] -> no match
> 
> ...
> 
> Last iteration
> 
> a0 == [4, 8, 8, 8] -> no match
> a0 == [8, 8, 8, 8] -> no match
> a1 == [4, 8, 8, 8] -> match in first element
> a1 == [8, 8, 8, 8] -> no match
> 
> What am i missing?
> 
> 

I guess I can simplify to

t0 = rol64(b0, i);
t1 = rol64(b1, i);

My approach: Compare all elements of B at a time
Your approach: Compare a single element of B at a time

If I'm not wrong, it boils down to whether rol64() or
dup_const(es, b0 >> i) is faster ;)
Richard Henderson May 23, 2019, 1:50 p.m. UTC | #14
On 5/23/19 8:59 AM, David Hildenbrand wrote:
> I guess I can simplify to
> 
> t0 = rol64(b0, i);
> t1 = rol64(b1, i);

Yes.

> My approach: Compare all elements of B at a time
> Your approach: Compare a single element of B at a time

Ah, I get it now.  This makes total sense, and does make the broadcast unnecessary.


r~

Patch

diff --git a/target/s390x/Makefile.objs b/target/s390x/Makefile.objs
index 993ac93ed6..0a38281a14 100644
--- a/target/s390x/Makefile.objs
+++ b/target/s390x/Makefile.objs
@@ -1,7 +1,7 @@ 
 obj-y += cpu.o cpu_models.o cpu_features.o gdbstub.o interrupt.o helper.o
 obj-$(CONFIG_TCG) += translate.o cc_helper.o excp_helper.o fpu_helper.o
 obj-$(CONFIG_TCG) += int_helper.o mem_helper.o misc_helper.o crypto_helper.o
-obj-$(CONFIG_TCG) += vec_helper.o vec_int_helper.o
+obj-$(CONFIG_TCG) += vec_helper.o vec_int_helper.o vec_string_helper.o
 obj-$(CONFIG_SOFTMMU) += machine.o ioinst.o arch_dump.o mmu_helper.o diag.o
 obj-$(CONFIG_SOFTMMU) += sigp.o
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/target/s390x/helper.h b/target/s390x/helper.h
index 7755a96c33..c45328cf73 100644
--- a/target/s390x/helper.h
+++ b/target/s390x/helper.h
@@ -211,6 +211,14 @@  DEF_HELPER_FLAGS_4(gvec_vscbi8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vscbi16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_4(gvec_vtm, void, ptr, cptr, env, i32)
 
+/* === Vector String Instructions === */
+DEF_HELPER_FLAGS_4(gvec_vfae8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vfae16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vfae32, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
+DEF_HELPER_5(gvec_vfae_cc8, void, ptr, cptr, cptr, env, i32)
+DEF_HELPER_5(gvec_vfae_cc16, void, ptr, cptr, cptr, env, i32)
+DEF_HELPER_5(gvec_vfae_cc32, void, ptr, cptr, cptr, env, i32)
+
 #ifndef CONFIG_USER_ONLY
 DEF_HELPER_3(servc, i32, env, i64, i64)
 DEF_HELPER_4(diag, void, env, i32, i32, i32)
diff --git a/target/s390x/insn-data.def b/target/s390x/insn-data.def
index e61475bdc4..070ce2a471 100644
--- a/target/s390x/insn-data.def
+++ b/target/s390x/insn-data.def
@@ -1191,6 +1191,11 @@ 
 /* VECTOR TEST UNDER MASK */
     F(0xe7d8, VTM,     VRR_a, V,   0, 0, 0, 0, vtm, 0, IF_VEC)
 
+/* === Vector String Instructions === */
+
+/* VECTOR FIND ANY ELEMENT EQUAL */
+    F(0xe782, VFAE,    VRR_b, V,   0, 0, 0, 0, vfae, 0, IF_VEC)
+
 #ifndef CONFIG_USER_ONLY
 /* COMPARE AND SWAP AND PURGE */
     E(0xb250, CSP,     RRE,   Z,   r1_32u, ra2, r1_P, 0, csp, 0, MO_TEUL, IF_PRIV)
diff --git a/target/s390x/translate_vx.inc.c b/target/s390x/translate_vx.inc.c
index 7e0bfcb190..022990dda3 100644
--- a/target/s390x/translate_vx.inc.c
+++ b/target/s390x/translate_vx.inc.c
@@ -2353,3 +2353,34 @@  static DisasJumpType op_vtm(DisasContext *s, DisasOps *o)
     set_cc_static(s);
     return DISAS_NEXT;
 }
+
+static DisasJumpType op_vfae(DisasContext *s, DisasOps *o)
+{
+    const uint8_t es = get_field(s->fields, m4);
+    const uint8_t m5 = get_field(s->fields, m5);
+    static gen_helper_gvec_3_ptr * const cc[3] = {
+        gen_helper_gvec_vfae_cc8,
+        gen_helper_gvec_vfae_cc16,
+        gen_helper_gvec_vfae_cc32,
+    };
+    static gen_helper_gvec_3 * const nocc[3] = {
+        gen_helper_gvec_vfae8,
+        gen_helper_gvec_vfae16,
+        gen_helper_gvec_vfae32,
+    };
+
+    if (es > ES_32) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    if (m5 & 1) {
+        gen_gvec_3_ptr(get_field(s->fields, v1), get_field(s->fields, v2),
+                       get_field(s->fields, v3), cpu_env, m5, cc[es]);
+        set_cc_static(s);
+    } else {
+        gen_gvec_3_ool(get_field(s->fields, v1), get_field(s->fields, v2),
+                       get_field(s->fields, v3), m5, nocc[es]);
+    }
+    return DISAS_NEXT;
+}
diff --git a/target/s390x/vec_string_helper.c b/target/s390x/vec_string_helper.c
new file mode 100644
index 0000000000..8a4e65b70f
--- /dev/null
+++ b/target/s390x/vec_string_helper.c
@@ -0,0 +1,97 @@ 
+/*
+ * QEMU TCG support -- s390x vector string instruction support
+ *
+ * Copyright (C) 2019 Red Hat Inc
+ *
+ * Authors:
+ *   David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "cpu.h"
+#include "internal.h"
+#include "vec.h"
+#include "tcg/tcg-gvec-desc.h"
+#include "exec/helper-proto.h"
+
+#define DEF_VFAE(BITS)                                                         \
+static int vfae##BITS(void *v1, const void *v2, const void *v3, uint8_t m5)    \
+{                                                                              \
+    const bool in = extract32(m5, 3, 1);                                       \
+    const bool rt = extract32(m5, 2, 1);                                       \
+    const bool zs = extract32(m5, 1, 1);                                       \
+    S390Vector tmp = {};                                                       \
+    int first_byte = 16;                                                       \
+    int cc = 3; /* no match */                                                 \
+    int i, j;                                                                  \
+                                                                               \
+    for (i = 0; i < (128 / BITS); i++) {                                       \
+        const uint##BITS##_t data = s390_vec_read_element##BITS(v2, i);        \
+        bool any_equal = false;                                                \
+                                                                               \
+        if (zs && !data) {                                                     \
+            if (cc == 3) {                                                     \
+                first_byte = i * (BITS / 8);                                   \
+                cc = 0; /* match for zero */                                   \
+            } else if (cc != 0) {                                              \
+                cc = 2; /* matching elements before match for zero */          \
+            }                                                                  \
+            if (!rt) {                                                         \
+                break;                                                         \
+            }                                                                  \
+        }                                                                      \
+                                                                               \
+        /* try to match with any other element from the other vector */        \
+        for (j = 0; j < (128 / BITS); j++) {                                   \
+            if (data == s390_vec_read_element##BITS(v3, j)) {                  \
+                any_equal = true;                                              \
+                break;                                                         \
+            }                                                                  \
+        }                                                                      \
+                                                                               \
+        /* invert the result if requested */                                   \
+        any_equal = in ^ any_equal;                                            \
+        if (cc == 3 && any_equal) {                                            \
+            first_byte = i * (BITS / 8);                                       \
+            cc = 1; /* matching elements, no match for zero */                 \
+            if (!zs && !rt) {                                                  \
+                break;                                                         \
+            }                                                                  \
+        }                                                                      \
+        /* indicate bit vector if requested */                                 \
+        if (rt && any_equal) {                                                 \
+            s390_vec_write_element##BITS(&tmp, i, (uint##BITS##_t)-1ull);      \
+        }                                                                      \
+    }                                                                          \
+    if (!rt) {                                                                 \
+        s390_vec_write_element8(&tmp, 7, first_byte);                          \
+    }                                                                          \
+    *(S390Vector *)v1 = tmp;                                                   \
+    return cc;                                                                 \
+}
+DEF_VFAE(8)
+DEF_VFAE(16)
+DEF_VFAE(32)
+
+#define DEF_VFAE_HELPER(BITS)                                                  \
+void HELPER(gvec_vfae##BITS)(void *v1, const void *v2, const void *v3,         \
+                             uint32_t desc)                                    \
+{                                                                              \
+    vfae##BITS(v1, v2, v3, simd_data(desc));                                   \
+}
+DEF_VFAE_HELPER(8)
+DEF_VFAE_HELPER(16)
+DEF_VFAE_HELPER(32)
+
+#define DEF_VFAE_CC_HELPER(BITS)                                               \
+void HELPER(gvec_vfae_cc##BITS)(void *v1, const void *v2, const void *v3,      \
+                                CPUS390XState *env, uint32_t desc)             \
+{                                                                              \
+    env->cc_op = vfae##BITS(v1, v2, v3, simd_data(desc));                      \
+}
+DEF_VFAE_CC_HELPER(8)
+DEF_VFAE_CC_HELPER(16)
+DEF_VFAE_CC_HELPER(32)