fold builtin_tolower, builtin_toupper
diff mbox

Message ID 1436446689-12180-1-git-send-email-rep.dot.nop@gmail.com
State New
Headers show

Commit Message

Bernhard Reutner-Fischer July 9, 2015, 12:58 p.m. UTC
gcc/ChangeLog

2015-07-09  Bernhard Reutner-Fischer  <aldot@gcc.gnu.org>

	* builtins.c (fold_builtin_tolower, fold_builtin_toupper): New
	static functions.
	(fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER.

Signed-off-by: Bernhard Reutner-Fischer <rep.dot.nop@gmail.com>
---
 gcc/builtins.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

Using the three testcases attached to PR66741 where the -1.c one is using
builtins
$ for i in 0 1 2;do gcc -o tolower_strcpy-$i tolower_strcpy-$i.c -Ofast -W -Wall -Wextra -pedantic -DMAIN -msse4.2;done

pristine (trunk@225368):
# tolower_strcpy-0

real	0m6.068s
user	0m3.204s
sys	0m2.840s
# tolower_strcpy-1

real	0m8.097s
user	0m5.548s
sys	0m2.528s
# tolower_strcpy-2

real	0m3.568s
user	0m0.804s
sys	0m2.748s

trunk@225368 + fold tolower/toupper below

# tolower_strcpy-0

real	0m6.055s
user	0m3.212s
sys	0m2.832s
# tolower_strcpy-1

real	0m5.383s
user	0m2.464s
sys	0m2.900s
# tolower_strcpy-2

real	0m3.605s
user	0m0.668s
sys	0m2.924s

The tolower loop now ends up as
.L5:
        movsbl  (%rbx), %edx
        leal    32(%rdx), %ecx
        movl    %edx, %eax
        subl    $65, %edx
        cmpl    $25, %edx
        cmovbe  %ecx, %eax
        addq    $1, %rbx
        movb    %al, -1(%rbx)
        cmpq    %rsi, %rbx
        jne     .L5

instead of the former call

.L5:
        movsbl  (%rbx), %edi
        addq    $1, %rbx
        call    tolower
        movb    %al, -1(%rbx)
        cmpq    %rbp, %rbx
        jne     .L5

Would something like attached be ok for trunk after proper testing?
Advise on the questions inline WRT caching lang_hooks intermediate
results?
Hints on further steps towards fixing the PR?

I think the next step would be to try to teach graphite to fuse the two
loops in tolower_strcpy-0.c. Need to look at graphite..
Then see how to classify builtins that could be expanded early and what
breaks if doing so. This sounds like a potential disaster, fun.
Next, see why the vectorizer (or something else) does not pave the way
to use SSE instruction as the tolower_strcpy-2.c does.

thanks,

Comments

Richard Biener July 9, 2015, 1:46 p.m. UTC | #1
On Thu, 9 Jul 2015, Bernhard Reutner-Fischer wrote:

> gcc/ChangeLog
> 
> 2015-07-09  Bernhard Reutner-Fischer  <aldot@gcc.gnu.org>
> 
> 	* builtins.c (fold_builtin_tolower, fold_builtin_toupper): New
> 	static functions.
> 	(fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER.

As I read it you fold tolower (X) to (X) >= target_char_set ('A')
&& (X) <= target_char_set ('Z') ? (X) - target_char_set ('A') + 
target_char_set ('a');

I don't think this can be correct for all locales which need not
have a lower-case character for all upper-case ones nor do
all letters having one need to be in the range of 'A' to 'Z'.

Joseph will surely correct me if I am wrong.

What works would eventually be constant folding.

Richard.

> Signed-off-by: Bernhard Reutner-Fischer <rep.dot.nop@gmail.com>
> ---
>  gcc/builtins.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 99 insertions(+)
> 
> Using the three testcases attached to PR66741 where the -1.c one is using
> builtins
> $ for i in 0 1 2;do gcc -o tolower_strcpy-$i tolower_strcpy-$i.c -Ofast -W -Wall -Wextra -pedantic -DMAIN -msse4.2;done
> 
> pristine (trunk@225368):
> # tolower_strcpy-0
> 
> real	0m6.068s
> user	0m3.204s
> sys	0m2.840s
> # tolower_strcpy-1
> 
> real	0m8.097s
> user	0m5.548s
> sys	0m2.528s
> # tolower_strcpy-2
> 
> real	0m3.568s
> user	0m0.804s
> sys	0m2.748s
> 
> trunk@225368 + fold tolower/toupper below
> 
> # tolower_strcpy-0
> 
> real	0m6.055s
> user	0m3.212s
> sys	0m2.832s
> # tolower_strcpy-1
> 
> real	0m5.383s
> user	0m2.464s
> sys	0m2.900s
> # tolower_strcpy-2
> 
> real	0m3.605s
> user	0m0.668s
> sys	0m2.924s
> 
> The tolower loop now ends up as
> .L5:
>         movsbl  (%rbx), %edx
>         leal    32(%rdx), %ecx
>         movl    %edx, %eax
>         subl    $65, %edx
>         cmpl    $25, %edx
>         cmovbe  %ecx, %eax
>         addq    $1, %rbx
>         movb    %al, -1(%rbx)
>         cmpq    %rsi, %rbx
>         jne     .L5
> 
> instead of the former call
> 
> .L5:
>         movsbl  (%rbx), %edi
>         addq    $1, %rbx
>         call    tolower
>         movb    %al, -1(%rbx)
>         cmpq    %rbp, %rbx
>         jne     .L5
> 
> Would something like attached be ok for trunk after proper testing?
> Advise on the questions inline WRT caching lang_hooks intermediate
> results?
> Hints on further steps towards fixing the PR?
> 
> I think the next step would be to try to teach graphite to fuse the two
> loops in tolower_strcpy-0.c. Need to look at graphite..
> Then see how to classify builtins that could be expanded early and what
> breaks if doing so. This sounds like a potential disaster, fun.
> Next, see why the vectorizer (or something else) does not pave the way
> to use SSE instruction as the tolower_strcpy-2.c does.
> 
> thanks,
> 
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index 5f53342..421c908 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -204,6 +204,9 @@ static tree fold_builtin_strrchr (location_t, tree, tree, tree);
>  static tree fold_builtin_strspn (location_t, tree, tree);
>  static tree fold_builtin_strcspn (location_t, tree, tree);
>  
> +static tree fold_builtin_tolower (location_t, tree);
> +static tree fold_builtin_toupper (location_t, tree);
> +
>  static rtx expand_builtin_object_size (tree);
>  static rtx expand_builtin_memory_chk (tree, rtx, machine_mode,
>  				      enum built_in_function);
> @@ -10285,6 +10288,12 @@ fold_builtin_1 (location_t loc, tree fndecl, tree arg0)
>      case BUILT_IN_ISDIGIT:
>        return fold_builtin_isdigit (loc, arg0);
>  
> +    case BUILT_IN_TOLOWER:
> +      return fold_builtin_tolower (loc, arg0);
> +
> +    case BUILT_IN_TOUPPER:
> +      return fold_builtin_toupper (loc, arg0);
> +
>      CASE_FLT_FN (BUILT_IN_FINITE):
>      case BUILT_IN_FINITED32:
>      case BUILT_IN_FINITED64:
> @@ -11208,6 +11217,96 @@ fold_builtin_strcspn (location_t loc, tree s1, tree s2)
>      }
>  }
>  
> +
> +/* Simplify a call to the tolower builtin.  ARG is the argument to the call.
> +
> +   Return NULL_TREE if no simplification was possible, otherwise return the
> +   simplified form of the call as a tree.  */
> +
> +static tree
> +fold_builtin_tolower (location_t loc, tree arg)
> +{
> +  if (!validate_arg (arg, INTEGER_TYPE))
> +    return NULL_TREE;
> +
> +  /* Transform tolower(c) -> (unsigned)(c) | 0x20.
> +
> +     More specifically:
> +     unsigned tem = arg - 'A';
> +     if (tem <= ('Z' - 'A'))
> +       arg += 'a' - 'A';
> +     return arg;
> +   */
> +  unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
> +  unsigned HOST_WIDE_INT target_Z = lang_hooks.to_target_charset ('Z');
> +  unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
> +  if (target_A == 0
> +      || target_Z == 0
> +      || target_a == 0)
> +    return NULL_TREE;
> +
> +  arg = fold_convert_loc (loc, unsigned_type_node, arg);
> +  tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
> +			  build_int_cst (unsigned_type_node, target_A));
> +  /* ??? x19 and x20 would better live in static storage; Think:
> +   * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done};
> +   */
> +  unsigned HOST_WIDE_INT x19 = target_Z - target_A;
> +  unsigned HOST_WIDE_INT x20 = target_a - target_A;
> +  tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
> +			 build_int_cst (unsigned_type_node, x19));
> +  tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
> +			 fold_build2 (PLUS_EXPR, unsigned_type_node, arg,
> +				      build_int_cst (unsigned_type_node, x20)),
> +			 arg);
> +  return fold_convert_loc (loc, integer_type_node, tem);
> +}
> +
> +/* Simplify a call to the toupper builtin.  ARG is the argument to the call.
> +
> +   Return NULL_TREE if no simplification was possible, otherwise return the
> +   simplified form of the call as a tree.  */
> +
> +static tree
> +fold_builtin_toupper (location_t loc, tree arg)
> +{
> +  if (!validate_arg (arg, INTEGER_TYPE))
> +    return NULL_TREE;
> +
> +  /* Transform toupper(c) -> (unsigned)(c) ^ 0x20.
> +
> +     More specifically:
> +     unsigned tem = arg - 'a';
> +     if (tem <= ('z' - 'a'))
> +       arg -= 'a' - 'A';
> +     return arg;
> +   */
> +  unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
> +  unsigned HOST_WIDE_INT target_z = lang_hooks.to_target_charset ('z');
> +  unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
> +  if (target_A == 0
> +      || target_z == 0
> +      || target_a == 0)
> +    return NULL_TREE;
> +
> +  arg = fold_convert_loc (loc, unsigned_type_node, arg);
> +  tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
> +			  build_int_cst (unsigned_type_node, target_a));
> +  /* ??? x19 and x20 would better live in static storage; Think:
> +   * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done};
> +   */
> +  unsigned HOST_WIDE_INT x19 = target_z - target_a;
> +  unsigned HOST_WIDE_INT x20 = target_a - target_A;
> +  tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
> +			 build_int_cst (unsigned_type_node, x19));
> +  tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
> +			 fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
> +				      build_int_cst (unsigned_type_node, x20)),
> +			 arg);
> +  return fold_convert_loc (loc, integer_type_node, tem);
> +}
> +
> +
>  /* Fold the next_arg or va_start call EXP. Returns true if there was an error
>     produced.  False otherwise.  This is done so that we don't output the error
>     or warning twice or three times.  */
>
Bernhard Reutner-Fischer July 9, 2015, 3:02 p.m. UTC | #2
On 9 July 2015 at 15:46, Richard Biener <rguenther@suse.de> wrote:
> On Thu, 9 Jul 2015, Bernhard Reutner-Fischer wrote:
>
>> gcc/ChangeLog
>>
>> 2015-07-09  Bernhard Reutner-Fischer  <aldot@gcc.gnu.org>
>>
>>       * builtins.c (fold_builtin_tolower, fold_builtin_toupper): New
>>       static functions.
>>       (fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER.
>
> As I read it you fold tolower (X) to (X) >= target_char_set ('A')
> && (X) <= target_char_set ('Z') ? (X) - target_char_set ('A') +
> target_char_set ('a');
>
> I don't think this can be correct for all locales which need not
> have a lower-case character for all upper-case ones nor do
> all letters having one need to be in the range of 'A' to 'Z'.
>
> Joseph will surely correct me if I am wrong.
>
> What works would eventually be constant folding.

Thinking about it, this is not tolower_l nor towlower so should
probably not be concerned about locales at all.

thanks,
>
> Richard.
>
>> Signed-off-by: Bernhard Reutner-Fischer <rep.dot.nop@gmail.com>
>> ---
>>  gcc/builtins.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 99 insertions(+)
>>
>> Using the three testcases attached to PR66741 where the -1.c one is using
>> builtins
>> $ for i in 0 1 2;do gcc -o tolower_strcpy-$i tolower_strcpy-$i.c -Ofast -W -Wall -Wextra -pedantic -DMAIN -msse4.2;done
>>
>> pristine (trunk@225368):
>> # tolower_strcpy-0
>>
>> real  0m6.068s
>> user  0m3.204s
>> sys   0m2.840s
>> # tolower_strcpy-1
>>
>> real  0m8.097s
>> user  0m5.548s
>> sys   0m2.528s
>> # tolower_strcpy-2
>>
>> real  0m3.568s
>> user  0m0.804s
>> sys   0m2.748s
>>
>> trunk@225368 + fold tolower/toupper below
>>
>> # tolower_strcpy-0
>>
>> real  0m6.055s
>> user  0m3.212s
>> sys   0m2.832s
>> # tolower_strcpy-1
>>
>> real  0m5.383s
>> user  0m2.464s
>> sys   0m2.900s
>> # tolower_strcpy-2
>>
>> real  0m3.605s
>> user  0m0.668s
>> sys   0m2.924s
>>
>> The tolower loop now ends up as
>> .L5:
>>         movsbl  (%rbx), %edx
>>         leal    32(%rdx), %ecx
>>         movl    %edx, %eax
>>         subl    $65, %edx
>>         cmpl    $25, %edx
>>         cmovbe  %ecx, %eax
>>         addq    $1, %rbx
>>         movb    %al, -1(%rbx)
>>         cmpq    %rsi, %rbx
>>         jne     .L5
>>
>> instead of the former call
>>
>> .L5:
>>         movsbl  (%rbx), %edi
>>         addq    $1, %rbx
>>         call    tolower
>>         movb    %al, -1(%rbx)
>>         cmpq    %rbp, %rbx
>>         jne     .L5
>>
>> Would something like attached be ok for trunk after proper testing?
>> Advise on the questions inline WRT caching lang_hooks intermediate
>> results?
>> Hints on further steps towards fixing the PR?
>>
>> I think the next step would be to try to teach graphite to fuse the two
>> loops in tolower_strcpy-0.c. Need to look at graphite..
>> Then see how to classify builtins that could be expanded early and what
>> breaks if doing so. This sounds like a potential disaster, fun.
>> Next, see why the vectorizer (or something else) does not pave the way
>> to use SSE instruction as the tolower_strcpy-2.c does.
>>
>> thanks,
>>
>> diff --git a/gcc/builtins.c b/gcc/builtins.c
>> index 5f53342..421c908 100644
>> --- a/gcc/builtins.c
>> +++ b/gcc/builtins.c
>> @@ -204,6 +204,9 @@ static tree fold_builtin_strrchr (location_t, tree, tree, tree);
>>  static tree fold_builtin_strspn (location_t, tree, tree);
>>  static tree fold_builtin_strcspn (location_t, tree, tree);
>>
>> +static tree fold_builtin_tolower (location_t, tree);
>> +static tree fold_builtin_toupper (location_t, tree);
>> +
>>  static rtx expand_builtin_object_size (tree);
>>  static rtx expand_builtin_memory_chk (tree, rtx, machine_mode,
>>                                     enum built_in_function);
>> @@ -10285,6 +10288,12 @@ fold_builtin_1 (location_t loc, tree fndecl, tree arg0)
>>      case BUILT_IN_ISDIGIT:
>>        return fold_builtin_isdigit (loc, arg0);
>>
>> +    case BUILT_IN_TOLOWER:
>> +      return fold_builtin_tolower (loc, arg0);
>> +
>> +    case BUILT_IN_TOUPPER:
>> +      return fold_builtin_toupper (loc, arg0);
>> +
>>      CASE_FLT_FN (BUILT_IN_FINITE):
>>      case BUILT_IN_FINITED32:
>>      case BUILT_IN_FINITED64:
>> @@ -11208,6 +11217,96 @@ fold_builtin_strcspn (location_t loc, tree s1, tree s2)
>>      }
>>  }
>>
>> +
>> +/* Simplify a call to the tolower builtin.  ARG is the argument to the call.
>> +
>> +   Return NULL_TREE if no simplification was possible, otherwise return the
>> +   simplified form of the call as a tree.  */
>> +
>> +static tree
>> +fold_builtin_tolower (location_t loc, tree arg)
>> +{
>> +  if (!validate_arg (arg, INTEGER_TYPE))
>> +    return NULL_TREE;
>> +
>> +  /* Transform tolower(c) -> (unsigned)(c) | 0x20.
>> +
>> +     More specifically:
>> +     unsigned tem = arg - 'A';
>> +     if (tem <= ('Z' - 'A'))
>> +       arg += 'a' - 'A';
>> +     return arg;
>> +   */
>> +  unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
>> +  unsigned HOST_WIDE_INT target_Z = lang_hooks.to_target_charset ('Z');
>> +  unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
>> +  if (target_A == 0
>> +      || target_Z == 0
>> +      || target_a == 0)
>> +    return NULL_TREE;
>> +
>> +  arg = fold_convert_loc (loc, unsigned_type_node, arg);
>> +  tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
>> +                       build_int_cst (unsigned_type_node, target_A));
>> +  /* ??? x19 and x20 would better live in static storage; Think:
>> +   * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done};
>> +   */
>> +  unsigned HOST_WIDE_INT x19 = target_Z - target_A;
>> +  unsigned HOST_WIDE_INT x20 = target_a - target_A;
>> +  tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
>> +                      build_int_cst (unsigned_type_node, x19));
>> +  tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
>> +                      fold_build2 (PLUS_EXPR, unsigned_type_node, arg,
>> +                                   build_int_cst (unsigned_type_node, x20)),
>> +                      arg);
>> +  return fold_convert_loc (loc, integer_type_node, tem);
>> +}
>> +
>> +/* Simplify a call to the toupper builtin.  ARG is the argument to the call.
>> +
>> +   Return NULL_TREE if no simplification was possible, otherwise return the
>> +   simplified form of the call as a tree.  */
>> +
>> +static tree
>> +fold_builtin_toupper (location_t loc, tree arg)
>> +{
>> +  if (!validate_arg (arg, INTEGER_TYPE))
>> +    return NULL_TREE;
>> +
>> +  /* Transform toupper(c) -> (unsigned)(c) ^ 0x20.
>> +
>> +     More specifically:
>> +     unsigned tem = arg - 'a';
>> +     if (tem <= ('z' - 'a'))
>> +       arg -= 'a' - 'A';
>> +     return arg;
>> +   */
>> +  unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
>> +  unsigned HOST_WIDE_INT target_z = lang_hooks.to_target_charset ('z');
>> +  unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
>> +  if (target_A == 0
>> +      || target_z == 0
>> +      || target_a == 0)
>> +    return NULL_TREE;
>> +
>> +  arg = fold_convert_loc (loc, unsigned_type_node, arg);
>> +  tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
>> +                       build_int_cst (unsigned_type_node, target_a));
>> +  /* ??? x19 and x20 would better live in static storage; Think:
>> +   * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done};
>> +   */
>> +  unsigned HOST_WIDE_INT x19 = target_z - target_a;
>> +  unsigned HOST_WIDE_INT x20 = target_a - target_A;
>> +  tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
>> +                      build_int_cst (unsigned_type_node, x19));
>> +  tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
>> +                      fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
>> +                                   build_int_cst (unsigned_type_node, x20)),
>> +                      arg);
>> +  return fold_convert_loc (loc, integer_type_node, tem);
>> +}
>> +
>> +
>>  /* Fold the next_arg or va_start call EXP. Returns true if there was an error
>>     produced.  False otherwise.  This is done so that we don't output the error
>>     or warning twice or three times.  */
>>
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Dilip Upmanyu, Graham Norton, HRB 21284 (AG Nuernberg)
Ondřej Bílka July 10, 2015, 6:51 a.m. UTC | #3
On Thu, Jul 09, 2015 at 03:46:08PM +0200, Richard Biener wrote:
> On Thu, 9 Jul 2015, Bernhard Reutner-Fischer wrote:
> 
> > gcc/ChangeLog
> > 
> > 2015-07-09  Bernhard Reutner-Fischer  <aldot@gcc.gnu.org>
> > 
> > 	* builtins.c (fold_builtin_tolower, fold_builtin_toupper): New
> > 	static functions.
> > 	(fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER.
> 
> As I read it you fold tolower (X) to (X) >= target_char_set ('A')
> && (X) <= target_char_set ('Z') ? (X) - target_char_set ('A') + 
> target_char_set ('a');
> 
> I don't think this can be correct for all locales which need not
> have a lower-case character for all upper-case ones nor do
> all letters having one need to be in the range of 'A' to 'Z'.
> 
> Joseph will surely correct me if I am wrong.
> 
Thats correct as this doesn't handle toupper('č') with appropriate
single byte locale. You cannot even rely on fact that if x<128 then only
conversion is happens in 'A'..'Z' range, there are locales where that
doesn't hold and we need to check _NL_CTYPE_NONASCII_CASE. We don't
export that so you would need to check that while constructing table with 256 entries.

Also your example is invalid as you used __builtin_tolower instead
tolower. As usual gcc builtins are slow, you will get better performance
with following.

#include <ctype.h>
int foo(char *c)
{
 int i;
 for(i=0;i<1000;i++)
   c[i]=tolower(c[i]);
}


As your example first problem is that it doesn't work with utf8 due
multibyte characters.

Second problem is that sse4.2 doesn't help at all as generating masks
with it is quite slow. Using just sse2 is faster here.

It could be possible to add such function to libc. For vectorization you
would need to use following after checking that _NL_CTYPE_NONASCII_CASE
didn't happen. I didn't finished or tested that, you need set up
char128, a, z to to tests 128 <= x[i], 'A' <= x[i] and x[i] <= 'Z'

void c16(char *_x, char *y)
{
__m128i x = _mm_loadu_si128(_x);

int mask = _mm_movemask_epi8(_mm_cmpgt_epi8(x, char128);
x=_mm_or_si128(x, _mm_and_si128(tolower_bit, 
_mm_and_si128 (_mm_cmpgt_epi8(a,x), _mm_cmpgt_epi8(x,z))));
_mm_storeu_si128(y, x);
while (mask)
{
  int i = ffs(mask);
  y[i] = tolower(y[i]); 
  mask = mask & (mask - 1);
}
}
Bernhard Reutner-Fischer July 10, 2015, 11:52 a.m. UTC | #4
On 10 July 2015 at 08:51, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Thu, Jul 09, 2015 at 03:46:08PM +0200, Richard Biener wrote:
>> On Thu, 9 Jul 2015, Bernhard Reutner-Fischer wrote:

[toupper/tolower patch withdrawn]

>> I don't think this can be correct for all locales which need not
>> have a lower-case character for all upper-case ones nor do
>> all letters having one need to be in the range of 'A' to 'Z'.
>>
>> Joseph will surely correct me if I am wrong.
>>
> Thats correct as this doesn't handle toupper('č') with appropriate
> single byte locale. You cannot even rely on fact that if x<128 then only
> conversion is happens in 'A'..'Z' range, there are locales where that
> doesn't hold and we need to check _NL_CTYPE_NONASCII_CASE. We don't
> export that so you would need to check that while constructing table with 256 entries.

I detest locales.
>
> Also your example is invalid as you used __builtin_tolower instead
> tolower. As usual gcc builtins are slow, you will get better performance

You're of course right, libc usually has a map-lookup for the fast
path for these.
(tolower) (...)  comes to mind but doesn't matter here.

> with following.
>
> #include <ctype.h>
> int foo(char *c)
> {
>  int i;
>  for(i=0;i<1000;i++)
>    c[i]=tolower(c[i]);
> }
>
>
> As your example first problem is that it doesn't work with utf8 due
> multibyte characters.

yea, the app i saw doing that strcpy/tolower has a defined input of
ASCII A-Za-z0-9- so i should not have used toupper in the example in
the first place.

>
> Second problem is that sse4.2 doesn't help at all as generating masks
> with it is quite slow. Using just sse2 is faster here.

The point of the PR was that a) loop-fusion is missing and b) nothing
is vectorized.
The quick sse4.2 example was just an extension my CPU happens to
support and that showed the result would be smaller than before and
maybe even a tiny bit faster.. ;)

>
> It could be possible to add such function to libc. For vectorization you

I think it would be better if GCC was able to fuse two or more loops
and grok to vectorize patterns like these.
As you point out, toupper is a bad example, a better one would perhaps
be something like the attached.

I guess that there is real-world code that does a
memcpy/memmove/str[n]cpy and then mask out some bits in the
destination so this should be useful generally.

thanks for your comments, though!
cheers,
Joseph Myers July 30, 2015, 3:09 p.m. UTC | #5
On Thu, 9 Jul 2015, Richard Biener wrote:

> On Thu, 9 Jul 2015, Bernhard Reutner-Fischer wrote:
> 
> > gcc/ChangeLog
> > 
> > 2015-07-09  Bernhard Reutner-Fischer  <aldot@gcc.gnu.org>
> > 
> > 	* builtins.c (fold_builtin_tolower, fold_builtin_toupper): New
> > 	static functions.
> > 	(fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER.
> 
> As I read it you fold tolower (X) to (X) >= target_char_set ('A')
> && (X) <= target_char_set ('Z') ? (X) - target_char_set ('A') + 
> target_char_set ('a');
> 
> I don't think this can be correct for all locales which need not
> have a lower-case character for all upper-case ones nor do
> all letters having one need to be in the range of 'A' to 'Z'.

And even if the locale contains the ASCII letters, the tolower / toupper 
mapping for them may not be the ASCII one (consider tolower ('I') and 
toupper ('i') in a Turkish locale tr_TR.ISO-8859-9 - the results are 'ı' 
and 'İ' which are single-byte characters in that locale).

Patch
diff mbox

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 5f53342..421c908 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -204,6 +204,9 @@  static tree fold_builtin_strrchr (location_t, tree, tree, tree);
 static tree fold_builtin_strspn (location_t, tree, tree);
 static tree fold_builtin_strcspn (location_t, tree, tree);
 
+static tree fold_builtin_tolower (location_t, tree);
+static tree fold_builtin_toupper (location_t, tree);
+
 static rtx expand_builtin_object_size (tree);
 static rtx expand_builtin_memory_chk (tree, rtx, machine_mode,
 				      enum built_in_function);
@@ -10285,6 +10288,12 @@  fold_builtin_1 (location_t loc, tree fndecl, tree arg0)
     case BUILT_IN_ISDIGIT:
       return fold_builtin_isdigit (loc, arg0);
 
+    case BUILT_IN_TOLOWER:
+      return fold_builtin_tolower (loc, arg0);
+
+    case BUILT_IN_TOUPPER:
+      return fold_builtin_toupper (loc, arg0);
+
     CASE_FLT_FN (BUILT_IN_FINITE):
     case BUILT_IN_FINITED32:
     case BUILT_IN_FINITED64:
@@ -11208,6 +11217,96 @@  fold_builtin_strcspn (location_t loc, tree s1, tree s2)
     }
 }
 
+
+/* Simplify a call to the tolower builtin.  ARG is the argument to the call.
+
+   Return NULL_TREE if no simplification was possible, otherwise return the
+   simplified form of the call as a tree.  */
+
+static tree
+fold_builtin_tolower (location_t loc, tree arg)
+{
+  if (!validate_arg (arg, INTEGER_TYPE))
+    return NULL_TREE;
+
+  /* Transform tolower(c) -> (unsigned)(c) | 0x20.
+
+     More specifically:
+     unsigned tem = arg - 'A';
+     if (tem <= ('Z' - 'A'))
+       arg += 'a' - 'A';
+     return arg;
+   */
+  unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
+  unsigned HOST_WIDE_INT target_Z = lang_hooks.to_target_charset ('Z');
+  unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
+  if (target_A == 0
+      || target_Z == 0
+      || target_a == 0)
+    return NULL_TREE;
+
+  arg = fold_convert_loc (loc, unsigned_type_node, arg);
+  tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
+			  build_int_cst (unsigned_type_node, target_A));
+  /* ??? x19 and x20 would better live in static storage; Think:
+   * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done};
+   */
+  unsigned HOST_WIDE_INT x19 = target_Z - target_A;
+  unsigned HOST_WIDE_INT x20 = target_a - target_A;
+  tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
+			 build_int_cst (unsigned_type_node, x19));
+  tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
+			 fold_build2 (PLUS_EXPR, unsigned_type_node, arg,
+				      build_int_cst (unsigned_type_node, x20)),
+			 arg);
+  return fold_convert_loc (loc, integer_type_node, tem);
+}
+
+/* Simplify a call to the toupper builtin.  ARG is the argument to the call.
+
+   Return NULL_TREE if no simplification was possible, otherwise return the
+   simplified form of the call as a tree.  */
+
+static tree
+fold_builtin_toupper (location_t loc, tree arg)
+{
+  if (!validate_arg (arg, INTEGER_TYPE))
+    return NULL_TREE;
+
+  /* Transform toupper(c) -> (unsigned)(c) ^ 0x20.
+
+     More specifically:
+     unsigned tem = arg - 'a';
+     if (tem <= ('z' - 'a'))
+       arg -= 'a' - 'A';
+     return arg;
+   */
+  unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
+  unsigned HOST_WIDE_INT target_z = lang_hooks.to_target_charset ('z');
+  unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
+  if (target_A == 0
+      || target_z == 0
+      || target_a == 0)
+    return NULL_TREE;
+
+  arg = fold_convert_loc (loc, unsigned_type_node, arg);
+  tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
+			  build_int_cst (unsigned_type_node, target_a));
+  /* ??? x19 and x20 would better live in static storage; Think:
+   * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done};
+   */
+  unsigned HOST_WIDE_INT x19 = target_z - target_a;
+  unsigned HOST_WIDE_INT x20 = target_a - target_A;
+  tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
+			 build_int_cst (unsigned_type_node, x19));
+  tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
+			 fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
+				      build_int_cst (unsigned_type_node, x20)),
+			 arg);
+  return fold_convert_loc (loc, integer_type_node, tem);
+}
+
+
 /* Fold the next_arg or va_start call EXP. Returns true if there was an error
    produced.  False otherwise.  This is done so that we don't output the error
    or warning twice or three times.  */