diff mbox

[v2] add benchmark for strcoll

Message ID 54FF35DF.6040709@web.de
State New
Headers show

Commit Message

Leonhard Holz March 10, 2015, 6:20 p.m. UTC
This is essentially the benchmark from last year 
(https://sourceware.org/ml/libc-alpha/2014-10/msg00345.html) plus an enchanced 
benchtests/Makefile to generate the needed locales. The attached input file have 
to be deployed to the benchtest directory.

	* benchtests/bench-strcoll.c: New benchmark.
	* benchtests/Makefile: Generate locales and run benchmark.
	* localedata/gen-locale.sh (generate_locale): Make path to charmaps
	independent of current working directory.
	* benchtests/lorem_ipsum_ar_SA: New benchmark input file.
	* benchtests/lorem_ipsum_cs_CZ: Likewise.
	* benchtests/lorem_ipsum_da_DK: Likewise.
	* benchtests/lorem_ipsum_el_GR: Likewise.
	* benchtests/lorem_ipsum_en_GB: Likewise.
	* benchtests/lorem_ipsum_en_US: Likewise.
	* benchtests/lorem_ipsum_es_ES: Likewise.
	* benchtests/lorem_ipsum_fr_FR: Likewise.
	* benchtests/lorem_ipsum_hi_IN: Likewise.
	* benchtests/lorem_ipsum_hu_HU: Likewise.
	* benchtests/lorem_ipsum_is_IS: Likewise.
	* benchtests/lorem_ipsum_it_IT: Likewise.
	* benchtests/lorem_ipsum_iw_IL: Likewise.
	* benchtests/lorem_ipsum_ja_JP: Likewise.
	* benchtests/lorem_ipsum_pl_PL: Likewise.
	* benchtests/lorem_ipsum_pt_PT: Likewise.
	* benchtests/lorem_ipsum_ru_RU: Likewise.
	* benchtests/lorem_ipsum_sr_RS: Likewise.
	* benchtests/lorem_ipsum_sv_SE: Likewise.
	* benchtests/lorem_ipsum_tr_TR: Likewise.
	* benchtests/lorem_ipsum_vi_VN: Likewise.
	* benchtests/lorem_ipsum_zh_CN: Likewise.

Comments

Leonhard Holz March 17, 2015, 10:16 a.m. UTC | #1
Please review: https://sourceware.org/ml/libc-alpha/2015-03/msg00417.html
Leonhard Holz March 24, 2015, 1:01 p.m. UTC | #2
Please review: https://sourceware.org/ml/libc-alpha/2015-03/msg00417.html
Siddhesh Poyarekar March 25, 2015, 10:45 a.m. UTC | #3
On Tue, Mar 10, 2015 at 07:20:15PM +0100, Leonhard Holz wrote:
> This is essentially the benchmark from last year
> (https://sourceware.org/ml/libc-alpha/2014-10/msg00345.html) plus an
> enchanced benchtests/Makefile to generate the needed locales. The attached
> input file have to be deployed to the benchtest directory.
> 
> 	* benchtests/bench-strcoll.c: New benchmark.
> 	* benchtests/Makefile: Generate locales and run benchmark.
> 	* localedata/gen-locale.sh (generate_locale): Make path to charmaps
> 	independent of current working directory.
> 	* benchtests/lorem_ipsum_ar_SA: New benchmark input file.
> 	* benchtests/lorem_ipsum_cs_CZ: Likewise.
> 	* benchtests/lorem_ipsum_da_DK: Likewise.
> 	* benchtests/lorem_ipsum_el_GR: Likewise.
> 	* benchtests/lorem_ipsum_en_GB: Likewise.
> 	* benchtests/lorem_ipsum_en_US: Likewise.
> 	* benchtests/lorem_ipsum_es_ES: Likewise.
> 	* benchtests/lorem_ipsum_fr_FR: Likewise.
> 	* benchtests/lorem_ipsum_hi_IN: Likewise.
> 	* benchtests/lorem_ipsum_hu_HU: Likewise.
> 	* benchtests/lorem_ipsum_is_IS: Likewise.
> 	* benchtests/lorem_ipsum_it_IT: Likewise.
> 	* benchtests/lorem_ipsum_iw_IL: Likewise.
> 	* benchtests/lorem_ipsum_ja_JP: Likewise.
> 	* benchtests/lorem_ipsum_pl_PL: Likewise.
> 	* benchtests/lorem_ipsum_pt_PT: Likewise.
> 	* benchtests/lorem_ipsum_ru_RU: Likewise.
> 	* benchtests/lorem_ipsum_sr_RS: Likewise.
> 	* benchtests/lorem_ipsum_sv_SE: Likewise.
> 	* benchtests/lorem_ipsum_tr_TR: Likewise.
> 	* benchtests/lorem_ipsum_vi_VN: Likewise.
> 	* benchtests/lorem_ipsum_zh_CN: Likewise.

Put the input files in a separate directory, say,
benchtests/strcoll-inputs/.

> diff --git a/benchtests/bench-strcoll.c b/benchtests/bench-strcoll.c
> index e69de29..6636e38 100644
> --- a/benchtests/bench-strcoll.c
> +++ b/benchtests/bench-strcoll.c
> @@ -0,0 +1,382 @@
> +/* Measure strcoll execution time in different locales.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "strcoll"
> +
> +#include <stdio.h>
> +#include <fcntl.h>
> +#include <assert.h>
> +#include <stdlib.h>
> +#include <locale.h>
> +#include <unistd.h>
> +#include <dirent.h>
> +#include "json-lib.h"
> +#include "bench-timing.h"
> +
> +/* many thanks to http://generator.lorem-ipsum.info/ */

Capitalize first letter and two spaces before comment end.

> +#define CHARSET "UTF-8"
> +#define INPUT_PREFIX "lorem_ipsum_"
> +
> +static const char *const locales[] = {
> +  "vi_VN",
> +  "en_US",
> +  "ar_SA",
> +  "en_US",
> +  "zh_CN",
> +  "cs_CZ",
> +  "en_GB",
> +  "da_DK",
> +  "pl_PL",
> +  "fr_FR",
> +  "pt_PT",
> +  "el_GR",
> +  "ru_RU",
> +  "iw_IL",
> +  "es_ES",
> +  "hi_IN",
> +  "sv_SE",
> +  "hu_HU",
> +  "tr_TR",
> +  "is_IS",
> +  "it_IT",
> +  "sr_RS",
> +  "ja_JP"
> +};
> +
> +#define TEXTFILE_DELIMITER " \n\r\t.,?!"
> +#define FILENAMES_DELIMITER "\n\r"
> +
> +static int
> +is_dir (const char *filename)
> +{
> +  int is_dir = 0;
> +  struct stat stats;
> +
> +  if (stat (filename, &stats) == 0)
> +    is_dir = stats.st_mode & S_IFDIR;
> +
> +  return is_dir;
> +}
> +
> +static char *
> +concat_path (const char *dirname, const char *filename)
> +{
> +  char *path;
> +  int ret = asprintf (&path, "%s/%s", dirname, filename);
> +  assert (ret > 0);
> +  return path;
> +}
> +
> +static char *
> +read_file (const char *filename)
> +{
> +  struct stat stats;
> +  char *buffer = NULL;
> +  int fd = open (filename, O_CLOEXEC);
> +
> +  if (fd >= 0)
> +    {
> +      if (fstat (fd, &stats) == 0)
> +	{
> +	  buffer = malloc (stats.st_size + 1);
> +	  if (buffer)
> +	    {
> +	      if (read (fd, buffer, stats.st_size) == stats.st_size)
> +		buffer[stats.st_size] = '\0';
> +	      else
> +		{
> +		  free (buffer);
> +		  buffer = NULL;
> +		}
> +	    }
> +	}
> +      close (fd);
> +    }
> +
> +  return buffer;
> +}
> +
> +static size_t
> +count_words (const char *text, const char *delim)
> +{
> +  size_t wordcount = 0;
> +  char *tmp = strdup (text);
> +
> +  char *token = strtok (tmp, delim);
> +  while (token != NULL)
> +    {
> +      if (*token != '\0')
> +	wordcount++;
> +      token = strtok (NULL, delim);
> +    }
> +
> +  free (tmp);
> +  return wordcount;
> +}
> +
> +typedef struct
> +{
> +  size_t size;
> +  char **words;
> +} word_list;
> +
> +static word_list *
> +new_word_list (size_t size)
> +{
> +  word_list *list = malloc (sizeof (word_list));
> +  assert (list != NULL);
> +  list->size = size;
> +  list->words = malloc (size * sizeof (char *));
> +  assert (list->words != NULL);
> +  return list;
> +}
> +
> +static word_list *
> +merge_word_list (word_list * dst, word_list * src)
> +{
> +  size_t i, n = dst->size;
> +  dst->size += src->size;
> +
> +  char **new_words = malloc (dst->size * sizeof (char *));
> +  assert (new_words != NULL);
> +  memcpy (new_words, dst->words, n * sizeof (char *));
> +  free (dst->words);
> +  dst->words = new_words;
> +
> +  for (i = 0; i < src->size; i++)
> +    dst->words[i + n] = src->words[i];
> +
> +  free (src->words);
> +  free (src);
> +
> +  return dst;
> +}
> +
> +static word_list *
> +file_word_list (const char *dirname)
> +{
> +  DIR *dir;
> +  struct dirent *ent;
> +  word_list *list = NULL;
> +
> +  if ((dir = opendir (dirname)) != NULL)
> +    {
> +      size_t ent_cnt = 0, i = 0;
> +      word_list *sublist = new_word_list (0);
> +
> +      while ((ent = readdir (dir)) != NULL)
> +	if (ent->d_name[0] != '.')
> +	  {
> +	    char *subdirname = concat_path (dirname, ent->d_name);
> +	    if (is_dir (subdirname))
> +		merge_word_list (sublist, file_word_list (subdirname));
> +	    free (subdirname);
> +
> +	    ent_cnt++;
> +	  }
> +
> +      rewinddir (dir);
> +      list = new_word_list (ent_cnt);
> +      while ((ent = readdir (dir)) != NULL)
> +	if (ent->d_name[0] != '.')
> +	  {
> +	    list->words[i] = strdup (ent->d_name);
> +	    i++;
> +	  }
> +
> +      merge_word_list (list, sublist);
> +      closedir (dir);
> +    }
> +
> +  return list;
> +}
> +
> +static word_list *
> +str_word_list (const char *str, const char *delim)
> +{
> +  size_t n = 0;
> +  word_list *list = new_word_list (count_words (str, delim));
> +
> +  char *toks = strdup (str);
> +  char *word = strtok (toks, delim);
> +  while (word != NULL && n < list->size)
> +    {
> +      if (*word != '\0')
> +	list->words[n++] = strdup (word);
> +      word = strtok (NULL, delim);
> +    }
> +
> +  free (toks);
> +  return list;
> +}
> +
> +static word_list *
> +copy_word_list (const word_list *list)
> +{
> +  size_t i;
> +  word_list *copy = new_word_list (list->size);
> +
> +  for (i = 0; i < list->size; i++)
> +    copy->words[i] = strdup (list->words[i]);
> +
> +  return copy;
> +}
> +
> +static void
> +free_word_list (word_list *list)
> +{
> +  size_t i;
> +  for (i = 0; i < list->size; i++)
> +    free (list->words[i]);
> +
> +  free (list->words);
> +  free (list);
> +}
> +
> +static int
> +compare_words (const void *a, const void *b)
> +{
> +  const char *s1 = *(char **) a;
> +  const char *s2 = *(char **) b;
> +  return strcoll (s1, s2);
> +}
> +
> +#undef INNER_LOOP_ITERS
> +#define INNER_LOOP_ITERS 16
> +
> +static void
> +bench_list (json_ctx_t *json_ctx, word_list *list)
> +{
> +  size_t i;
> +  timing_t start, stop, cur;
> +
> +  word_list **tests = malloc (INNER_LOOP_ITERS * sizeof (word_list *));
> +  assert (tests != NULL);
> +  for (i = 0; i < INNER_LOOP_ITERS; i++)
> +    tests[i] = copy_word_list (list);
> +
> +  TIMING_NOW (start);
> +  for (i = 0; i < INNER_LOOP_ITERS; i++)
> +    qsort (tests[i]->words, tests[i]->size, sizeof (char *), compare_words);
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +  setlocale (LC_ALL, "en_US.UTF-8");
> +  json_attr_double (json_ctx, "duration", cur);
> +  json_attr_double (json_ctx, "iterations", i);
> +  json_attr_double (json_ctx, "mean", cur / i);
> +
> +  for (i = 0; i < INNER_LOOP_ITERS; i++)
> +    free_word_list (tests[i]);
> +  free (tests);
> +}
> +
> +#define OK 0
> +#define ERROR_LOCALE 1
> +#define ERROR_IO 2

Use an enum instead.

> +
> +static int
> +bench_file (json_ctx_t *json_ctx, const char *filename, const char *delim,
> +  const char *locale)

Adjust this as recommended below.

> +{
> +  if (setlocale (LC_ALL, locale) == NULL)
> +    return ERROR_LOCALE;
> +
> +  char *text = read_file (filename);
> +  if (text == NULL)
> +    return ERROR_IO;
> +
> +  word_list *list = str_word_list (text, delim);
> +
> +  json_attr_object_begin (json_ctx, locale);
> +  bench_list (json_ctx, list);
> +  json_attr_object_end (json_ctx);
> +
> +  free_word_list (list);
> +  free (text);
> +  return OK;
> +}
> +
> +static int
> +bench_file_list (json_ctx_t *json_ctx, const char *dirname, const char *locale)

If you just create a file with this information once, you can collapse
this special case into the other cases and get rid of this function.

> +{
> +  if (setlocale (LC_ALL, locale) == NULL)
> +    return ERROR_LOCALE;
> +
> +  word_list *list = file_word_list (dirname);
> +  if (list == NULL)
> +    return ERROR_IO;
> +
> +  json_attr_object_begin (json_ctx, "");
> +  json_attr_string (json_ctx, "locale", locale);
> +  bench_list (json_ctx, list);
> +  json_attr_object_end (json_ctx);
> +
> +  free_word_list (list);
> +  return OK;
> +}
> +
> +static int
> +do_bench (void)
> +{
> +  timing_t res;
> +  TIMING_INIT (res);
> +
> +  json_ctx_t *json_ctx = malloc (sizeof (json_ctx_t));
> +  assert (json_ctx != NULL);
> +  json_init (json_ctx, 2, stdout);
> +  json_attr_object_begin (json_ctx, "strcoll");
> +
> +  int result = bench_file_list (json_ctx, "..", "C");
> +  if (result != OK) {
> +    error:
> +    if (result == ERROR_LOCALE)
> +      printf ("Failed to set locale en_US.UTF-8, aborting!\n");

s/en_US.UTF-8/C/

In fact, set a variable and use it to pass to bench_file_list and the
printf so as to avoid such an error.

> +    else if (result == ERROR_IO)
> +      printf ("Failed to read libc directory, aborting!\n");
> +    else
> +      printf ("Error, aborting!\n");

Print the value of the result.

> +    return result;
> +  }
> +  result = bench_file_list (json_ctx, "..", "en_US.UTF-8");
> +  if (result != OK)
> +    goto error;

Instead, split the error handling code into a function, set RESULT and
goto a label 'out' near the end of the function that just returns
RESULT.

> +
> +  size_t i;
> +  char filename[40], locale[15];
> +  for (i = 0; i < (sizeof (locales) / sizeof (locales[0])); i++)
> +    {
> +      sprintf (locale, "%s." CHARSET, locales[i]);
> +      sprintf (filename, INPUT_PREFIX "%s", locales[i]);
> +      result = bench_file (json_ctx, filename, TEXTFILE_DELIMITER, locale);

Just make bench_file accept the locale name and deduce the locale and
filename in the function.  You don't really need to pass the delimiter
either since you don't call bench_file with anything other than
TEXTFILE_DELIMITER.  That could simplify your code to:

	result = bench_file (json_ctx, locales[i]);
	if (result != OK)
	  goto out;

> +      if (result != OK)
> +	goto error;
> +    }
> +
> +  json_attr_object_end (json_ctx);
> +  free (json_ctx);

This is where the out: label goes and you return RESULT instead of OK.

> +  return OK;
> +}
> +
> +#define TEST_FUNCTION do_bench ()
> +
> +/* On slower platforms this test needs more than the default 2 seconds.  */
> +#define TIMEOUT 10
> +
> +#include "../test-skeleton.c"

Don't use the test-skeleton.  Make do_bench the main function.  The
other string benchmarks do this because they were directly ported from
the old test cases.  In fact, they need to be evaluated and improved.

> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 08603a2..4aaff73 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -34,9 +34,22 @@ string-bench := bcopy bzero memccpy memchr memcmp memcpy
> memmem memmove \
>  		mempcpy memset rawmemchr stpcpy stpncpy strcasecmp strcasestr \
>  		strcat strchr strchrnul strcmp strcpy strcspn strlen \
>  		strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
> -		strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok
> +		strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
> +		strcoll
>  string-bench-all := $(string-bench)
> 
> +# We have to generate locales
> +LOCALES := en_US.UTF-8 tr_TR.UTF-8 cs_CZ.UTF-8 fa_IR.UTF-8 fr_FR.UTF-8 \
> +	   ja_JP.UTF-8 si_LK.UTF-8 en_GB.UTF-8 vi_VN.UTF-8 ar_SA.UTF-8 \
> +	   da_DK.UTF-8 pl_PL.UTF-8 pt_PT.UTF-8 el_GR.UTF-8 ru_RU.UTF-8 \
> +	   iw_IL.UTF-8 is_IS.UTF-8 es_ES.UTF-8 hi_IN.UTF-8 sv_SE.UTF-8 \
> +	   hu_HU.UTF-8 it_IT.UTF-8 sr_RS.UTF-8 zh_CN.UTF-8
> +LOCALE_SRCS := $(shell echo "$(LOCALES)"|sed 's/\([^ .]*\)[^ ]*/\1/g')
> +CHARMAPS := $(shell echo "$(LOCALES)" | \
> +		    sed -e 's/[^ .]*[.]\([^ ]*\)/\1/g' -e s/SJIS/SHIFT_JIS/g)
> +CTYPE_FILES = $(addsuffix /LC_CTYPE,$(LOCALES))
> +gen-locales := $(addprefix $(common-objpfx)localedata/,$(CTYPE_FILES))
> +
>  stdlib-bench := strtod
> 
>  benchset := $(string-bench-all) $(stdlib-bench)
> @@ -65,6 +78,19 @@ binaries-bench := $(addprefix $(objpfx)bench-,$(bench))
>  binaries-benchset := $(addprefix $(objpfx)bench-,$(benchset))
>  binaries-bench-malloc := $(addprefix $(objpfx)bench-,$(bench-malloc))
> 
> +# Dependency for the locale files.  We actually make it depend only on
> +# one of the files.
> +$(addprefix $(common-objpfx)localedata/,$(CTYPE_FILES)): %: \
> +  ../localedata/gen-locale.sh \
> +  $(common-objpfx)locale/localedef \
> +  ../localedata/Makefile \
> +  $(addprefix ../localedata/charmaps/,$(CHARMAPS)) \
> +  $(addprefix ../localedata/locales/,$(LOCALE_SRCS))
> +	@$(SHELL) ../localedata/gen-locale.sh $(common-objpfx) \
> +		  '$(built-program-cmd-before-env)' '$(run-program-env)' \
> +		  '$(built-program-cmd-after-env)' $@; \
> +	$(evaluate-test)
> +

AFAICT all of this is copied from localedata/Makefile.  Instead, make
a common snippet that you can include in both these places.  This will
reduce the pain of remembering to update two locations when any of
this code needs to be fixed.

>  # The default duration: 10 seconds.
>  ifndef BENCH_DURATION
>  BENCH_DURATION := 10
> @@ -107,7 +133,7 @@ bench-clean:
>  	rm -f $(binaries-bench-malloc) $(addsuffix .o,$(binaries-bench-malloc))
>  	rm -f $(timing-type) $(addsuffix .o,$(timing-type))
> 
> -bench: $(timing-type) bench-set bench-func bench-malloc
> +bench: $(timing-type) $(gen-locales) bench-set bench-func bench-malloc
> 
>  bench-set: $(binaries-benchset)
>  	for run in $^; do \
> 
> diff --git a/localedata/gen-locale.sh b/localedata/gen-locale.sh
> index 4156f9e..2a48a34 100644
> --- a/localedata/gen-locale.sh
> +++ b/localedata/gen-locale.sh
> @@ -30,7 +30,8 @@ generate_locale ()
>     charmap=$1
>     input=$2
>     out=$3
> -  if ${localedef_before_env} ${run_program_env} I18NPATH=. \
> +  if ${localedef_before_env} ${run_program_env} \
> +     I18NPATH=${common_objpfx}../localedata \

Shouldn't this be I18NPATH=${common_objpfx}localedata?

>        ${localedef_after_env} --quiet -c -f $charmap -i $input \
>   			    ${common_objpfx}localedata/$out
>     then
>
Leonhard Holz March 26, 2015, 1:31 p.m. UTC | #4
>> +static int
>> +bench_file_list (json_ctx_t *json_ctx, const char *dirname, const char *locale)
>
> If you just create a file with this information once, you can collapse
> this special case into the other cases and get rid of this function.
>

This implies that the file list outdates over time. Is this ok?

>> diff --git a/localedata/gen-locale.sh b/localedata/gen-locale.sh
>> index 4156f9e..2a48a34 100644
>> --- a/localedata/gen-locale.sh
>> +++ b/localedata/gen-locale.sh
>> @@ -30,7 +30,8 @@ generate_locale ()
>>      charmap=$1
>>      input=$2
>>      out=$3
>> -  if ${localedef_before_env} ${run_program_env} I18NPATH=. \
>> +  if ${localedef_before_env} ${run_program_env} \
>> +     I18NPATH=${common_objpfx}../localedata \
>
> Shouldn't this be I18NPATH=${common_objpfx}localedata?
>

I18NPATH has to point to glibc/localedata and common_objpfx points to the build 
directory. So I18NPATH=${common_objpfx}localedata should not work.
Andreas Schwab March 26, 2015, 1:44 p.m. UTC | #5
Leonhard Holz <leonhard.holz@web.de> writes:

>>> diff --git a/localedata/gen-locale.sh b/localedata/gen-locale.sh
>>> index 4156f9e..2a48a34 100644
>>> --- a/localedata/gen-locale.sh
>>> +++ b/localedata/gen-locale.sh
>>> @@ -30,7 +30,8 @@ generate_locale ()
>>>      charmap=$1
>>>      input=$2
>>>      out=$3
>>> -  if ${localedef_before_env} ${run_program_env} I18NPATH=. \
>>> +  if ${localedef_before_env} ${run_program_env} \
>>> +     I18NPATH=${common_objpfx}../localedata \
>>
>> Shouldn't this be I18NPATH=${common_objpfx}localedata?
>>
>
> I18NPATH has to point to glibc/localedata and common_objpfx points to the
> build directory. So I18NPATH=${common_objpfx}localedata should not work.

${common_objpfx}../ doesn't get you to the source directory.

Andreas.
Siddhesh Poyarekar March 26, 2015, 4:22 p.m. UTC | #6
On Thu, Mar 26, 2015 at 02:31:29PM +0100, Leonhard Holz wrote:
> This implies that the file list outdates over time. Is this ok?

Sure, we don;t really care about that, do we?  If you do, then just
regenerate the file in the make target by simply doing:

    ls > strcoll-inputs/en_US-tmp
    mv strcoll-inputs/en_US{-tmp,}

and maintain strcoll-inputs/C as a symlink to strcoll-inputs/en_US.

> I18NPATH has to point to glibc/localedata and common_objpfx points to the
> build directory. So I18NPATH=${common_objpfx}localedata should not work.

Then you're looking for $(srcdir) (after you set it to @srcdir@ I
think) and not $(common_objpfx).  Actually since you're in benchtests/
when you're running this, you can just set it as ../localedata,
i.e. without the $(common_objpfx).

Siddhesh
diff mbox

Patch

diff --git a/benchtests/bench-strcoll.c b/benchtests/bench-strcoll.c
index e69de29..6636e38 100644
--- a/benchtests/bench-strcoll.c
+++ b/benchtests/bench-strcoll.c
@@ -0,0 +1,382 @@ 
+/* Measure strcoll execution time in different locales.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#define TEST_NAME "strcoll"
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <locale.h>
+#include <unistd.h>
+#include <dirent.h>
+#include "json-lib.h"
+#include "bench-timing.h"
+
+/* many thanks to http://generator.lorem-ipsum.info/ */
+#define CHARSET "UTF-8"
+#define INPUT_PREFIX "lorem_ipsum_"
+
+static const char *const locales[] = {
+  "vi_VN",
+  "en_US",
+  "ar_SA",
+  "en_US",
+  "zh_CN",
+  "cs_CZ",
+  "en_GB",
+  "da_DK",
+  "pl_PL",
+  "fr_FR",
+  "pt_PT",
+  "el_GR",
+  "ru_RU",
+  "iw_IL",
+  "es_ES",
+  "hi_IN",
+  "sv_SE",
+  "hu_HU",
+  "tr_TR",
+  "is_IS",
+  "it_IT",
+  "sr_RS",
+  "ja_JP"
+};
+
+#define TEXTFILE_DELIMITER " \n\r\t.,?!"
+#define FILENAMES_DELIMITER "\n\r"
+
+static int
+is_dir (const char *filename)
+{
+  int is_dir = 0;
+  struct stat stats;
+
+  if (stat (filename, &stats) == 0)
+    is_dir = stats.st_mode & S_IFDIR;
+
+  return is_dir;
+}
+
+static char *
+concat_path (const char *dirname, const char *filename)
+{
+  char *path;
+  int ret = asprintf (&path, "%s/%s", dirname, filename);
+  assert (ret > 0);
+  return path;
+}
+
+static char *
+read_file (const char *filename)
+{
+  struct stat stats;
+  char *buffer = NULL;
+  int fd = open (filename, O_CLOEXEC);
+
+  if (fd >= 0)
+    {
+      if (fstat (fd, &stats) == 0)
+	{
+	  buffer = malloc (stats.st_size + 1);
+	  if (buffer)
+	    {
+	      if (read (fd, buffer, stats.st_size) == stats.st_size)
+		buffer[stats.st_size] = '\0';
+	      else
+		{
+		  free (buffer);
+		  buffer = NULL;
+		}
+	    }
+	}
+      close (fd);
+    }
+
+  return buffer;
+}
+
+static size_t
+count_words (const char *text, const char *delim)
+{
+  size_t wordcount = 0;
+  char *tmp = strdup (text);
+
+  char *token = strtok (tmp, delim);
+  while (token != NULL)
+    {
+      if (*token != '\0')
+	wordcount++;
+      token = strtok (NULL, delim);
+    }
+
+  free (tmp);
+  return wordcount;
+}
+
+typedef struct
+{
+  size_t size;
+  char **words;
+} word_list;
+
+static word_list *
+new_word_list (size_t size)
+{
+  word_list *list = malloc (sizeof (word_list));
+  assert (list != NULL);
+  list->size = size;
+  list->words = malloc (size * sizeof (char *));
+  assert (list->words != NULL);
+  return list;
+}
+
+static word_list *
+merge_word_list (word_list * dst, word_list * src)
+{
+  size_t i, n = dst->size;
+  dst->size += src->size;
+
+  char **new_words = malloc (dst->size * sizeof (char *));
+  assert (new_words != NULL);
+  memcpy (new_words, dst->words, n * sizeof (char *));
+  free (dst->words);
+  dst->words = new_words;
+
+  for (i = 0; i < src->size; i++)
+    dst->words[i + n] = src->words[i];
+
+  free (src->words);
+  free (src);
+
+  return dst;
+}
+
+static word_list *
+file_word_list (const char *dirname)
+{
+  DIR *dir;
+  struct dirent *ent;
+  word_list *list = NULL;
+
+  if ((dir = opendir (dirname)) != NULL)
+    {
+      size_t ent_cnt = 0, i = 0;
+      word_list *sublist = new_word_list (0);
+
+      while ((ent = readdir (dir)) != NULL)
+	if (ent->d_name[0] != '.')
+	  {
+	    char *subdirname = concat_path (dirname, ent->d_name);
+	    if (is_dir (subdirname))
+		merge_word_list (sublist, file_word_list (subdirname));
+	    free (subdirname);
+
+	    ent_cnt++;
+	  }
+
+      rewinddir (dir);
+      list = new_word_list (ent_cnt);
+      while ((ent = readdir (dir)) != NULL)
+	if (ent->d_name[0] != '.')
+	  {
+	    list->words[i] = strdup (ent->d_name);
+	    i++;
+	  }
+
+      merge_word_list (list, sublist);
+      closedir (dir);
+    }
+
+  return list;
+}
+
+static word_list *
+str_word_list (const char *str, const char *delim)
+{
+  size_t n = 0;
+  word_list *list = new_word_list (count_words (str, delim));
+
+  char *toks = strdup (str);
+  char *word = strtok (toks, delim);
+  while (word != NULL && n < list->size)
+    {
+      if (*word != '\0')
+	list->words[n++] = strdup (word);
+      word = strtok (NULL, delim);
+    }
+
+  free (toks);
+  return list;
+}
+
+static word_list *
+copy_word_list (const word_list *list)
+{
+  size_t i;
+  word_list *copy = new_word_list (list->size);
+
+  for (i = 0; i < list->size; i++)
+    copy->words[i] = strdup (list->words[i]);
+
+  return copy;
+}
+
+static void
+free_word_list (word_list *list)
+{
+  size_t i;
+  for (i = 0; i < list->size; i++)
+    free (list->words[i]);
+
+  free (list->words);
+  free (list);
+}
+
+static int
+compare_words (const void *a, const void *b)
+{
+  const char *s1 = *(char **) a;
+  const char *s2 = *(char **) b;
+  return strcoll (s1, s2);
+}
+
+#undef INNER_LOOP_ITERS
+#define INNER_LOOP_ITERS 16
+
+static void
+bench_list (json_ctx_t *json_ctx, word_list *list)
+{
+  size_t i;
+  timing_t start, stop, cur;
+
+  word_list **tests = malloc (INNER_LOOP_ITERS * sizeof (word_list *));
+  assert (tests != NULL);
+  for (i = 0; i < INNER_LOOP_ITERS; i++)
+    tests[i] = copy_word_list (list);
+
+  TIMING_NOW (start);
+  for (i = 0; i < INNER_LOOP_ITERS; i++)
+    qsort (tests[i]->words, tests[i]->size, sizeof (char *), compare_words);
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+  setlocale (LC_ALL, "en_US.UTF-8");
+  json_attr_double (json_ctx, "duration", cur);
+  json_attr_double (json_ctx, "iterations", i);
+  json_attr_double (json_ctx, "mean", cur / i);
+
+  for (i = 0; i < INNER_LOOP_ITERS; i++)
+    free_word_list (tests[i]);
+  free (tests);
+}
+
+#define OK 0
+#define ERROR_LOCALE 1
+#define ERROR_IO 2
+
+static int
+bench_file (json_ctx_t *json_ctx, const char *filename, const char *delim,
+  const char *locale)
+{
+  if (setlocale (LC_ALL, locale) == NULL)
+    return ERROR_LOCALE;
+
+  char *text = read_file (filename);
+  if (text == NULL)
+    return ERROR_IO;
+
+  word_list *list = str_word_list (text, delim);
+
+  json_attr_object_begin (json_ctx, locale);
+  bench_list (json_ctx, list);
+  json_attr_object_end (json_ctx);
+
+  free_word_list (list);
+  free (text);
+  return OK;
+}
+
+static int
+bench_file_list (json_ctx_t *json_ctx, const char *dirname, const char *locale)
+{
+  if (setlocale (LC_ALL, locale) == NULL)
+    return ERROR_LOCALE;
+
+  word_list *list = file_word_list (dirname);
+  if (list == NULL)
+    return ERROR_IO;
+
+  json_attr_object_begin (json_ctx, "");
+  json_attr_string (json_ctx, "locale", locale);
+  bench_list (json_ctx, list);
+  json_attr_object_end (json_ctx);
+
+  free_word_list (list);
+  return OK;
+}
+
+static int
+do_bench (void)
+{
+  timing_t res;
+  TIMING_INIT (res);
+
+  json_ctx_t *json_ctx = malloc (sizeof (json_ctx_t));
+  assert (json_ctx != NULL);
+  json_init (json_ctx, 2, stdout);
+  json_attr_object_begin (json_ctx, "strcoll");
+
+  int result = bench_file_list (json_ctx, "..", "C");
+  if (result != OK) {
+    error:
+    if (result == ERROR_LOCALE)
+      printf ("Failed to set locale en_US.UTF-8, aborting!\n");
+    else if (result == ERROR_IO)
+      printf ("Failed to read libc directory, aborting!\n");
+    else
+      printf ("Error, aborting!\n");
+    return result;
+  }
+  result = bench_file_list (json_ctx, "..", "en_US.UTF-8");
+  if (result != OK)
+    goto error;
+
+  size_t i;
+  char filename[40], locale[15];
+  for (i = 0; i < (sizeof (locales) / sizeof (locales[0])); i++)
+    {
+      sprintf (locale, "%s." CHARSET, locales[i]);
+      sprintf (filename, INPUT_PREFIX "%s", locales[i]);
+      result = bench_file (json_ctx, filename, TEXTFILE_DELIMITER, locale);
+      if (result != OK)
+	goto error;
+    }
+
+  json_attr_object_end (json_ctx);
+  free (json_ctx);
+  return OK;
+}
+
+#define TEST_FUNCTION do_bench ()
+
+/* On slower platforms this test needs more than the default 2 seconds.  */
+#define TIMEOUT 10
+
+#include "../test-skeleton.c"

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 08603a2..4aaff73 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -34,9 +34,22 @@  string-bench := bcopy bzero memccpy memchr memcmp memcpy memmem 
memmove \
  		mempcpy memset rawmemchr stpcpy stpncpy strcasecmp strcasestr \
  		strcat strchr strchrnul strcmp strcpy strcspn strlen \
  		strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
-		strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok
+		strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
+		strcoll
  string-bench-all := $(string-bench)

+# We have to generate locales
+LOCALES := en_US.UTF-8 tr_TR.UTF-8 cs_CZ.UTF-8 fa_IR.UTF-8 fr_FR.UTF-8 \
+	   ja_JP.UTF-8 si_LK.UTF-8 en_GB.UTF-8 vi_VN.UTF-8 ar_SA.UTF-8 \
+	   da_DK.UTF-8 pl_PL.UTF-8 pt_PT.UTF-8 el_GR.UTF-8 ru_RU.UTF-8 \
+	   iw_IL.UTF-8 is_IS.UTF-8 es_ES.UTF-8 hi_IN.UTF-8 sv_SE.UTF-8 \
+	   hu_HU.UTF-8 it_IT.UTF-8 sr_RS.UTF-8 zh_CN.UTF-8
+LOCALE_SRCS := $(shell echo "$(LOCALES)"|sed 's/\([^ .]*\)[^ ]*/\1/g')
+CHARMAPS := $(shell echo "$(LOCALES)" | \
+		    sed -e 's/[^ .]*[.]\([^ ]*\)/\1/g' -e s/SJIS/SHIFT_JIS/g)
+CTYPE_FILES = $(addsuffix /LC_CTYPE,$(LOCALES))
+gen-locales := $(addprefix $(common-objpfx)localedata/,$(CTYPE_FILES))
+
  stdlib-bench := strtod

  benchset := $(string-bench-all) $(stdlib-bench)
@@ -65,6 +78,19 @@  binaries-bench := $(addprefix $(objpfx)bench-,$(bench))
  binaries-benchset := $(addprefix $(objpfx)bench-,$(benchset))
  binaries-bench-malloc := $(addprefix $(objpfx)bench-,$(bench-malloc))

+# Dependency for the locale files.  We actually make it depend only on
+# one of the files.
+$(addprefix $(common-objpfx)localedata/,$(CTYPE_FILES)): %: \
+  ../localedata/gen-locale.sh \
+  $(common-objpfx)locale/localedef \
+  ../localedata/Makefile \
+  $(addprefix ../localedata/charmaps/,$(CHARMAPS)) \
+  $(addprefix ../localedata/locales/,$(LOCALE_SRCS))
+	@$(SHELL) ../localedata/gen-locale.sh $(common-objpfx) \
+		  '$(built-program-cmd-before-env)' '$(run-program-env)' \
+		  '$(built-program-cmd-after-env)' $@; \
+	$(evaluate-test)
+
  # The default duration: 10 seconds.
  ifndef BENCH_DURATION
  BENCH_DURATION := 10
@@ -107,7 +133,7 @@  bench-clean:
  	rm -f $(binaries-bench-malloc) $(addsuffix .o,$(binaries-bench-malloc))
  	rm -f $(timing-type) $(addsuffix .o,$(timing-type))

-bench: $(timing-type) bench-set bench-func bench-malloc
+bench: $(timing-type) $(gen-locales) bench-set bench-func bench-malloc

  bench-set: $(binaries-benchset)
  	for run in $^; do \

diff --git a/localedata/gen-locale.sh b/localedata/gen-locale.sh
index 4156f9e..2a48a34 100644
--- a/localedata/gen-locale.sh
+++ b/localedata/gen-locale.sh
@@ -30,7 +30,8 @@  generate_locale ()
     charmap=$1
     input=$2
     out=$3
-  if ${localedef_before_env} ${run_program_env} I18NPATH=. \
+  if ${localedef_before_env} ${run_program_env} \
+     I18NPATH=${common_objpfx}../localedata \
        ${localedef_after_env} --quiet -c -f $charmap -i $input \
   			    ${common_objpfx}localedata/$out
     then