diff mbox series

[24/28] elf: Implement a string table for ldconfig, with tail merging

Message ID a512949efa4f8cfc68362c86d021720a2f56145e.1601569371.git.fweimer@redhat.com
State New
Headers show
Series glibc-hwcaps support | expand

Commit Message

Florian Weimer Oct. 1, 2020, 4:34 p.m. UTC
This will be used in ldconfig to reduce the ld.so.cache size slightly.
---
 elf/Makefile           |   2 +-
 elf/stringtable.c      | 201 +++++++++++++++++++++++++++++++++++++++++
 elf/stringtable.h      |  61 +++++++++++++
 elf/stringtable_free.c |  32 +++++++
 elf/tst-stringtable.c  | 140 ++++++++++++++++++++++++++++
 5 files changed, 435 insertions(+), 1 deletion(-)
 create mode 100644 elf/stringtable.c
 create mode 100644 elf/stringtable.h
 create mode 100644 elf/stringtable_free.c
 create mode 100644 elf/tst-stringtable.c

Comments

Adhemerval Zanella Oct. 20, 2020, 2:25 p.m. UTC | #1
On 01/10/2020 13:34, Florian Weimer via Libc-alpha wrote:
> This will be used in ldconfig to reduce the ld.so.cache size slightly.

Could you extend the commit message to explain why the 32-bit FNV-1a hash
is used and how the hash table is organized (how collisions are handled,
expected memory usage, strategy used to increase/decrease the bucket size)?

It could help also to explain the usercase a bit more, the 'tail merging'
is not straightforward to understand without dig deep in the code.  Also
why kind of cache size decrease do you expect by using strategy?

> ---
>  elf/Makefile           |   2 +-
>  elf/stringtable.c      | 201 +++++++++++++++++++++++++++++++++++++++++
>  elf/stringtable.h      |  61 +++++++++++++
>  elf/stringtable_free.c |  32 +++++++
>  elf/tst-stringtable.c  | 140 ++++++++++++++++++++++++++++
>  5 files changed, 435 insertions(+), 1 deletion(-)
>  create mode 100644 elf/stringtable.c
>  create mode 100644 elf/stringtable.h
>  create mode 100644 elf/stringtable_free.c
>  create mode 100644 elf/tst-stringtable.c
> 
> diff --git a/elf/Makefile b/elf/Makefile
> index 2c36b08c73..ad50a3e16e 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -172,7 +172,7 @@ tests-container := \
>  
>  tests := tst-tls9 tst-leaks1 \
>  	tst-array1 tst-array2 tst-array3 tst-array4 tst-array5 \
> -	tst-auxv
> +	tst-auxv tst-stringtable
>  tests-internal := tst-tls1 tst-tls2 $(tests-static-internal)
>  tests-static := $(tests-static-normal) $(tests-static-internal)
>  
> diff --git a/elf/stringtable.c b/elf/stringtable.c
> new file mode 100644
> index 0000000000..f9ade50249
> --- /dev/null
> +++ b/elf/stringtable.c
> @@ -0,0 +1,201 @@
> +/* String tables for ld.so.cache construction.  Implementation.

This file misses the Copyright year.

> +   This file is part of the GNU C Library.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published
> +   by the Free Software Foundation; version 2 of the License, or
> +   (at your option) any later version.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
> +
> +#include <assert.h>
> +#include <error.h>
> +#include <ldconfig.h>
> +#include <libintl.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <stringtable.h>
> +
> +static void
> +stringtable_init (struct stringtable *table)
> +{
> +  table->count = 0;
> +  table->allocated = 16;
> +  table->entries = xcalloc (table->allocated, sizeof (table->entries[0]));
> +}
> +

Why 16 elements as initial size? 

> +/* 32-bit FNV-1a hash function.  */
> +static uint32_t
> +fnv1a (const char *string, size_t length)
> +{
> +  const unsigned char *p = (const unsigned char *) string;
> +  uint32_t hash = 2166136261U;
> +  for (size_t i = 0; i < length; ++i)
> +    {
> +      hash ^= p[i];
> +      hash *= 16777619U;
> +    }
> +  return hash;
> +}

Why fnv1a was chosen? 

> +
> +/* Double the capacity of the hash table.  */
> +static void
> +stringtable_rehash (struct stringtable *table)
> +{
> +  /* Cannot overflow because the old allocation size (in bytes) is
> +     larger.  */
> +  uint32_t new_allocated = table->allocated * 2;
> +  struct stringtable_entry **new_entries
> +    = xcalloc (new_allocated, sizeof (table->entries[0]));
> +
> +  uint32_t mask = new_allocated - 1;
> +  for (uint32_t i = 0; i < table->allocated; ++i)
> +    for (struct stringtable_entry *e = table->entries[i]; e != NULL; )
> +      {
> +        struct stringtable_entry *next = e->next;
> +        uint32_t hash = fnv1a (e->string, e->length);
> +        uint32_t new_index = hash & mask;
> +        e->next = new_entries[new_index];
> +        new_entries[new_index] = e;
> +        e = next;
> +      }
> +
> +  free (table->entries);
> +  table->entries = new_entries;
> +  table->allocated = new_allocated;
> +}
> +
> +struct stringtable_entry *
> +stringtable_intern (struct stringtable *table, const char *string)
> +{
> +  if (table->allocated == 0)
> +    stringtable_init (table);

How this could happen? Is it expect the caller to set 'allocated' explicitly?

> +
> +  size_t length = strlen (string);
> +  if (length > (1U << 30))
> +    error (EXIT_FAILURE, 0, _("String table string is too long"));
> +  uint32_t hash = fnv1a (string, length);
> +
> +  /* Return a previously-existing entry.  */
> +  for (struct stringtable_entry *e
> +         = table->entries[hash & (table->allocated - 1)];
> +       e != NULL; e = e->next)
> +    if (e->length == length && memcmp (e->string, string, length) == 0)
> +      return e;
> +
> +  /* Increase the size of the table if necessary.  Keep utilization
> +     below two thirds.  */
> +  if (table->count >= (1U << 30))
> +    error (EXIT_FAILURE, 0, _("String table has too many entries"));
> +  if (table->count * 3 > table->allocated * 2)
> +    stringtable_rehash (table);
> +
> +  /* Add the new table entry.  */
> +  ++table->count;
> +  struct stringtable_entry *e
> +    = xmalloc (offsetof (struct stringtable_entry, string) + length + 1);
> +  uint32_t index = hash & (table->allocated - 1);
> +  e->next = table->entries[index];
> +  table->entries[index] = e;
> +  e->length = length;
> +  e->offset = 0;
> +  memcpy (e->string, string, length + 1);
> +  return e;
> +}
> +
> +/* Sort reversed strings in lexicographic order.  This is used for tail
> +   merging.  */
> +static int
> +finalize_compare (const void *l, const void *r)
> +{
> +  struct stringtable_entry *left = *(struct stringtable_entry **) l;
> +  struct stringtable_entry *right = *(struct stringtable_entry **) r;
> +  size_t to_compare;
> +  if (left->length < right->length)
> +    to_compare = left->length;
> +  else
> +    to_compare = right->length;
> +  for (ssize_t i = to_compare - 1; i >= 0; --i)
> +    {
> +      unsigned char lch = left->string[i];
> +      unsigned char rch = right->string[i];
> +      if (lch != rch)
> +        return lch - rch;
> +    }
> +  if (left->length == right->length)
> +    return 0;
> +  else if (left->length < right->length)
> +    /* Longer strings should come first.  */
> +    return 1;
> +  else
> +    return -1;
> +}

Ok.

> +
> +void
> +stringtable_finalize (struct stringtable *table,
> +                      struct stringtable_finalized *result)
> +{
> +  if (table->count == 0)
> +    {
> +      result->strings = xstrdup ("");
> +      result->size = 0;
> +      return;
> +    }
> +
> +  /* Optimize the order of the strings.  */
> +  struct stringtable_entry **array = xcalloc (table->count, sizeof (*array));
> +  {
> +    size_t j = 0;
> +    for (uint32_t i = 0; i < table->allocated; ++i)
> +      for (struct stringtable_entry *e = table->entries[i]; e != NULL;
> +           e = e->next)
> +        {
> +          array[j] = e;
> +          ++j;
> +        }
> +    assert (j == table->count);
> +  }
> +  qsort (array, table->count, sizeof (*array), finalize_compare);
> +
> +  /* Assign offsets, using table sharing if possible.  */
> +  array[0]->offset = 0;
> +  for (uint32_t j = 1; j < table->count; ++j)
> +    {
> +      struct stringtable_entry *previous = array[j - 1];
> +      struct stringtable_entry *current = array[j];
> +      if (previous->length >= current->length
> +          && memcmp (&previous->string[previous->length - current->length],
> +                     current->string, current->length) == 0)
> +        current->offset = (previous->offset + previous->length
> +                           - current->length);
> +      else if (__builtin_add_overflow (previous->offset,
> +                                       previous->length + 1,
> +                                       &current->offset))
> +        error (EXIT_FAILURE, 0, _("String table is too large"));
> +    }
> +
> +  /* Allocate the result string.  */
> +  {
> +    struct stringtable_entry *last = array[table->count - 1];
> +    if (__builtin_add_overflow (last->offset, last->length + 1,
> +                                &result->size))
> +      error (EXIT_FAILURE, 0, _("String table is too large"));
> +  }
> +  /* The strings are copied from the hash table, so the array is no
> +     longer needed.  */
> +  free (array);
> +  result->strings = xcalloc (result->size, 1);
> +
> +  /* Copy the strings.  */
> +  for (uint32_t i = 0; i < table->allocated; ++i)
> +    for (struct stringtable_entry *e = table->entries[i]; e != NULL;
> +         e = e->next)
> +      if (result->strings[e->offset] == '\0')
> +        memcpy (&result->strings[e->offset], e->string, e->length + 1);
> +}

Ok, I guess allocating a new stringtable_finalized should be simpler than
operating the table itself. 

> diff --git a/elf/stringtable.h b/elf/stringtable.h
> new file mode 100644
> index 0000000000..e35b6c67fd
> --- /dev/null
> +++ b/elf/stringtable.h
> @@ -0,0 +1,61 @@
> +/* String tables for ld.so.cache construction.
> +   This file is part of the GNU C Library.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published
> +   by the Free Software Foundation; version 2 of the License, or
> +   (at your option) any later version.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _STRINGTABLE_H
> +#define _STRINGTABLE_H
> +
> +#include <stddef.h>
> +#include <stdint.h>
> +
> +/* An entry in the string table.  Only the length and string fields are
> +   expected to be used outside the string table code.  */
> +struct stringtable_entry
> +{
> +  struct stringtable_entry *next; /* For collision resolution.  */
> +  uint32_t length;                /* Length of then string.  */
> +  uint32_t offset;                /* From start of finalized table.  */
> +  char string[];                  /* Null-terminated string.  */
> +};
> +
> +/* A string table.  Zero-initialization produces a valid atable.  */
> +struct stringtable
> +{
> +  struct stringtable_entry **entries;
> +  uint32_t count;                 /* Number of elements in the table.  */
> +  uint32_t allocated;             /* Length of the entries array.  */
> +};
> +
> +/* Adds STRING to TABLE.  May return the address of an existing entry.  */
> +struct stringtable_entry *stringtable_intern (struct stringtable *table,
> +                                              const char *string);

I think this name is confusing, why not just 'stringtable_add' or
'stringtable_add_element'?

> +
> +/* Result of stringtable_finalize.  SIZE bytes at STRINGS should be
> +   written to the file.  */
> +struct stringtable_finalized
> +{
> +  char *strings;
> +  size_t size;
> +};
> +
> +/* Assigns offsets to string table entries and computes the serialized
> +   form of the string table.  */
> +void stringtable_finalize (struct stringtable *table,
> +                           struct stringtable_finalized *result);
> +
> +/* Deallocate the string table (but not the TABLE pointer itself).  */
> +void stringtable_free (struct stringtable *table);
> +
> +#endif /* _STRINGTABLE_H */
> diff --git a/elf/stringtable_free.c b/elf/stringtable_free.c
> new file mode 100644
> index 0000000000..0e5296e429
> --- /dev/null
> +++ b/elf/stringtable_free.c
> @@ -0,0 +1,32 @@
> +/* String tables for ld.so.cache construction.  Deallocation (for tests only).
> +   This file is part of the GNU C Library.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published
> +   by the Free Software Foundation; version 2 of the License, or
> +   (at your option) any later version.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdlib.h>
> +#include <stringtable.h>
> +
> +void
> +stringtable_free (struct stringtable *table)
> +{
> +  for (uint32_t i = 0; i < table->allocated; ++i)
> +    for (struct stringtable_entry *e = table->entries[i]; e != NULL; )
> +      {
> +        struct stringtable_entry *next = e->next;
> +        free (e);
> +        e = next;
> +      }
> +  free (table->entries);
> +  *table = (struct stringtable) { 0, };
> +}

Ok.

> diff --git a/elf/tst-stringtable.c b/elf/tst-stringtable.c
> new file mode 100644
> index 0000000000..78ca5434df
> --- /dev/null
> +++ b/elf/tst-stringtable.c
> @@ -0,0 +1,140 @@
> +/* Unit test for ldconfig string tables.
> +   This file is part of the GNU C Library.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published
> +   by the Free Software Foundation; version 2 of the License, or
> +   (at your option) any later version.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdlib.h>
> +#include <string.h>
> +#include <stringtable.h>
> +#include <support/check.h>
> +#include <support/support.h>
> +
> +static int
> +do_test (void)
> +{
> +  /* Empty string table.  */
> +  {
> +    struct stringtable s = { 0, };
> +    struct stringtable_finalized f;
> +    stringtable_finalize (&s, &f);
> +    TEST_COMPARE_STRING (f.strings, "");
> +    TEST_COMPARE (f.size, 0);
> +    free (f.strings);
> +    stringtable_free (&s);
> +  }
> +
> +  /* String table with one empty string.  */
> +  {
> +    struct stringtable s = { 0, };
> +    struct stringtable_entry *e = stringtable_intern (&s, "");
> +    TEST_COMPARE_STRING (e->string, "");
> +    TEST_COMPARE (e->length, 0);
> +    TEST_COMPARE (s.count, 1);
> +
> +    struct stringtable_finalized f;
> +    stringtable_finalize (&s, &f);
> +    TEST_COMPARE (e->offset, 0);
> +    TEST_COMPARE_STRING (f.strings, "");
> +    TEST_COMPARE (f.size, 1);
> +    free (f.strings);
> +    stringtable_free (&s);
> +  }
> +
> +  /* String table with one non-empty string.  */
> +  {
> +    struct stringtable s = { 0, };
> +    struct stringtable_entry *e = stringtable_intern (&s, "name");
> +    TEST_COMPARE_STRING (e->string, "name");
> +    TEST_COMPARE (e->length, 4);
> +    TEST_COMPARE (s.count, 1);
> +
> +    struct stringtable_finalized f;
> +    stringtable_finalize (&s, &f);
> +    TEST_COMPARE (e->offset, 0);
> +    TEST_COMPARE_STRING (f.strings, "name");
> +    TEST_COMPARE (f.size, 5);
> +    free (f.strings);
> +    stringtable_free (&s);
> +  }
> +
> +  /* Two strings, one is a prefix of the other.  Tail-merging can only
> +     happen in one way in this case.  */
> +  {
> +    struct stringtable s = { 0, };
> +    struct stringtable_entry *suffix = stringtable_intern (&s, "suffix");
> +    TEST_COMPARE_STRING (suffix->string, "suffix");
> +    TEST_COMPARE (suffix->length, 6);
> +    TEST_COMPARE (s.count, 1);
> +
> +    struct stringtable_entry *prefix
> +      = stringtable_intern (&s, "prefix-suffix");
> +    TEST_COMPARE_STRING (prefix->string, "prefix-suffix");
> +    TEST_COMPARE (prefix->length, strlen ("prefix-suffix"));
> +    TEST_COMPARE (s.count, 2);
> +
> +    struct stringtable_finalized f;
> +    stringtable_finalize (&s, &f);
> +    TEST_COMPARE (prefix->offset, 0);
> +    TEST_COMPARE (suffix->offset, strlen ("prefix-"));
> +    TEST_COMPARE_STRING (f.strings, "prefix-suffix");
> +    TEST_COMPARE (f.size, sizeof ("prefix-suffix"));
> +    free (f.strings);
> +    stringtable_free (&s);
> +  }
> +
> +  /* String table with various shared prefixes.  Triggers hash
> +     resizing.  */
> +  {
> +    enum { count = 1500 };
> +    char *strings[2 * count];
> +    struct stringtable_entry *entries[2 * count];
> +    struct stringtable s = { 0, };
> +    for (int i = 0; i < count; ++i)
> +      {
> +        strings[i] = xasprintf ("%d", i);
> +        entries[i] = stringtable_intern (&s, strings[i]);
> +        TEST_COMPARE (entries[i]->length, strlen (strings[i]));
> +        TEST_COMPARE_STRING (entries[i]->string, strings[i]);
> +        strings[i + count] = xasprintf ("prefix/%d", i);
> +        entries[i + count] = stringtable_intern (&s, strings[i + count]);
> +        TEST_COMPARE (entries[i + count]->length, strlen (strings[i + count]));
> +        TEST_COMPARE_STRING (entries[i + count]->string, strings[i + count]);
> +      }
> +
> +    struct stringtable_finalized f;
> +    stringtable_finalize (&s, &f);
> +
> +    for (int i = 0; i < 2 * count; ++i)
> +      {
> +        TEST_COMPARE (entries[i]->length, strlen (strings[i]));
> +        TEST_COMPARE_STRING (entries[i]->string, strings[i]);
> +        TEST_COMPARE_STRING (f.strings + entries[i]->offset, strings[i]);
> +        free (strings[i]);
> +      }
> +
> +    free (f.strings);
> +    stringtable_free (&s);
> +  }
> +
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> +
> +/* Re-compile the string table implementation here.  It is not
> +   possible to link against the actual build because it was built for
> +   use in ldconfig.  */
> +#define _(arg) arg
> +#include "stringtable.c"
> +#include "stringtable_free.c"
> 

Ok.
Florian Weimer Oct. 30, 2020, 5:08 p.m. UTC | #2
* Adhemerval Zanella via Libc-alpha:

> On 01/10/2020 13:34, Florian Weimer via Libc-alpha wrote:
>> This will be used in ldconfig to reduce the ld.so.cache size slightly.
>
> Could you extend the commit message to explain why the 32-bit FNV-1a hash
> is used and how the hash table is organized (how collisions are handled,
> expected memory usage, strategy used to increase/decrease the bucket size)?
>
> It could help also to explain the usercase a bit more, the 'tail merging'
> is not straightforward to understand without dig deep in the code.  Also
> why kind of cache size decrease do you expect by using strategy?

That depends whether cached DSOs use sonames as file names (so that no
symbolic links are needed).  Typically that's not the case today, so the
savings are really small.  glibc-hwcaps may change that.

The repost will have an expanded commit message:

elf: Implement a string table for ldconfig, with tail merging

This will be used in ldconfig to reduce the ld.so.cache size slightly.

Tail merging is an optimization where a pointer points into another
string if the first string is a suffix of the second string.

The hash function FNV-1a was chosen because it is simple and achieves
good dispersion even for short strings (so that the hash table bucket
count can be a power of two).  It is clearly superior to the hsearch
hash and the ELF hash in this regard.

The hash table uses chaining for collision resolution.

>> diff --git a/elf/stringtable.c b/elf/stringtable.c
>> new file mode 100644
>> index 0000000000..f9ade50249
>> --- /dev/null
>> +++ b/elf/stringtable.c
>> @@ -0,0 +1,201 @@
>> +/* String tables for ld.so.cache construction.  Implementation.
>
> This file misses the Copyright year.

Fixed throughout.

>> +static void
>> +stringtable_init (struct stringtable *table)
>> +{
>> +  table->count = 0;
>> +  table->allocated = 16;
>> +  table->entries = xcalloc (table->allocated, sizeof (table->entries[0]));
>> +}
>> +
>
> Why 16 elements as initial size?

I'm increasing it to 128 with a comment.  128 is based on the number of
DSOs within glibc itself.

>> +struct stringtable_entry *
>> +stringtable_intern (struct stringtable *table, const char *string)
>> +{
>> +  if (table->allocated == 0)
>> +    stringtable_init (table);
>
> How this could happen? Is it expect the caller to set 'allocated'
> explicitly?

Zero-initialization is valid.  stringtable_free also leaves the table
ready for re-use.

>> +  /* Copy the strings.  */
>> +  for (uint32_t i = 0; i < table->allocated; ++i)
>> +    for (struct stringtable_entry *e = table->entries[i]; e != NULL;
>> +         e = e->next)
>> +      if (result->strings[e->offset] == '\0')
>> +        memcpy (&result->strings[e->offset], e->string, e->length + 1);
>> +}
>
> Ok, I guess allocating a new stringtable_finalized should be simpler than
> operating the table itself.

Sorry, I don't understand.  Do you mean reusing the table allocation in
some way?  Yes, that would be fairly complicated.

>> +/* Adds STRING to TABLE.  May return the address of an existing entry.  */
>> +struct stringtable_entry *stringtable_intern (struct stringtable *table,
>> +                                              const char *string);
>
> I think this name is confusing, why not just 'stringtable_add' or
> 'stringtable_add_element'?

I'm changing it to stringtable_add.  Didn't realize that interning is
obscure terminology.

Thanls,
Florian
Adhemerval Zanella Nov. 3, 2020, 1:05 p.m. UTC | #3
On 30/10/2020 14:08, Florian Weimer wrote:
> * Adhemerval Zanella via Libc-alpha:
> 
>> On 01/10/2020 13:34, Florian Weimer via Libc-alpha wrote:
>>> This will be used in ldconfig to reduce the ld.so.cache size slightly.
>>
>> Could you extend the commit message to explain why the 32-bit FNV-1a hash
>> is used and how the hash table is organized (how collisions are handled,
>> expected memory usage, strategy used to increase/decrease the bucket size)?
>>
>> It could help also to explain the usercase a bit more, the 'tail merging'
>> is not straightforward to understand without dig deep in the code.  Also
>> why kind of cache size decrease do you expect by using strategy?
> 
> That depends whether cached DSOs use sonames as file names (so that no
> symbolic links are needed).  Typically that's not the case today, so the
> savings are really small.  glibc-hwcaps may change that.
> 
> The repost will have an expanded commit message:
> 
> elf: Implement a string table for ldconfig, with tail merging
> 
> This will be used in ldconfig to reduce the ld.so.cache size slightly.
> 
> Tail merging is an optimization where a pointer points into another
> string if the first string is a suffix of the second string.
> 
> The hash function FNV-1a was chosen because it is simple and achieves
> good dispersion even for short strings (so that the hash table bucket
> count can be a power of two).  It is clearly superior to the hsearch
> hash and the ELF hash in this regard.
> 
> The hash table uses chaining for collision resolution.

Looks better, thanks.

> 
>>> diff --git a/elf/stringtable.c b/elf/stringtable.c
>>> new file mode 100644
>>> index 0000000000..f9ade50249
>>> --- /dev/null
>>> +++ b/elf/stringtable.c
>>> @@ -0,0 +1,201 @@
>>> +/* String tables for ld.so.cache construction.  Implementation.
>>
>> This file misses the Copyright year.
> 
> Fixed throughout.
> 
>>> +static void
>>> +stringtable_init (struct stringtable *table)
>>> +{
>>> +  table->count = 0;
>>> +  table->allocated = 16;
>>> +  table->entries = xcalloc (table->allocated, sizeof (table->entries[0]));
>>> +}
>>> +
>>
>> Why 16 elements as initial size?
> 
> I'm increasing it to 128 with a comment.  128 is based on the number of
> DSOs within glibc itself.
> 
>>> +struct stringtable_entry *
>>> +stringtable_intern (struct stringtable *table, const char *string)
>>> +{
>>> +  if (table->allocated == 0)
>>> +    stringtable_init (table);
>>
>> How this could happen? Is it expect the caller to set 'allocated'
>> explicitly?
> 
> Zero-initialization is valid.  stringtable_free also leaves the table
> ready for re-use.

Wouldn't both usages make the above check unnecessary? 

> 
>>> +  /* Copy the strings.  */
>>> +  for (uint32_t i = 0; i < table->allocated; ++i)
>>> +    for (struct stringtable_entry *e = table->entries[i]; e != NULL;
>>> +         e = e->next)
>>> +      if (result->strings[e->offset] == '\0')
>>> +        memcpy (&result->strings[e->offset], e->string, e->length + 1);
>>> +}
>>
>> Ok, I guess allocating a new stringtable_finalized should be simpler than
>> operating the table itself.
> 
> Sorry, I don't understand.  Do you mean reusing the table allocation in
> some way?  Yes, that would be fairly complicated.

Yes, to avoid allocation memory on both new item insertion and on the
finalize operation itself. But I think such optimization does not really
matter for the ldconfig usage.

> 
>>> +/* Adds STRING to TABLE.  May return the address of an existing entry.  */
>>> +struct stringtable_entry *stringtable_intern (struct stringtable *table,
>>> +                                              const char *string);
>>
>> I think this name is confusing, why not just 'stringtable_add' or
>> 'stringtable_add_element'?
> 
> I'm changing it to stringtable_add.  Didn't realize that interning is
> obscure terminology.
> 
> Thanls,
> Florian
>
Florian Weimer Nov. 3, 2020, 3:29 p.m. UTC | #4
* Adhemerval Zanella:

>>>> +struct stringtable_entry *
>>>> +stringtable_intern (struct stringtable *table, const char *string)
>>>> +{
>>>> +  if (table->allocated == 0)
>>>> +    stringtable_init (table);
>>>
>>> How this could happen? Is it expect the caller to set 'allocated'
>>> explicitly?
>> 
>> Zero-initialization is valid.  stringtable_free also leaves the table
>> ready for re-use.
>
> Wouldn't both usages make the above check unnecessary?

Doubling zero is still zero, so the growth code path doesn't work.

Thanks,
Florian
diff mbox series

Patch

diff --git a/elf/Makefile b/elf/Makefile
index 2c36b08c73..ad50a3e16e 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -172,7 +172,7 @@  tests-container := \
 
 tests := tst-tls9 tst-leaks1 \
 	tst-array1 tst-array2 tst-array3 tst-array4 tst-array5 \
-	tst-auxv
+	tst-auxv tst-stringtable
 tests-internal := tst-tls1 tst-tls2 $(tests-static-internal)
 tests-static := $(tests-static-normal) $(tests-static-internal)
 
diff --git a/elf/stringtable.c b/elf/stringtable.c
new file mode 100644
index 0000000000..f9ade50249
--- /dev/null
+++ b/elf/stringtable.c
@@ -0,0 +1,201 @@ 
+/* String tables for ld.so.cache construction.  Implementation.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <error.h>
+#include <ldconfig.h>
+#include <libintl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stringtable.h>
+
+static void
+stringtable_init (struct stringtable *table)
+{
+  table->count = 0;
+  table->allocated = 16;
+  table->entries = xcalloc (table->allocated, sizeof (table->entries[0]));
+}
+
+/* 32-bit FNV-1a hash function.  */
+static uint32_t
+fnv1a (const char *string, size_t length)
+{
+  const unsigned char *p = (const unsigned char *) string;
+  uint32_t hash = 2166136261U;
+  for (size_t i = 0; i < length; ++i)
+    {
+      hash ^= p[i];
+      hash *= 16777619U;
+    }
+  return hash;
+}
+
+/* Double the capacity of the hash table.  */
+static void
+stringtable_rehash (struct stringtable *table)
+{
+  /* Cannot overflow because the old allocation size (in bytes) is
+     larger.  */
+  uint32_t new_allocated = table->allocated * 2;
+  struct stringtable_entry **new_entries
+    = xcalloc (new_allocated, sizeof (table->entries[0]));
+
+  uint32_t mask = new_allocated - 1;
+  for (uint32_t i = 0; i < table->allocated; ++i)
+    for (struct stringtable_entry *e = table->entries[i]; e != NULL; )
+      {
+        struct stringtable_entry *next = e->next;
+        uint32_t hash = fnv1a (e->string, e->length);
+        uint32_t new_index = hash & mask;
+        e->next = new_entries[new_index];
+        new_entries[new_index] = e;
+        e = next;
+      }
+
+  free (table->entries);
+  table->entries = new_entries;
+  table->allocated = new_allocated;
+}
+
+struct stringtable_entry *
+stringtable_intern (struct stringtable *table, const char *string)
+{
+  if (table->allocated == 0)
+    stringtable_init (table);
+
+  size_t length = strlen (string);
+  if (length > (1U << 30))
+    error (EXIT_FAILURE, 0, _("String table string is too long"));
+  uint32_t hash = fnv1a (string, length);
+
+  /* Return a previously-existing entry.  */
+  for (struct stringtable_entry *e
+         = table->entries[hash & (table->allocated - 1)];
+       e != NULL; e = e->next)
+    if (e->length == length && memcmp (e->string, string, length) == 0)
+      return e;
+
+  /* Increase the size of the table if necessary.  Keep utilization
+     below two thirds.  */
+  if (table->count >= (1U << 30))
+    error (EXIT_FAILURE, 0, _("String table has too many entries"));
+  if (table->count * 3 > table->allocated * 2)
+    stringtable_rehash (table);
+
+  /* Add the new table entry.  */
+  ++table->count;
+  struct stringtable_entry *e
+    = xmalloc (offsetof (struct stringtable_entry, string) + length + 1);
+  uint32_t index = hash & (table->allocated - 1);
+  e->next = table->entries[index];
+  table->entries[index] = e;
+  e->length = length;
+  e->offset = 0;
+  memcpy (e->string, string, length + 1);
+  return e;
+}
+
+/* Sort reversed strings in lexicographic order.  This is used for tail
+   merging.  */
+static int
+finalize_compare (const void *l, const void *r)
+{
+  struct stringtable_entry *left = *(struct stringtable_entry **) l;
+  struct stringtable_entry *right = *(struct stringtable_entry **) r;
+  size_t to_compare;
+  if (left->length < right->length)
+    to_compare = left->length;
+  else
+    to_compare = right->length;
+  for (ssize_t i = to_compare - 1; i >= 0; --i)
+    {
+      unsigned char lch = left->string[i];
+      unsigned char rch = right->string[i];
+      if (lch != rch)
+        return lch - rch;
+    }
+  if (left->length == right->length)
+    return 0;
+  else if (left->length < right->length)
+    /* Longer strings should come first.  */
+    return 1;
+  else
+    return -1;
+}
+
+void
+stringtable_finalize (struct stringtable *table,
+                      struct stringtable_finalized *result)
+{
+  if (table->count == 0)
+    {
+      result->strings = xstrdup ("");
+      result->size = 0;
+      return;
+    }
+
+  /* Optimize the order of the strings.  */
+  struct stringtable_entry **array = xcalloc (table->count, sizeof (*array));
+  {
+    size_t j = 0;
+    for (uint32_t i = 0; i < table->allocated; ++i)
+      for (struct stringtable_entry *e = table->entries[i]; e != NULL;
+           e = e->next)
+        {
+          array[j] = e;
+          ++j;
+        }
+    assert (j == table->count);
+  }
+  qsort (array, table->count, sizeof (*array), finalize_compare);
+
+  /* Assign offsets, using table sharing if possible.  */
+  array[0]->offset = 0;
+  for (uint32_t j = 1; j < table->count; ++j)
+    {
+      struct stringtable_entry *previous = array[j - 1];
+      struct stringtable_entry *current = array[j];
+      if (previous->length >= current->length
+          && memcmp (&previous->string[previous->length - current->length],
+                     current->string, current->length) == 0)
+        current->offset = (previous->offset + previous->length
+                           - current->length);
+      else if (__builtin_add_overflow (previous->offset,
+                                       previous->length + 1,
+                                       &current->offset))
+        error (EXIT_FAILURE, 0, _("String table is too large"));
+    }
+
+  /* Allocate the result string.  */
+  {
+    struct stringtable_entry *last = array[table->count - 1];
+    if (__builtin_add_overflow (last->offset, last->length + 1,
+                                &result->size))
+      error (EXIT_FAILURE, 0, _("String table is too large"));
+  }
+  /* The strings are copied from the hash table, so the array is no
+     longer needed.  */
+  free (array);
+  result->strings = xcalloc (result->size, 1);
+
+  /* Copy the strings.  */
+  for (uint32_t i = 0; i < table->allocated; ++i)
+    for (struct stringtable_entry *e = table->entries[i]; e != NULL;
+         e = e->next)
+      if (result->strings[e->offset] == '\0')
+        memcpy (&result->strings[e->offset], e->string, e->length + 1);
+}
diff --git a/elf/stringtable.h b/elf/stringtable.h
new file mode 100644
index 0000000000..e35b6c67fd
--- /dev/null
+++ b/elf/stringtable.h
@@ -0,0 +1,61 @@ 
+/* String tables for ld.so.cache construction.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
+
+#ifndef _STRINGTABLE_H
+#define _STRINGTABLE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* An entry in the string table.  Only the length and string fields are
+   expected to be used outside the string table code.  */
+struct stringtable_entry
+{
+  struct stringtable_entry *next; /* For collision resolution.  */
+  uint32_t length;                /* Length of then string.  */
+  uint32_t offset;                /* From start of finalized table.  */
+  char string[];                  /* Null-terminated string.  */
+};
+
+/* A string table.  Zero-initialization produces a valid atable.  */
+struct stringtable
+{
+  struct stringtable_entry **entries;
+  uint32_t count;                 /* Number of elements in the table.  */
+  uint32_t allocated;             /* Length of the entries array.  */
+};
+
+/* Adds STRING to TABLE.  May return the address of an existing entry.  */
+struct stringtable_entry *stringtable_intern (struct stringtable *table,
+                                              const char *string);
+
+/* Result of stringtable_finalize.  SIZE bytes at STRINGS should be
+   written to the file.  */
+struct stringtable_finalized
+{
+  char *strings;
+  size_t size;
+};
+
+/* Assigns offsets to string table entries and computes the serialized
+   form of the string table.  */
+void stringtable_finalize (struct stringtable *table,
+                           struct stringtable_finalized *result);
+
+/* Deallocate the string table (but not the TABLE pointer itself).  */
+void stringtable_free (struct stringtable *table);
+
+#endif /* _STRINGTABLE_H */
diff --git a/elf/stringtable_free.c b/elf/stringtable_free.c
new file mode 100644
index 0000000000..0e5296e429
--- /dev/null
+++ b/elf/stringtable_free.c
@@ -0,0 +1,32 @@ 
+/* String tables for ld.so.cache construction.  Deallocation (for tests only).
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include <stringtable.h>
+
+void
+stringtable_free (struct stringtable *table)
+{
+  for (uint32_t i = 0; i < table->allocated; ++i)
+    for (struct stringtable_entry *e = table->entries[i]; e != NULL; )
+      {
+        struct stringtable_entry *next = e->next;
+        free (e);
+        e = next;
+      }
+  free (table->entries);
+  *table = (struct stringtable) { 0, };
+}
diff --git a/elf/tst-stringtable.c b/elf/tst-stringtable.c
new file mode 100644
index 0000000000..78ca5434df
--- /dev/null
+++ b/elf/tst-stringtable.c
@@ -0,0 +1,140 @@ 
+/* Unit test for ldconfig string tables.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stringtable.h>
+#include <support/check.h>
+#include <support/support.h>
+
+static int
+do_test (void)
+{
+  /* Empty string table.  */
+  {
+    struct stringtable s = { 0, };
+    struct stringtable_finalized f;
+    stringtable_finalize (&s, &f);
+    TEST_COMPARE_STRING (f.strings, "");
+    TEST_COMPARE (f.size, 0);
+    free (f.strings);
+    stringtable_free (&s);
+  }
+
+  /* String table with one empty string.  */
+  {
+    struct stringtable s = { 0, };
+    struct stringtable_entry *e = stringtable_intern (&s, "");
+    TEST_COMPARE_STRING (e->string, "");
+    TEST_COMPARE (e->length, 0);
+    TEST_COMPARE (s.count, 1);
+
+    struct stringtable_finalized f;
+    stringtable_finalize (&s, &f);
+    TEST_COMPARE (e->offset, 0);
+    TEST_COMPARE_STRING (f.strings, "");
+    TEST_COMPARE (f.size, 1);
+    free (f.strings);
+    stringtable_free (&s);
+  }
+
+  /* String table with one non-empty string.  */
+  {
+    struct stringtable s = { 0, };
+    struct stringtable_entry *e = stringtable_intern (&s, "name");
+    TEST_COMPARE_STRING (e->string, "name");
+    TEST_COMPARE (e->length, 4);
+    TEST_COMPARE (s.count, 1);
+
+    struct stringtable_finalized f;
+    stringtable_finalize (&s, &f);
+    TEST_COMPARE (e->offset, 0);
+    TEST_COMPARE_STRING (f.strings, "name");
+    TEST_COMPARE (f.size, 5);
+    free (f.strings);
+    stringtable_free (&s);
+  }
+
+  /* Two strings, one is a prefix of the other.  Tail-merging can only
+     happen in one way in this case.  */
+  {
+    struct stringtable s = { 0, };
+    struct stringtable_entry *suffix = stringtable_intern (&s, "suffix");
+    TEST_COMPARE_STRING (suffix->string, "suffix");
+    TEST_COMPARE (suffix->length, 6);
+    TEST_COMPARE (s.count, 1);
+
+    struct stringtable_entry *prefix
+      = stringtable_intern (&s, "prefix-suffix");
+    TEST_COMPARE_STRING (prefix->string, "prefix-suffix");
+    TEST_COMPARE (prefix->length, strlen ("prefix-suffix"));
+    TEST_COMPARE (s.count, 2);
+
+    struct stringtable_finalized f;
+    stringtable_finalize (&s, &f);
+    TEST_COMPARE (prefix->offset, 0);
+    TEST_COMPARE (suffix->offset, strlen ("prefix-"));
+    TEST_COMPARE_STRING (f.strings, "prefix-suffix");
+    TEST_COMPARE (f.size, sizeof ("prefix-suffix"));
+    free (f.strings);
+    stringtable_free (&s);
+  }
+
+  /* String table with various shared prefixes.  Triggers hash
+     resizing.  */
+  {
+    enum { count = 1500 };
+    char *strings[2 * count];
+    struct stringtable_entry *entries[2 * count];
+    struct stringtable s = { 0, };
+    for (int i = 0; i < count; ++i)
+      {
+        strings[i] = xasprintf ("%d", i);
+        entries[i] = stringtable_intern (&s, strings[i]);
+        TEST_COMPARE (entries[i]->length, strlen (strings[i]));
+        TEST_COMPARE_STRING (entries[i]->string, strings[i]);
+        strings[i + count] = xasprintf ("prefix/%d", i);
+        entries[i + count] = stringtable_intern (&s, strings[i + count]);
+        TEST_COMPARE (entries[i + count]->length, strlen (strings[i + count]));
+        TEST_COMPARE_STRING (entries[i + count]->string, strings[i + count]);
+      }
+
+    struct stringtable_finalized f;
+    stringtable_finalize (&s, &f);
+
+    for (int i = 0; i < 2 * count; ++i)
+      {
+        TEST_COMPARE (entries[i]->length, strlen (strings[i]));
+        TEST_COMPARE_STRING (entries[i]->string, strings[i]);
+        TEST_COMPARE_STRING (f.strings + entries[i]->offset, strings[i]);
+        free (strings[i]);
+      }
+
+    free (f.strings);
+    stringtable_free (&s);
+  }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
+
+/* Re-compile the string table implementation here.  It is not
+   possible to link against the actual build because it was built for
+   use in ldconfig.  */
+#define _(arg) arg
+#include "stringtable.c"
+#include "stringtable_free.c"