diff mbox series

[2/7] stdlib: Simplify getenv

Message ID 20230213135558.3328727-3-adhemerval.zanella@linaro.org
State New
Headers show
Series Remove _STRING_ARCH_unaligned | expand

Commit Message

Adhemerval Zanella Feb. 13, 2023, 1:55 p.m. UTC
And remove _STRING_ARCH_unaligned usage.  The small size optimization
just adds boilerplate code.

Checked on x86_64-linux-gnu and i686-linux-gnu.
---
 stdlib/getenv.c | 63 ++++---------------------------------------------
 1 file changed, 4 insertions(+), 59 deletions(-)

Comments

Wilco Dijkstra Feb. 15, 2023, 5:50 p.m. UTC | #1
Hi Adhemerval,

+  size_t len = __strchrnul (name, '=') - name;

This is a change in behaviour, ignoring anything behind a '=' in name. What is
wrong with strlen?

+  for (char **ep = __environ; *ep != NULL; ++ep)
     {
-      size_t len = strlen (name);
-#if _STRING_ARCH_unaligned
-      name_start = *(const uint16_t *) name;
-#else
-      name_start = (((const unsigned char *) name)[0]
-                   | (((const unsigned char *) name)[1] << 8));
-#endif
-      len -= 2;
-      name += 2;
-
-      for (ep = __environ; *ep != NULL; ++ep)
-       {
-#if _STRING_ARCH_unaligned
-         uint16_t ep_start = *(uint16_t *) *ep;
-#else
-         uint16_t ep_start = (((unsigned char *) *ep)[0]
-                              | (((unsigned char *) *ep)[1] << 8));
-#endif
-
-         if (name_start == ep_start && !strncmp (*ep + 2, name, len)
-             && (*ep)[len + 2] == '=')
-           return &(*ep)[len + 3];
-       }
+      if (strncmp (name, *ep, len) == 0 && (*ep)[len] == '=')
+       return *ep + len + 1;

It's still useful to keep a check for the first character, eg. name[0] == (*ep)[0] as this
is about twice as fast as always calling strncmp.

Cheers,
Wilco
Adhemerval Zanella Feb. 16, 2023, 1:49 p.m. UTC | #2
On 15/02/23 14:50, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
> +  size_t len = __strchrnul (name, '=') - name;
> 
> This is a change in behaviour, ignoring anything behind a '=' in name. What is
> wrong with strlen?

Indeed it is better to keep current semantic.

> 
> +  for (char **ep = __environ; *ep != NULL; ++ep)
>      {
> -      size_t len = strlen (name);
> -#if _STRING_ARCH_unaligned
> -      name_start = *(const uint16_t *) name;
> -#else
> -      name_start = (((const unsigned char *) name)[0]
> -                   | (((const unsigned char *) name)[1] << 8));
> -#endif
> -      len -= 2;
> -      name += 2;
> -
> -      for (ep = __environ; *ep != NULL; ++ep)
> -       {
> -#if _STRING_ARCH_unaligned
> -         uint16_t ep_start = *(uint16_t *) *ep;
> -#else
> -         uint16_t ep_start = (((unsigned char *) *ep)[0]
> -                              | (((unsigned char *) *ep)[1] << 8));
> -#endif
> -
> -         if (name_start == ep_start && !strncmp (*ep + 2, name, len)
> -             && (*ep)[len + 2] == '=')
> -           return &(*ep)[len + 3];
> -       }
> +      if (strncmp (name, *ep, len) == 0 && (*ep)[len] == '=')
> +       return *ep + len + 1;
> 
> It's still useful to keep a check for the first character, eg. name[0] == (*ep)[0] as this
> is about twice as fast as always calling strncmp.

Alright, updated patch below.

From ddfb6419c5e74b34fb22454581fa0636275ec942 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 9 Feb 2023 10:36:57 -0300
Subject: [PATCH 2/7] stdlib: Simplify getenv

And remove _STRING_ARCH_unaligned usage.

Checked on x86_64-linux-gnu and i686-linux-gnu.
---
 stdlib/getenv.c | 68 +++++++------------------------------------------
 1 file changed, 9 insertions(+), 59 deletions(-)

diff --git a/stdlib/getenv.c b/stdlib/getenv.c
index e3157ce2f3..61a42d941d 100644
--- a/stdlib/getenv.c
+++ b/stdlib/getenv.c
@@ -15,76 +15,26 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <endian.h>
-#include <errno.h>
-#include <stdint.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
-
-/* Return the value of the environment variable NAME.  This implementation
-   is tuned a bit in that it assumes no environment variable has an empty
-   name which of course should always be true.  We have a special case for
-   one character names so that for the general case we can assume at least
-   two characters which we can access.  By doing this we can avoid using the
-   `strncmp' most of the time.  */
 char *
 getenv (const char *name)
 {
-  char **ep;
-  uint16_t name_start;
-
   if (__environ == NULL || name[0] == '\0')
     return NULL;
 
-  if (name[1] == '\0')
-    {
-      /* The name of the variable consists of only one character.  Therefore
-	 the first two characters of the environment entry are this character
-	 and a '=' character.  */
-#if __BYTE_ORDER == __LITTLE_ENDIAN || !_STRING_ARCH_unaligned
-      name_start = ('=' << 8) | *(const unsigned char *) name;
-#else
-      name_start = '=' | ((*(const unsigned char *) name) << 8);
-#endif
-      for (ep = __environ; *ep != NULL; ++ep)
-	{
-#if _STRING_ARCH_unaligned
-	  uint16_t ep_start = *(uint16_t *) *ep;
-#else
-	  uint16_t ep_start = (((unsigned char *) *ep)[0]
-			       | (((unsigned char *) *ep)[1] << 8));
-#endif
-	  if (name_start == ep_start)
-	    return &(*ep)[2];
-	}
-    }
-  else
-    {
-      size_t len = strlen (name);
-#if _STRING_ARCH_unaligned
-      name_start = *(const uint16_t *) name;
-#else
-      name_start = (((const unsigned char *) name)[0]
-		    | (((const unsigned char *) name)[1] << 8));
-#endif
-      len -= 2;
-      name += 2;
-
-      for (ep = __environ; *ep != NULL; ++ep)
-	{
-#if _STRING_ARCH_unaligned
-	  uint16_t ep_start = *(uint16_t *) *ep;
-#else
-	  uint16_t ep_start = (((unsigned char *) *ep)[0]
-			       | (((unsigned char *) *ep)[1] << 8));
-#endif
+  bool single_char = name[1] == '\0';
 
-	  if (name_start == ep_start && !strncmp (*ep + 2, name, len)
-	      && (*ep)[len + 2] == '=')
-	    return &(*ep)[len + 3];
-	}
+  size_t len = strlen (name);;
+  for (char **ep = __environ; *ep != NULL; ++ep)
+    {
+      if (single_char && (*ep)[0] == name[0] && (*ep)[1] == '=')
+	return *ep + 2;
+      else if (strncmp (name, *ep, len) == 0 && (*ep)[len] == '=')
+	return *ep + len + 1;
     }
 
   return NULL;
Andreas Schwab Feb. 16, 2023, 2:02 p.m. UTC | #3
On Feb 16 2023, Adhemerval Zanella Netto via Libc-alpha wrote:

> +  size_t len = strlen (name);;

Extra semicolon.
Wilco Dijkstra Feb. 16, 2023, 6:02 p.m. UTC | #4
Hi Adhemerval,

+  size_t len = strlen (name);;
+  for (char **ep = __environ; *ep != NULL; ++ep)
+    {
+      if (single_char && (*ep)[0] == name[0] && (*ep)[1] == '=')
+       return *ep + 2;
+      else if (strncmp (name, *ep, len) == 0 && (*ep)[len] == '=')
+       return *ep + len + 1;

This is about 10-20% slower than the previous patch both for single-char and
multi-char case... The approach I showed it is far simpler and 3-4 times faster
(within 10% of original performance).

Cheers,
Wilco
Adhemerval Zanella Feb. 16, 2023, 6:38 p.m. UTC | #5
On 16/02/23 15:02, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
> +  size_t len = strlen (name);;
> +  for (char **ep = __environ; *ep != NULL; ++ep)
> +    {
> +      if (single_char && (*ep)[0] == name[0] && (*ep)[1] == '=')
> +       return *ep + 2;
> +      else if (strncmp (name, *ep, len) == 0 && (*ep)[len] == '=')
> +       return *ep + len + 1;
> 
> This is about 10-20% slower than the previous patch both for single-char and
> multi-char case... The approach I showed it is far simpler and 3-4 times faster
> (within 10% of original performance).

Do you mean something like:

  size_t len = strlen (name);;
  for (char **ep = __environ; *ep != NULL; ++ep)
    {
      if (((*ep)[0] == name[0] && (*ep)[1] == '=')
          || (strncmp (name, *ep, len) == 0 && (*ep)[len] == '='))
        return *ep + len + 1;
    }
Wilco Dijkstra Feb. 16, 2023, 6:47 p.m. UTC | #6
Hi Adhemerval,

> Do you mean something like:
>
>  size_t len = strlen (name);;
>  for (char **ep = __environ; *ep != NULL; ++ep)
>    {
>      if (((*ep)[0] == name[0] && (*ep)[1] == '=')
>          || (strncmp (name, *ep, len) == 0 && (*ep)[len] == '='))
>        return *ep + len + 1;
>    }

No I meant checking the first character first before doing more expensive calls, so:

      if (name[0] == (*ep)[0] && strncmp (name, *ep, len) == 0 && (*ep)[len] == '=')
        return *ep + len + 1;

Basically this means you get a very tight loop checking the first character - this avoids
most calls to strncmp. This is basically the same result as the current implementation
but without all the defines and complex unaligned/big-endian stuff!

Cheers,
Wilco
Adhemerval Zanella Feb. 16, 2023, 7:29 p.m. UTC | #7
On 16/02/23 15:47, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>> Do you mean something like:
>>
>>   size_t len = strlen (name);;
>>   for (char **ep = __environ; *ep != NULL; ++ep)
>>     {
>>       if (((*ep)[0] == name[0] && (*ep)[1] == '=')
>>           || (strncmp (name, *ep, len) == 0 && (*ep)[len] == '='))
>>         return *ep + len + 1;
>>     }
> 
> No I meant checking the first character first before doing more expensive calls, so:
> 
>       if (name[0] == (*ep)[0] && strncmp (name, *ep, len) == 0 && (*ep)[len] == '=')
>         return *ep + len + 1;
> 
> Basically this means you get a very tight loop checking the first character - this avoids
> most calls to strncmp. This is basically the same result as the current implementation
> but without all the defines and complex unaligned/big-endian stuff!

Ah ok, I see it now.  Do the follow addresses the issues pointed out?


From 373682e6c290f1c38d0319b5695e6a12a388be0c Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 9 Feb 2023 10:36:57 -0300
Subject: [PATCH 2/7] stdlib: Simplify getenv

And remove _STRING_ARCH_unaligned usage.

Checked on x86_64-linux-gnu and i686-linux-gnu.
---
 stdlib/getenv.c | 64 ++++---------------------------------------------
 1 file changed, 5 insertions(+), 59 deletions(-)

diff --git a/stdlib/getenv.c b/stdlib/getenv.c
index e3157ce2f3..6942d23433 100644
--- a/stdlib/getenv.c
+++ b/stdlib/getenv.c
@@ -15,76 +15,22 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <endian.h>
-#include <errno.h>
-#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
-
-/* Return the value of the environment variable NAME.  This implementation
-   is tuned a bit in that it assumes no environment variable has an empty
-   name which of course should always be true.  We have a special case for
-   one character names so that for the general case we can assume at least
-   two characters which we can access.  By doing this we can avoid using the
-   `strncmp' most of the time.  */
 char *
 getenv (const char *name)
 {
-  char **ep;
-  uint16_t name_start;
-
   if (__environ == NULL || name[0] == '\0')
     return NULL;
 
-  if (name[1] == '\0')
-    {
-      /* The name of the variable consists of only one character.  Therefore
-	 the first two characters of the environment entry are this character
-	 and a '=' character.  */
-#if __BYTE_ORDER == __LITTLE_ENDIAN || !_STRING_ARCH_unaligned
-      name_start = ('=' << 8) | *(const unsigned char *) name;
-#else
-      name_start = '=' | ((*(const unsigned char *) name) << 8);
-#endif
-      for (ep = __environ; *ep != NULL; ++ep)
-	{
-#if _STRING_ARCH_unaligned
-	  uint16_t ep_start = *(uint16_t *) *ep;
-#else
-	  uint16_t ep_start = (((unsigned char *) *ep)[0]
-			       | (((unsigned char *) *ep)[1] << 8));
-#endif
-	  if (name_start == ep_start)
-	    return &(*ep)[2];
-	}
-    }
-  else
+  size_t len = strlen (name);;
+  for (char **ep = __environ; *ep != NULL; ++ep)
     {
-      size_t len = strlen (name);
-#if _STRING_ARCH_unaligned
-      name_start = *(const uint16_t *) name;
-#else
-      name_start = (((const unsigned char *) name)[0]
-		    | (((const unsigned char *) name)[1] << 8));
-#endif
-      len -= 2;
-      name += 2;
-
-      for (ep = __environ; *ep != NULL; ++ep)
-	{
-#if _STRING_ARCH_unaligned
-	  uint16_t ep_start = *(uint16_t *) *ep;
-#else
-	  uint16_t ep_start = (((unsigned char *) *ep)[0]
-			       | (((unsigned char *) *ep)[1] << 8));
-#endif
-
-	  if (name_start == ep_start && !strncmp (*ep + 2, name, len)
-	      && (*ep)[len + 2] == '=')
-	    return &(*ep)[len + 3];
-	}
+      if (name[0] == (*ep)[0]
+	  && strncmp (name, *ep, len) == 0 && (*ep)[len] == '=')
+	return *ep + len + 1;
     }
 
   return NULL;
Wilco Dijkstra Feb. 17, 2023, 5:13 p.m. UTC | #8
Hi Adhemerval,

> Ah ok, I see it now.  Do the follow addresses the issues pointed out?

Yes that looks good to me - just one thing (as pointed out by Andreas):

+  size_t len = strlen (name);;

Double semicolon. OK with that fixed.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>

Cheers,
Wilco
diff mbox series

Patch

diff --git a/stdlib/getenv.c b/stdlib/getenv.c
index e3157ce2f3..449bba1e16 100644
--- a/stdlib/getenv.c
+++ b/stdlib/getenv.c
@@ -15,76 +15,21 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <endian.h>
-#include <errno.h>
-#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
-
-/* Return the value of the environment variable NAME.  This implementation
-   is tuned a bit in that it assumes no environment variable has an empty
-   name which of course should always be true.  We have a special case for
-   one character names so that for the general case we can assume at least
-   two characters which we can access.  By doing this we can avoid using the
-   `strncmp' most of the time.  */
 char *
 getenv (const char *name)
 {
-  char **ep;
-  uint16_t name_start;
-
   if (__environ == NULL || name[0] == '\0')
     return NULL;
 
-  if (name[1] == '\0')
-    {
-      /* The name of the variable consists of only one character.  Therefore
-	 the first two characters of the environment entry are this character
-	 and a '=' character.  */
-#if __BYTE_ORDER == __LITTLE_ENDIAN || !_STRING_ARCH_unaligned
-      name_start = ('=' << 8) | *(const unsigned char *) name;
-#else
-      name_start = '=' | ((*(const unsigned char *) name) << 8);
-#endif
-      for (ep = __environ; *ep != NULL; ++ep)
-	{
-#if _STRING_ARCH_unaligned
-	  uint16_t ep_start = *(uint16_t *) *ep;
-#else
-	  uint16_t ep_start = (((unsigned char *) *ep)[0]
-			       | (((unsigned char *) *ep)[1] << 8));
-#endif
-	  if (name_start == ep_start)
-	    return &(*ep)[2];
-	}
-    }
-  else
+  size_t len = __strchrnul (name, '=') - name;
+  for (char **ep = __environ; *ep != NULL; ++ep)
     {
-      size_t len = strlen (name);
-#if _STRING_ARCH_unaligned
-      name_start = *(const uint16_t *) name;
-#else
-      name_start = (((const unsigned char *) name)[0]
-		    | (((const unsigned char *) name)[1] << 8));
-#endif
-      len -= 2;
-      name += 2;
-
-      for (ep = __environ; *ep != NULL; ++ep)
-	{
-#if _STRING_ARCH_unaligned
-	  uint16_t ep_start = *(uint16_t *) *ep;
-#else
-	  uint16_t ep_start = (((unsigned char *) *ep)[0]
-			       | (((unsigned char *) *ep)[1] << 8));
-#endif
-
-	  if (name_start == ep_start && !strncmp (*ep + 2, name, len)
-	      && (*ep)[len + 2] == '=')
-	    return &(*ep)[len + 3];
-	}
+      if (strncmp (name, *ep, len) == 0 && (*ep)[len] == '=')
+	return *ep + len + 1;
     }
 
   return NULL;