diff mbox

Cleanup of input.c

Message ID AM4PR0701MB2162CFECA9ECD8D497BC4477E4CD0@AM4PR0701MB2162.eurprd07.prod.outlook.com
State New
Headers show

Commit Message

Bernd Edlinger Sept. 26, 2016, 2:30 p.m. UTC
Hi,

this started, because I saw get_next_line returns -1 on error, instead
of false: PR 77699.

But when I was there I also saw that read_line_num is using memmove on
non-aliased src & dest, instead of memcpy.  But then I see that
also adding a NUL byte is superfluos, and all the copying as well....


So the result is a cleanup of input.c that avoids lots of copying
altogether, because it is no longer necessary to do so.


Bootstrapped and reg-tested on x86_64-pc-linux-gnu.
Is it OK for trunk?


Thanks
Bernd.

Comments

David Malcolm Sept. 30, 2016, 10:16 p.m. UTC | #1
On Mon, 2016-09-26 at 14:30 +0000, Bernd Edlinger wrote:
> Hi,
> 
> this started, because I saw get_next_line returns -1 on error,
> instead
> of false: PR 77699.
> 
> But when I was there I also saw that read_line_num is using memmove
> on
> non-aliased src & dest, instead of memcpy.  But then I see that
> also adding a NUL byte is superfluos, and all the copying as well....
> 
> 
> So the result is a cleanup of input.c that avoids lots of copying
> altogether, because it is no longer necessary to do so.
> 
> 
> Bootstrapped and reg-tested on x86_64-pc-linux-gnu.
> Is it OK for trunk?

I'll attempt to review this with my "diagnostics maintainer" hat on - 
 but I'm not as familiar with input.c's implementation as with the restof the diagnostics subsystem.

Did you try running this under valgrind?
  (e.g. "make selftest-valgrind" at least)

FWIW I made the following notes to try to understand the existing
behavior:

  get_next_line:
    sets its *line to values related to c->data, the buffered content
      of all of the file so far...
      c->data can be reallocated in maybe_grow, called by read_data,
      called by maybe_read_line, called by get_next_line
      invalidating the exposed ptr whenever the buffer growth needs a
      re-allocation.

  read_next_line:
    copies the exposed part of c->data to *line
    Note that it's a different *line:
    It calls get_next_line with &l, so *line in get_next_line is 
    writing to "l" in read_next_line, exposing part of c->data
    read_next_line ensures that the passed-in-ptr's buffer is big
    enough, then does a memcpy to it, effectively from c->data.

  goto_next_line:
    also calls get_next_line, but doesn't bother copying the result
    anywhere

  read_line_num:
    calls goto_next_line repeatedly
    then calls read_next_line, thus effectively read_next_line is 
    copying data to the passed-in-ptr

  location_get_source_line:
    calls read_next_line, managing:
      static char *buffer;
      static ssize_t len;
    as the place where the copies are written to
    Hence only one such buffer is live per-process, and it can't leak.
    Hence location_get_source_line can't leak, but the buffer exposed
    to the caller is only valid until the next call to
    location_get_source_line.

> 2016-09-26  Bernd Edlinger  <bernd.edlinger@hotmail.de>
> 
> 	PR preprocessor/77699
> 	* input.c (maybe_grow): Don't allocate one byte extra
headroom.
> 	(get_next_line): Return false on error.
> 	(read_next_line): Removed, use get_next_line instead.
> 	(read_line_num): Don't copy the line.
> 	(location_get_source_line): Don't use static data.
> 
> 
> Index: gcc/input.c
> ===================================================================
> --- gcc/input.c	(revision 240471)
> +++ gcc/input.c	(working copy)
> @@ -432,7 +432,7 @@ maybe_grow (fcache *c)
>      return;
>  
>    size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
> -  c->data = XRESIZEVEC (char, c->data, size + 1);
> +  c->data = XRESIZEVEC (char, c->data, size);
>    c->size = size;
>  }

Seems reasonable; I can't see anywhere where an extra byte could get
used.


> @@ -534,7 +534,7 @@ get_next_line (fcache *c, char **line, ssize_t
*li
>      }
>  
>    if (ferror (c->fp))
> -    return -1;
> +    return false;

OK

>    /* At this point, we've found the end of the of line.  It either
>       points to the '\n' or to one byte after the last byte of the
> @@ -597,36 +597,6 @@ get_next_line (fcache *c, char **line, ssize_t
*li
>    return true;
>  }
>  
> -/* Reads the next line from FILE into *LINE.  If *LINE is too small
> -   (or NULL) it is allocated (or extended) to have enough space to
> -   containe the line.  *LINE_LENGTH must contain the size of the
> -   initial*LINE buffer.  It's then updated by this function to the
> -   actual length of the returned line.  Note that the returned line
> -   can contain several zero bytes.  Also note that the returned
string
> -   is allocated in static storage that is going to be re-used by
> -   subsequent invocations of read_line.  */
> -
> -static bool
> -read_next_line (fcache *cache, char ** line, ssize_t *line_len)
> -{
> -  char *l = NULL;
> -  ssize_t len = 0;
> -
> -  if (!get_next_line (cache, &l, &len))
> -    return false;
> -
> -  if (*line == NULL)
> -    *line = XNEWVEC (char, len);
> -  else
> -    if (*line_len < len)
> -	*line = XRESIZEVEC (char, *line, len);
> -
> -  memcpy (*line, l, len);
> -  *line_len = len;
> -
> -  return true;
> -}
> -
>  /* Consume the next bytes coming from the cache (or from its
>     underlying file if there are remaining unread bytes in the file)
>     until we reach the next end-of-line (or end-of-file).  There is
no
> @@ -651,7 +621,7 @@ goto_next_line (fcache *cache)
>  
>  static bool
>  read_line_num (fcache *c, size_t line_num,
> -	       char ** line, ssize_t *line_len)
> +	       char **line, ssize_t *line_len)
>  {
>    gcc_assert (line_num > 0);
>  
> @@ -705,12 +675,8 @@ read_line_num (fcache *c, size_t line_num,
>  	    {
>  	      /* We have the start/end of the line.  Let's just copy
>  		 it again and we are done.  */
> -	      ssize_t len = i->end_pos - i->start_pos + 1;
> -	      if (*line_len < len)
> -		*line = XRESIZEVEC (char, *line, len);
> -	      memmove (*line, c->data + i->start_pos, len);
> -	      (*line)[len - 1] = '\0';
> -	      *line_len = --len;
> +	      *line = c->data + i->start_pos;
> +	      *line_len = i->end_pos - i->start_pos;
>  	      return true;
>  	    }

I notice that the old code adds a nul-terminator for this case (where
line_num < c->line_num), but not for the case where the line is
obtained via read_next_line.  Ugh.  I wonder if this code at one time
guaranteed NUL-termination, and then bit-rotted.

I believe that are callers of the input.h interface are set up
throughout to not expect nul-terminators on lines, so I think that
change is OK.

> @@ -735,7 +701,7 @@ read_line_num (fcache *c, size_t line_num,
>  
>    /* The line we want is the next one.  Let's read and copy it back
to
>       the caller.  */
> -  return read_next_line (c, line, line_len);
> +  return get_next_line (c, line, line_len);
>  }

If I understand things, rather than returning a copy of the line data,
it will now return a ptr into c->data i.e. fcache's copy of the prefix
of the file that it's buffered so far, right?


>  /* Return the physical source line that corresponds to
FILE_PATH/LINE in a
> @@ -748,8 +714,8 @@ const char *
>  location_get_source_line (const char *file_path, int line,
>  			  int *line_len)
>  {
> -  static char *buffer;
> -  static ssize_t len;
> +  char *buffer;
> +  ssize_t len;
>  
>    if (line == 0)
>      return NULL;

If I understand things, the old behavior was to have a single buffered
line copy, managed by the "static" vars in location_get_source_line, so
that the buffer is only valid until the next call to
location_get_source_line.

If I understand the proposed change, the new behavior will be that the
result will be a ptr into the inside of fcache->data, and thus will
also be only guaranteed valid the next call to location_get_source_line
- since fcache->data could grow, and that growth could lead to a
reallocation.

I like the patch: I believe it makes the implementation simpler, and
slightly more efficient.  Before I'm comfortable approving this, please
could you update the patch to also update some of the comments in
input.c, to better reflect the lifetimes of the various entities.

For example get_next_line's comment currently talks about "Space for
that line has been allocated in the cache thus *LINE has the same life
time as C.", which I think isn't currently true, and which the patch
may make less true :)

I think we could also use some more selftests for
location_get_source_line... there's test_reading_source_line, but this
doesn't yet exercise the
    if (line_num <= c->line_num)
case in read_line_num, when lines are accessed in reverse order.  So
please also update that selftest to exercise that cache access pattern:
read the 3rd line, then the 2nd line.

Thanks
Dave
Bernd Edlinger Oct. 1, 2016, 7:30 a.m. UTC | #2
On 10/01/16 00:16, David Malcolm wrote:
> On Mon, 2016-09-26 at 14:30 +0000, Bernd Edlinger wrote:

>> Hi,

>>

>> this started, because I saw get_next_line returns -1 on error,

>> instead

>> of false: PR 77699.

>>

>> But when I was there I also saw that read_line_num is using memmove

>> on

>> non-aliased src & dest, instead of memcpy.  But then I see that

>> also adding a NUL byte is superfluos, and all the copying as well....

>>

>>

>> So the result is a cleanup of input.c that avoids lots of copying

>> altogether, because it is no longer necessary to do so.

>>

>>

>> Bootstrapped and reg-tested on x86_64-pc-linux-gnu.

>> Is it OK for trunk?

>

> I'll attempt to review this with my "diagnostics maintainer" hat on -

>   but I'm not as familiar with input.c's implementation as with the restof the diagnostics subsystem.

>


Yes.  I gave a few review comments when Dodji Seketeli wrote the
line buffering code, but Jakub did most of the review.  So I am
already a bit familiar with the history of the code here.

> Did you try running this under valgrind?

>    (e.g. "make selftest-valgrind" at least)

>


did not try.

> FWIW I made the following notes to try to understand the existing

> behavior:

>

>    get_next_line:

>      sets its *line to values related to c->data, the buffered content

>        of all of the file so far...

>        c->data can be reallocated in maybe_grow, called by read_data,

>        called by maybe_read_line, called by get_next_line

>        invalidating the exposed ptr whenever the buffer growth needs a

>        re-allocation.

>


Additionally the file cache can be evicted, and the buffer can be
re-used for another file.

>    read_next_line:

>      copies the exposed part of c->data to *line

>      Note that it's a different *line:

>      It calls get_next_line with &l, so *line in get_next_line is

>      writing to "l" in read_next_line, exposing part of c->data

>      read_next_line ensures that the passed-in-ptr's buffer is big

>      enough, then does a memcpy to it, effectively from c->data.

>

>    goto_next_line:

>      also calls get_next_line, but doesn't bother copying the result

>      anywhere

>

>    read_line_num:

>      calls goto_next_line repeatedly

>      then calls read_next_line, thus effectively read_next_line is

>      copying data to the passed-in-ptr

>

>    location_get_source_line:

>      calls read_next_line, managing:

>        static char *buffer;

>        static ssize_t len;

>      as the place where the copies are written to

>      Hence only one such buffer is live per-process, and it can't leak.

>      Hence location_get_source_line can't leak, but the buffer exposed

>      to the caller is only valid until the next call to

>      location_get_source_line.

>

>> 2016-09-26  Bernd Edlinger  <bernd.edlinger@hotmail.de>

>>

>> 	PR preprocessor/77699

>> 	* input.c (maybe_grow): Don't allocate one byte extra

> headroom.

>> 	(get_next_line): Return false on error.

>> 	(read_next_line): Removed, use get_next_line instead.

>> 	(read_line_num): Don't copy the line.

>> 	(location_get_source_line): Don't use static data.

>>

>>

>> Index: gcc/input.c

>> ===================================================================

>> --- gcc/input.c	(revision 240471)

>> +++ gcc/input.c	(working copy)

>> @@ -432,7 +432,7 @@ maybe_grow (fcache *c)

>>       return;

>>

>>     size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;

>> -  c->data = XRESIZEVEC (char, c->data, size + 1);

>> +  c->data = XRESIZEVEC (char, c->data, size);

>>     c->size = size;

>>   }

>

> Seems reasonable; I can't see anywhere where an extra byte could get

> used.

>


Yes. It was used only in the memmove which copied one extra byte,
but that was set to zero afterwards.

>

>> @@ -534,7 +534,7 @@ get_next_line (fcache *c, char **line, ssize_t

> *li

>>       }

>>

>>     if (ferror (c->fp))

>> -    return -1;

>> +    return false;

>

> OK

>

>>     /* At this point, we've found the end of the of line.  It either

>>        points to the '\n' or to one byte after the last byte of the

>> @@ -597,36 +597,6 @@ get_next_line (fcache *c, char **line, ssize_t

> *li

>>     return true;

>>   }

>>

>> -/* Reads the next line from FILE into *LINE.  If *LINE is too small

>> -   (or NULL) it is allocated (or extended) to have enough space to

>> -   containe the line.  *LINE_LENGTH must contain the size of the

>> -   initial*LINE buffer.  It's then updated by this function to the

>> -   actual length of the returned line.  Note that the returned line

>> -   can contain several zero bytes.  Also note that the returned

> string

>> -   is allocated in static storage that is going to be re-used by

>> -   subsequent invocations of read_line.  */

>> -

>> -static bool

>> -read_next_line (fcache *cache, char ** line, ssize_t *line_len)

>> -{

>> -  char *l = NULL;

>> -  ssize_t len = 0;

>> -

>> -  if (!get_next_line (cache, &l, &len))

>> -    return false;

>> -

>> -  if (*line == NULL)

>> -    *line = XNEWVEC (char, len);

>> -  else

>> -    if (*line_len < len)

>> -	*line = XRESIZEVEC (char, *line, len);

>> -

>> -  memcpy (*line, l, len);

>> -  *line_len = len;

>> -

>> -  return true;

>> -}

>> -

>>   /* Consume the next bytes coming from the cache (or from its

>>      underlying file if there are remaining unread bytes in the file)

>>      until we reach the next end-of-line (or end-of-file).  There is

> no

>> @@ -651,7 +621,7 @@ goto_next_line (fcache *cache)

>>

>>   static bool

>>   read_line_num (fcache *c, size_t line_num,

>> -	       char ** line, ssize_t *line_len)

>> +	       char **line, ssize_t *line_len)

>>   {

>>     gcc_assert (line_num > 0);

>>

>> @@ -705,12 +675,8 @@ read_line_num (fcache *c, size_t line_num,

>>   	    {

>>   	      /* We have the start/end of the line.  Let's just copy

>>   		 it again and we are done.  */

>> -	      ssize_t len = i->end_pos - i->start_pos + 1;

>> -	      if (*line_len < len)

>> -		*line = XRESIZEVEC (char, *line, len);

>> -	      memmove (*line, c->data + i->start_pos, len);

>> -	      (*line)[len - 1] = '\0';

>> -	      *line_len = --len;

>> +	      *line = c->data + i->start_pos;

>> +	      *line_len = i->end_pos - i->start_pos;

>>   	      return true;

>>   	    }

>

> I notice that the old code adds a nul-terminator for this case (where

> line_num < c->line_num), but not for the case where the line is

> obtained via read_next_line.  Ugh.  I wonder if this code at one time

> guaranteed NUL-termination, and then bit-rotted.

>


Yes, exactly that was the case initially.
This code evolved from read_line which is in still in gcov.c,
one day I will fix that one too, because it has a memory leak.

> I believe that are callers of the input.h interface are set up

> throughout to not expect nul-terminators on lines, so I think that

> change is OK.

>

>> @@ -735,7 +701,7 @@ read_line_num (fcache *c, size_t line_num,

>>

>>     /* The line we want is the next one.  Let's read and copy it back

> to

>>        the caller.  */

>> -  return read_next_line (c, line, line_len);

>> +  return get_next_line (c, line, line_len);

>>   }

>

> If I understand things, rather than returning a copy of the line data,

> it will now return a ptr into c->data i.e. fcache's copy of the prefix

> of the file that it's buffered so far, right?

>


Yes, the pointer is a const char* so will not modify the cache.
All the copying should be unnecessary, and the line data
is only valid until the next line is accessed.

>

>>   /* Return the physical source line that corresponds to

> FILE_PATH/LINE in a

>> @@ -748,8 +714,8 @@ const char *

>>   location_get_source_line (const char *file_path, int line,

>>   			  int *line_len)

>>   {

>> -  static char *buffer;

>> -  static ssize_t len;

>> +  char *buffer;

>> +  ssize_t len;

>>

>>     if (line == 0)

>>       return NULL;

>

> If I understand things, the old behavior was to have a single buffered

> line copy, managed by the "static" vars in location_get_source_line, so

> that the buffer is only valid until the next call to

> location_get_source_line.

>


Yes.  Same as read_line in gcov.c, initially it was a copy of that one.

> If I understand the proposed change, the new behavior will be that the

> result will be a ptr into the inside of fcache->data, and thus will

> also be only guaranteed valid the next call to location_get_source_line

> - since fcache->data could grow, and that growth could lead to a

> reallocation.

>


Yes.  Also the buffer can be reused, when another file is requested.

> I like the patch: I believe it makes the implementation simpler, and

> slightly more efficient.  Before I'm comfortable approving this, please

> could you update the patch to also update some of the comments in

> input.c, to better reflect the lifetimes of the various entities.

>

> For example get_next_line's comment currently talks about "Space for

> that line has been allocated in the cache thus *LINE has the same life

> time as C.", which I think isn't currently true, and which the patch

> may make less true :)

>


Yes.  I will look over the comments.

> I think we could also use some more selftests for

> location_get_source_line... there's test_reading_source_line, but this

> doesn't yet exercise the

>      if (line_num <= c->line_num)

> case in read_line_num, when lines are accessed in reverse order.  So

> please also update that selftest to exercise that cache access pattern:

> read the 3rd line, then the 2nd line.

>


Yes.  I will add that test, maybe also a last line with missing EOL.



Thanks
Bernd.
diff mbox

Patch

2016-09-26  Bernd Edlinger  <bernd.edlinger@hotmail.de>

	PR preprocessor/77699
	* input.c (maybe_grow): Don't allocate one byte extra headroom.
	(get_next_line): Return false on error.
	(read_next_line): Removed, use get_next_line instead.
	(read_line_num): Don't copy the line.
	(location_get_source_line): Don't use static data.


Index: gcc/input.c
===================================================================
--- gcc/input.c	(revision 240471)
+++ gcc/input.c	(working copy)
@@ -432,7 +432,7 @@  maybe_grow (fcache *c)
     return;
 
   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
-  c->data = XRESIZEVEC (char, c->data, size + 1);
+  c->data = XRESIZEVEC (char, c->data, size);
   c->size = size;
 }
 
@@ -534,7 +534,7 @@  get_next_line (fcache *c, char **line, ssize_t *li
     }
 
   if (ferror (c->fp))
-    return -1;
+    return false;
 
   /* At this point, we've found the end of the of line.  It either
      points to the '\n' or to one byte after the last byte of the
@@ -597,36 +597,6 @@  get_next_line (fcache *c, char **line, ssize_t *li
   return true;
 }
 
-/* Reads the next line from FILE into *LINE.  If *LINE is too small
-   (or NULL) it is allocated (or extended) to have enough space to
-   containe the line.  *LINE_LENGTH must contain the size of the
-   initial*LINE buffer.  It's then updated by this function to the
-   actual length of the returned line.  Note that the returned line
-   can contain several zero bytes.  Also note that the returned string
-   is allocated in static storage that is going to be re-used by
-   subsequent invocations of read_line.  */
-
-static bool
-read_next_line (fcache *cache, char ** line, ssize_t *line_len)
-{
-  char *l = NULL;
-  ssize_t len = 0;
-
-  if (!get_next_line (cache, &l, &len))
-    return false;
-
-  if (*line == NULL)
-    *line = XNEWVEC (char, len);
-  else
-    if (*line_len < len)
-	*line = XRESIZEVEC (char, *line, len);
-
-  memcpy (*line, l, len);
-  *line_len = len;
-
-  return true;
-}
-
 /* Consume the next bytes coming from the cache (or from its
    underlying file if there are remaining unread bytes in the file)
    until we reach the next end-of-line (or end-of-file).  There is no
@@ -651,7 +621,7 @@  goto_next_line (fcache *cache)
 
 static bool
 read_line_num (fcache *c, size_t line_num,
-	       char ** line, ssize_t *line_len)
+	       char **line, ssize_t *line_len)
 {
   gcc_assert (line_num > 0);
 
@@ -705,12 +675,8 @@  read_line_num (fcache *c, size_t line_num,
 	    {
 	      /* We have the start/end of the line.  Let's just copy
 		 it again and we are done.  */
-	      ssize_t len = i->end_pos - i->start_pos + 1;
-	      if (*line_len < len)
-		*line = XRESIZEVEC (char, *line, len);
-	      memmove (*line, c->data + i->start_pos, len);
-	      (*line)[len - 1] = '\0';
-	      *line_len = --len;
+	      *line = c->data + i->start_pos;
+	      *line_len = i->end_pos - i->start_pos;
 	      return true;
 	    }
 
@@ -735,7 +701,7 @@  read_line_num (fcache *c, size_t line_num,
 
   /* The line we want is the next one.  Let's read and copy it back to
      the caller.  */
-  return read_next_line (c, line, line_len);
+  return get_next_line (c, line, line_len);
 }
 
 /* Return the physical source line that corresponds to FILE_PATH/LINE in a
@@ -748,8 +714,8 @@  const char *
 location_get_source_line (const char *file_path, int line,
 			  int *line_len)
 {
-  static char *buffer;
-  static ssize_t len;
+  char *buffer;
+  ssize_t len;
 
   if (line == 0)
     return NULL;