diff mbox

[07/22] Implement token range tracking within libcpp and C/C++ FEs

Message ID 1441916913-11547-8-git-send-email-dmalcolm@redhat.com
State New
Headers show

Commit Message

David Malcolm Sept. 10, 2015, 8:28 p.m. UTC
This patch adds source *range* information to libcpp's cpp_token, and to
c_token and cp_token in the C and C++ frontends respectively.

To minimize churn, I kept the existing location_t fields, though in
theory these are always just equal to the start of the source range.

cpplib.h's struct cpp_token had this comment:

  /* A preprocessing token.  This has been carefully packed and should
     occupy 16 bytes on 32-bit hosts and 24 bytes on 64-bit hosts.  */

Does anyone know why this was "carefully packed" and to what extent
this matters?  I'm adding an extra 8 bytes to it (or 4 if we eliminate
the existing location_t).  As far as I can see, these are
short-lived, and there are only relative few alive at any time.
Or is it about making them fast to copy?

gcc/c-family/ChangeLog:
	* c-lex.c (c_lex_with_flags): Add "range" param, and write back
	to *range with the range of the libcpp token.
	* c-pragma.h (c_lex_with_flags): Add "range" param.

gcc/c/ChangeLog:
	* c-parser.c (struct c_token): Add "range" field.
	(c_lex_one_token): Write back to token->range in call to
	c_lex_with_flags.

gcc/cp/ChangeLog:
	* parser.c (eof_token): Add "range" field to initializer.
	(cp_lexer_get_preprocessor_token): Write back to token->range in
	call to c_lex_with_flags.
	* parser.h (struct cp_token): Add "range" field.

libcpp/ChangeLog:
	* include/cpplib.h (struct cpp_token): Add src_range field.
	* lex.c (_cpp_lex_direct): Set up the src_range on the token.
---
 gcc/c-family/c-lex.c    | 7 +++++--
 gcc/c-family/c-pragma.h | 4 ++--
 gcc/c/c-parser.c        | 6 +++++-
 gcc/cp/parser.c         | 5 +++--
 gcc/cp/parser.h         | 2 ++
 libcpp/include/cpplib.h | 4 +++-
 libcpp/lex.c            | 8 ++++++++
 7 files changed, 28 insertions(+), 8 deletions(-)

Comments

Michael Matz Sept. 11, 2015, 1:55 p.m. UTC | #1
Hi,

On Thu, 10 Sep 2015, David Malcolm wrote:

> Does anyone know why this was "carefully packed" and to what extent
> this matters?  I'm adding an extra 8 bytes to it (or 4 if we eliminate
> the existing location_t).  As far as I can see, these are
> short-lived, and there are only relative few alive at any time.

The c++ frontend stores _all_ tokens before starting to parse, so the size 
of cp_token is not totally irrelevant.  It still might not matter much, 
though.


Ciao,
Michael.
Jeff Law Sept. 14, 2015, 7:37 p.m. UTC | #2
On 09/11/2015 07:55 AM, Michael Matz wrote:
> Hi,
>
> On Thu, 10 Sep 2015, David Malcolm wrote:
>
>> Does anyone know why this was "carefully packed" and to what extent
>> this matters?  I'm adding an extra 8 bytes to it (or 4 if we eliminate
>> the existing location_t).  As far as I can see, these are
>> short-lived, and there are only relative few alive at any time.
>
> The c++ frontend stores _all_ tokens before starting to parse, so the size
> of cp_token is not totally irrelevant.  It still might not matter much,
> though.
FWIW, Zack hasn't gone away totally (he was just posting about the 
explicit_bzero stuff) -- it couldn't hurt to ping him on the 
implications of changing the size of that structure.

jeff
Richard Biener Sept. 15, 2015, 10:14 a.m. UTC | #3
On Thu, Sep 10, 2015 at 10:28 PM, David Malcolm <dmalcolm@redhat.com> wrote:
> This patch adds source *range* information to libcpp's cpp_token, and to
> c_token and cp_token in the C and C++ frontends respectively.
>
> To minimize churn, I kept the existing location_t fields, though in
> theory these are always just equal to the start of the source range.
>
> cpplib.h's struct cpp_token had this comment:
>
>   /* A preprocessing token.  This has been carefully packed and should
>      occupy 16 bytes on 32-bit hosts and 24 bytes on 64-bit hosts.  */
>
> Does anyone know why this was "carefully packed" and to what extent
> this matters?  I'm adding an extra 8 bytes to it (or 4 if we eliminate
> the existing location_t).  As far as I can see, these are
> short-lived, and there are only relative few alive at any time.
> Or is it about making them fast to copy?
>
> gcc/c-family/ChangeLog:
>         * c-lex.c (c_lex_with_flags): Add "range" param, and write back
>         to *range with the range of the libcpp token.
>         * c-pragma.h (c_lex_with_flags): Add "range" param.
>
> gcc/c/ChangeLog:
>         * c-parser.c (struct c_token): Add "range" field.
>         (c_lex_one_token): Write back to token->range in call to
>         c_lex_with_flags.
>
> gcc/cp/ChangeLog:
>         * parser.c (eof_token): Add "range" field to initializer.
>         (cp_lexer_get_preprocessor_token): Write back to token->range in
>         call to c_lex_with_flags.
>         * parser.h (struct cp_token): Add "range" field.
>
> libcpp/ChangeLog:
>         * include/cpplib.h (struct cpp_token): Add src_range field.
>         * lex.c (_cpp_lex_direct): Set up the src_range on the token.
> ---
>  gcc/c-family/c-lex.c    | 7 +++++--
>  gcc/c-family/c-pragma.h | 4 ++--
>  gcc/c/c-parser.c        | 6 +++++-
>  gcc/cp/parser.c         | 5 +++--
>  gcc/cp/parser.h         | 2 ++
>  libcpp/include/cpplib.h | 4 +++-
>  libcpp/lex.c            | 8 ++++++++
>  7 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c
> index 55ceb20..1334994 100644
> --- a/gcc/c-family/c-lex.c
> +++ b/gcc/c-family/c-lex.c
> @@ -380,11 +380,13 @@ c_common_has_attribute (cpp_reader *pfile)
>  }
>
>  /* Read a token and return its type.  Fill *VALUE with its value, if
> -   applicable.  Fill *CPP_FLAGS with the token's flags, if it is
> +   applicable.  Fill *LOC and *RANGE with the source location and range
> +   of the token.  Fill *CPP_FLAGS with the token's flags, if it is
>     non-NULL.  */
>
>  enum cpp_ttype
> -c_lex_with_flags (tree *value, location_t *loc, unsigned char *cpp_flags,
> +c_lex_with_flags (tree *value, location_t *loc, source_range *range,
> +                 unsigned char *cpp_flags,
>                   int lex_flags)
>  {
>    static bool no_more_pch;
> @@ -397,6 +399,7 @@ c_lex_with_flags (tree *value, location_t *loc, unsigned char *cpp_flags,
>   retry:
>    tok = cpp_get_token_with_location (parse_in, loc);
>    type = tok->type;
> +  *range = tok->src_range;
>
>   retry_after_at:
>    switch (type)
> diff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h
> index aa2b471..05df543 100644
> --- a/gcc/c-family/c-pragma.h
> +++ b/gcc/c-family/c-pragma.h
> @@ -225,8 +225,8 @@ extern enum cpp_ttype pragma_lex (tree *);
>  /* This is not actually available to pragma parsers.  It's merely a
>     convenient location to declare this function for c-lex, after
>     having enum cpp_ttype declared.  */
> -extern enum cpp_ttype c_lex_with_flags (tree *, location_t *, unsigned char *,
> -                                       int);
> +extern enum cpp_ttype c_lex_with_flags (tree *, location_t *, source_range *,
> +                                       unsigned char *, int);
>
>  extern void c_pp_lookup_pragma (unsigned int, const char **, const char **);
>
> diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
> index 11a2b0f..5d822ee 100644
> --- a/gcc/c/c-parser.c
> +++ b/gcc/c/c-parser.c
> @@ -170,6 +170,8 @@ struct GTY (()) c_token {
>    ENUM_BITFIELD (pragma_kind) pragma_kind : 8;
>    /* The location at which this token was found.  */
>    location_t location;
> +  /* The source range at which this token was found.  */
> +  source_range range;
>    /* The value associated with this token, if any.  */
>    tree value;
>  };
> @@ -239,7 +241,9 @@ c_lex_one_token (c_parser *parser, c_token *token)
>  {
>    timevar_push (TV_LEX);
>
> -  token->type = c_lex_with_flags (&token->value, &token->location, NULL,
> +  token->type = c_lex_with_flags (&token->value, &token->location,
> +                                 &token->range,
> +                                 NULL,
>                                   (parser->lex_untranslated_string
>                                    ? C_LEX_STRING_NO_TRANSLATE : 0));
>    token->id_kind = C_ID_NONE;
> diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
> index 67fbcda..7c59c58 100644
> --- a/gcc/cp/parser.c
> +++ b/gcc/cp/parser.c
> @@ -58,7 +58,7 @@ along with GCC; see the file COPYING3.  If not see
>
>  static cp_token eof_token =
>  {
> -  CPP_EOF, RID_MAX, 0, PRAGMA_NONE, false, false, false, 0, { NULL }
> +  CPP_EOF, RID_MAX, 0, PRAGMA_NONE, false, false, false, 0, {0, 0}, { NULL }
>  };
>
>  /* The various kinds of non integral constant we encounter. */
> @@ -764,7 +764,8 @@ cp_lexer_get_preprocessor_token (cp_lexer *lexer, cp_token *token)
>
>     /* Get a new token from the preprocessor.  */
>    token->type
> -    = c_lex_with_flags (&token->u.value, &token->location, &token->flags,
> +    = c_lex_with_flags (&token->u.value, &token->location,
> +                        &token->range, &token->flags,
>                         lexer == NULL ? 0 : C_LEX_STRING_NO_JOIN);
>    token->keyword = RID_MAX;
>    token->pragma_kind = PRAGMA_NONE;
> diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
> index 760467c..c7558a0 100644
> --- a/gcc/cp/parser.h
> +++ b/gcc/cp/parser.h
> @@ -61,6 +61,8 @@ struct GTY (()) cp_token {
>    BOOL_BITFIELD purged_p : 1;
>    /* The location at which this token was found.  */
>    location_t location;
> +  /* The source range at which this token was found.  */
> +  source_range range;

Is it just me or does location now feel somewhat redundant with range?  Can't we
compress that somehow?

>    /* The value associated with this token, if any.  */
>    union cp_token_value {
>      /* Used for CPP_NESTED_NAME_SPECIFIER and CPP_TEMPLATE_ID.  */
> diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
> index a2bdfa0..0b1a403 100644
> --- a/libcpp/include/cpplib.h
> +++ b/libcpp/include/cpplib.h
> @@ -235,9 +235,11 @@ struct GTY(()) cpp_identifier {
>  };
>
>  /* A preprocessing token.  This has been carefully packed and should
> -   occupy 16 bytes on 32-bit hosts and 24 bytes on 64-bit hosts.  */
> +   occupy 16 bytes on 32-bit hosts and 24 bytes on 64-bit hosts.
> +   FIXME: the above comment is no longer true with this patch.  */
>  struct GTY(()) cpp_token {
>    source_location src_loc;     /* Location of first char of token.  */
> +  source_range src_range;      /* Source range covered by the token.  */
>    ENUM_BITFIELD(cpp_ttype) type : CHAR_BIT;  /* token type */
>    unsigned short flags;                /* flags - see above */
>
> diff --git a/libcpp/lex.c b/libcpp/lex.c
> index 0aa1090..a84a8c0 100644
> --- a/libcpp/lex.c
> +++ b/libcpp/lex.c
> @@ -2365,6 +2365,9 @@ _cpp_lex_direct (cpp_reader *pfile)
>      result->src_loc = linemap_position_for_column (pfile->line_table,
>                                           CPP_BUF_COLUMN (buffer, buffer->cur));
>
> +  /* The token's src_range begins here.  */
> +  result->src_range.m_start = result->src_loc;
> +
>    switch (c)
>      {
>      case ' ': case '\t': case '\f': case '\v': case '\0':
> @@ -2723,6 +2726,11 @@ _cpp_lex_direct (cpp_reader *pfile)
>        break;
>      }
>
> +  /* The token's src_range ends here.  */
> +  result->src_range.m_finish =
> +    linemap_position_for_column (pfile->line_table,
> +                                CPP_BUF_COLUMN (buffer, buffer->cur));
> +
>    return result;
>  }
>
> --
> 1.8.5.3
>
Jakub Jelinek Sept. 15, 2015, 10:20 a.m. UTC | #4
On Tue, Sep 15, 2015 at 12:14:22PM +0200, Richard Biener wrote:
> > diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
> > index 760467c..c7558a0 100644
> > --- a/gcc/cp/parser.h
> > +++ b/gcc/cp/parser.h
> > @@ -61,6 +61,8 @@ struct GTY (()) cp_token {
> >    BOOL_BITFIELD purged_p : 1;
> >    /* The location at which this token was found.  */
> >    location_t location;
> > +  /* The source range at which this token was found.  */
> > +  source_range range;
> 
> Is it just me or does location now feel somewhat redundant with range?  Can't we
> compress that somehow?

For a token I'd expect it is redundant, I don't see how it would be useful
for a single preprocessing token to have more than start and end locations.
But generally, for expressions, 3 locations make sense.
If you have
abc + def
~~~~^~~~~
then having a range is useful.  In any case, I'm surprised that the ranges aren't encoded in
location_t (the data structures behind it, where we already stick also
BLOCK pointer).

	Jakub
Richard Biener Sept. 15, 2015, 10:33 a.m. UTC | #5
On Tue, Sep 15, 2015 at 12:20 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Tue, Sep 15, 2015 at 12:14:22PM +0200, Richard Biener wrote:
>> > diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
>> > index 760467c..c7558a0 100644
>> > --- a/gcc/cp/parser.h
>> > +++ b/gcc/cp/parser.h
>> > @@ -61,6 +61,8 @@ struct GTY (()) cp_token {
>> >    BOOL_BITFIELD purged_p : 1;
>> >    /* The location at which this token was found.  */
>> >    location_t location;
>> > +  /* The source range at which this token was found.  */
>> > +  source_range range;
>>
>> Is it just me or does location now feel somewhat redundant with range?  Can't we
>> compress that somehow?
>
> For a token I'd expect it is redundant, I don't see how it would be useful
> for a single preprocessing token to have more than start and end locations.
> But generally, for expressions, 3 locations make sense.
> If you have
> abc + def
> ~~~~^~~~~
> then having a range is useful.  In any case, I'm surprised that the ranges aren't encoded in
> location_t (the data structures behind it, where we already stick also
> BLOCK pointer).

Probably lack of encoding space ... I suppose upping location_t to
64bits coud solve
some of that (with its own drawback on increasing size of core structures).

Richard.

>         Jakub
Jakub Jelinek Sept. 15, 2015, 10:48 a.m. UTC | #6
On Tue, Sep 15, 2015 at 12:33:58PM +0200, Richard Biener wrote:
> On Tue, Sep 15, 2015 at 12:20 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> > On Tue, Sep 15, 2015 at 12:14:22PM +0200, Richard Biener wrote:
> >> > diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
> >> > index 760467c..c7558a0 100644
> >> > --- a/gcc/cp/parser.h
> >> > +++ b/gcc/cp/parser.h
> >> > @@ -61,6 +61,8 @@ struct GTY (()) cp_token {
> >> >    BOOL_BITFIELD purged_p : 1;
> >> >    /* The location at which this token was found.  */
> >> >    location_t location;
> >> > +  /* The source range at which this token was found.  */
> >> > +  source_range range;
> >>
> >> Is it just me or does location now feel somewhat redundant with range?  Can't we
> >> compress that somehow?
> >
> > For a token I'd expect it is redundant, I don't see how it would be useful
> > for a single preprocessing token to have more than start and end locations.
> > But generally, for expressions, 3 locations make sense.
> > If you have
> > abc + def
> > ~~~~^~~~~
> > then having a range is useful.  In any case, I'm surprised that the ranges aren't encoded in
> > location_t (the data structures behind it, where we already stick also
> > BLOCK pointer).
> 
> Probably lack of encoding space ... I suppose upping location_t to
> 64bits coud solve
> some of that (with its own drawback on increasing size of core structures).

What I had in mind was just add
  source_location start, end;
to location_adhoc_data struct and use !IS_ADHOC_LOC locations to represent
just plain locations without block and without range (including the cases
where the range has both start and end equal to the locus) and IS_ADHOC_LOC
locations for the cases where either we have non-NULL block, or we have
some other range, or both.  But I haven't spent any time on that, so just
wondering if such an encoding has been considered.

	Jakub
Manuel López-Ibáñez Sept. 15, 2015, 12:08 p.m. UTC | #7
On 15/09/15 12:20, Jakub Jelinek wrote:
> On Tue, Sep 15, 2015 at 12:14:22PM +0200, Richard Biener wrote:
>>> diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
>>> index 760467c..c7558a0 100644
>>> --- a/gcc/cp/parser.h
>>> +++ b/gcc/cp/parser.h
>>> @@ -61,6 +61,8 @@ struct GTY (()) cp_token {
>>>     BOOL_BITFIELD purged_p : 1;
>>>     /* The location at which this token was found.  */
>>>     location_t location;
>>> +  /* The source range at which this token was found.  */
>>> +  source_range range;
>>
>> Is it just me or does location now feel somewhat redundant with range?  Can't we
>> compress that somehow?
>
> For a token I'd expect it is redundant, I don't see how it would be useful
> for a single preprocessing token to have more than start and end locations.

If memory usage is a concern, can't we easily find out the end location of a 
token just by simply re-lexing it from the start location? Many tokens are a 
single character.

> But generally, for expressions, 3 locations make sense.
> If you have
> abc + def
> ~~~~^~~~~
> then having a range is useful.

It seems you want to have a location for '+' plus left-most and right-most 
locations. However, we will need the location of 'a' and the location of 'd', 
not only the location of 'f'. Thus, we probably want to have (or build) a range 
for each operand, to be able to handle something like:

(a + b) + (c + d)
~~~~~~~ ^ ~~~~~~~

This does not require to track the ranges of every token, but it requires to 
track ranges of expressions when building them. Moreover, we want to store 
these ranges/locations in the expression node, since many operands (VAR_DECL, 
constants, etc) do not have a location. (In my humble opinion, this a more 
serious defect of GCC than not tracking a range for tokens 
https://gcc.gnu.org/bugzilla/PR43486)

Note also that we do not necessarily need to track ranges in libcpp to print 
ranges in diagnostics. The latter can be implemented and useful before the 
former. The example above:

void foo(void)
{
   float c,d;
   int * a,b;
   (a + b) + (c + d); //error: invalid operands to binary + (have ‘int *’ and 
‘float’)
}

could be implemented simply by building the ranges while parsing (as I did in 
https://gcc.gnu.org/ml/gcc-patches/2009-08/msg00174.html), no need to store 
them explicitly. My intuition is that many of the ranges needed by diagnostics 
could be dynamically generated from two locations and passed to the point where 
it is used (like we do with location_t). We could store them, but we do not 
need to. Some examples:

     int y = *SomeA.X;
             ^~~~~~~~
     myvec[1]/P;
     ~~~~~~~~^~
   struct point origin = { x: 0.0, y: 0.0 };
                           ~~ ^
                           .x =

Do we have a place to store the range for "myvec[1]" or for "x:" ? (honest 
question).

Cheers,

Manuel.
Richard Biener Sept. 15, 2015, 12:18 p.m. UTC | #8
On Tue, Sep 15, 2015 at 2:08 PM, Manuel López-Ibáñez
<lopezibanez@gmail.com> wrote:
> On 15/09/15 12:20, Jakub Jelinek wrote:
>>
>> On Tue, Sep 15, 2015 at 12:14:22PM +0200, Richard Biener wrote:
>>>>
>>>> diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
>>>> index 760467c..c7558a0 100644
>>>> --- a/gcc/cp/parser.h
>>>> +++ b/gcc/cp/parser.h
>>>> @@ -61,6 +61,8 @@ struct GTY (()) cp_token {
>>>>     BOOL_BITFIELD purged_p : 1;
>>>>     /* The location at which this token was found.  */
>>>>     location_t location;
>>>> +  /* The source range at which this token was found.  */
>>>> +  source_range range;
>>>
>>>
>>> Is it just me or does location now feel somewhat redundant with range?
>>> Can't we
>>> compress that somehow?
>>
>>
>> For a token I'd expect it is redundant, I don't see how it would be useful
>> for a single preprocessing token to have more than start and end
>> locations.
>
>
> If memory usage is a concern, can't we easily find out the end location of a
> token just by simply re-lexing it from the start location? Many tokens are a
> single character.
>
>> But generally, for expressions, 3 locations make sense.
>> If you have
>> abc + def
>> ~~~~^~~~~
>> then having a range is useful.
>
>
> It seems you want to have a location for '+' plus left-most and right-most
> locations. However, we will need the location of 'a' and the location of
> 'd', not only the location of 'f'. Thus, we probably want to have (or build)
> a range for each operand, to be able to handle something like:
>
> (a + b) + (c + d)
> ~~~~~~~ ^ ~~~~~~~
>
> This does not require to track the ranges of every token, but it requires to
> track ranges of expressions when building them. Moreover, we want to store
> these ranges/locations in the expression node, since many operands
> (VAR_DECL, constants, etc) do not have a location. (In my humble opinion,
> this a more serious defect of GCC than not tracking a range for tokens
> https://gcc.gnu.org/bugzilla/PR43486)

Of course this boils down to "uses" of a VAR_DECL using the shared tree
node.  On GIMPLE some stmt kinds have separate locations for each operand
(PHI nodes), on GENERIC we'd have to invent a no-op expr tree code to
wrap such uses to be able to give them distinct locations (can't use sth
existing as frontends would need to ignore them in a different way than say
NOP_EXPRs or NON_LVALUE_EXPRs).

> Note also that we do not necessarily need to track ranges in libcpp to print
> ranges in diagnostics. The latter can be implemented and useful before the
> former. The example above:
>
> void foo(void)
> {
>   float c,d;
>   int * a,b;
>   (a + b) + (c + d); //error: invalid operands to binary + (have ‘int *’ and
> ‘float’)
> }
>
> could be implemented simply by building the ranges while parsing (as I did
> in https://gcc.gnu.org/ml/gcc-patches/2009-08/msg00174.html), no need to
> store them explicitly. My intuition is that many of the ranges needed by
> diagnostics could be dynamically generated from two locations and passed to
> the point where it is used (like we do with location_t). We could store
> them, but we do not need to. Some examples:
>
>     int y = *SomeA.X;
>             ^~~~~~~~
>     myvec[1]/P;
>     ~~~~~~~~^~
>   struct point origin = { x: 0.0, y: 0.0 };
>                           ~~ ^
>                           .x =
>
> Do we have a place to store the range for "myvec[1]" or for "x:" ? (honest
> question).
>
> Cheers,
>
> Manuel.
Manuel López-Ibáñez Sept. 15, 2015, 12:54 p.m. UTC | #9
On 15 September 2015 at 14:18, Richard Biener
<richard.guenther@gmail.com> wrote:
> Of course this boils down to "uses" of a VAR_DECL using the shared tree
> node.  On GIMPLE some stmt kinds have separate locations for each operand
> (PHI nodes), on GENERIC we'd have to invent a no-op expr tree code to
> wrap such uses to be able to give them distinct locations (can't use sth
> existing as frontends would need to ignore them in a different way than say
> NOP_EXPRs or NON_LVALUE_EXPRs).
>

The problem with that approach (besides the waste of memory implied by
a whole tree node just to store one location_t) is keeping those
wrappers in place while making them transparent for most of the
compiler. According to Arnaud, folding made this approach infeasible:
https://gcc.gnu.org/ml/gcc-patches/2012-09/msg01222.html

The other two alternatives are to store the location of the operands
on the expressions themselves or to store them as on-the-side
data-structure, but they come with their own drawbacks. I was
initially more in favour of the wrapper solution, but after dealing
with NOP_EXPRs, having to deal also with LOC_EXPR would be a nightmare
(as you say, they will have to be ignored in a different way). The
other alternatives seem less invasive and the problems mentioned here
https://gcc.gnu.org/ml/gcc-patches/2012-11/msg00164.html do not seem
as serious as I thought (passing down the location of the operand is
becoming  the norm anyway).

Cheers,

Manuel.
David Malcolm Sept. 15, 2015, 1:41 p.m. UTC | #10
On Tue, 2015-09-15 at 12:20 +0200, Jakub Jelinek wrote:
> On Tue, Sep 15, 2015 at 12:14:22PM +0200, Richard Biener wrote:
> > > diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
> > > index 760467c..c7558a0 100644
> > > --- a/gcc/cp/parser.h
> > > +++ b/gcc/cp/parser.h
> > > @@ -61,6 +61,8 @@ struct GTY (()) cp_token {
> > >    BOOL_BITFIELD purged_p : 1;
> > >    /* The location at which this token was found.  */
> > >    location_t location;
> > > +  /* The source range at which this token was found.  */
> > > +  source_range range;
> > 
> > Is it just me or does location now feel somewhat redundant with range?  Can't we
> > compress that somehow?
> 
> For a token I'd expect it is redundant, I don't see how it would be useful
> for a single preprocessing token to have more than start and end locations.

Indeed.   Patch 7 of the patch kit is merely considering tokens (in each
of libcpp and the C and C++ FEs), and in each of these three cases, once
we have a range, the "location" is just the start of the range.   I kept
the location to keep the patch smaller.   I can look into eliminating
the location field from the 3 relevant structures.

(I considered that we could have just the location as the start, and
calculate the end from the length of the token, but AFAIK this can't
cover token concatenation by the preprocessor, and if we're going to
have ranges later on, it's simpler to use the same representation).

> But generally, for expressions, 3 locations make sense.
> If you have
> abc + def
> ~~~~^~~~~
> then having a range is useful.  In any case, I'm surprised that the ranges aren't encoded in
> location_t (the data structures behind it, where we already stick also
> BLOCK pointer).

I posted a few ideas I had for implementing ranges in:
"[PATCH 12/22] Add source-ranges for trees"
https://gcc.gnu.org/ml/gcc-patches/2015-09/msg00739.html

but putting them into the ad-hoc location data is one that hadn't
occurred to me.  Thanks; I'll look into it.

Dave
David Malcolm Sept. 16, 2015, 8:21 p.m. UTC | #11
On Tue, 2015-09-15 at 12:48 +0200, Jakub Jelinek wrote:
> On Tue, Sep 15, 2015 at 12:33:58PM +0200, Richard Biener wrote:
> > On Tue, Sep 15, 2015 at 12:20 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> > > On Tue, Sep 15, 2015 at 12:14:22PM +0200, Richard Biener wrote:
> > >> > diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
> > >> > index 760467c..c7558a0 100644
> > >> > --- a/gcc/cp/parser.h
> > >> > +++ b/gcc/cp/parser.h
> > >> > @@ -61,6 +61,8 @@ struct GTY (()) cp_token {
> > >> >    BOOL_BITFIELD purged_p : 1;
> > >> >    /* The location at which this token was found.  */
> > >> >    location_t location;
> > >> > +  /* The source range at which this token was found.  */
> > >> > +  source_range range;
> > >>
> > >> Is it just me or does location now feel somewhat redundant with range?  Can't we
> > >> compress that somehow?
> > >
> > > For a token I'd expect it is redundant, I don't see how it would be useful
> > > for a single preprocessing token to have more than start and end locations.
> > > But generally, for expressions, 3 locations make sense.
> > > If you have
> > > abc + def
> > > ~~~~^~~~~
> > > then having a range is useful.  In any case, I'm surprised that the ranges aren't encoded in
> > > location_t (the data structures behind it, where we already stick also
> > > BLOCK pointer).
> > 
> > Probably lack of encoding space ... I suppose upping location_t to
> > 64bits coud solve
> > some of that (with its own drawback on increasing size of core structures).
> 
> What I had in mind was just add
>   source_location start, end;
> to location_adhoc_data struct and use !IS_ADHOC_LOC locations to represent
> just plain locations without block and without range (including the cases
> where the range has both start and end equal to the locus) and IS_ADHOC_LOC
> locations for the cases where either we have non-NULL block, or we have
> some other range, or both.  But I haven't spent any time on that, so just
> wondering if such an encoding has been considered.

I've been attempting to implement that.

Am I right in thinking that the ad-hoc locations never get purged? i.e.
that once we've registered an ad-hoc location, then is that slot within
location_adhoc_data_map is forever associated with that (locus, block)
pair?  [or in the proposed model, the (locus, src_range, block)
triple?].

If so, it may make more sense to put the ranges into ad-hoc locations,
but only *after tokenization*: in this approach, the src_range would be
a field within the tokens (like in patch 07/22), in the hope that the
tokens are short-lived  (which AIUI is the case for libcpp and C, though
not for C++), presumably also killing the "location" field within
tokens.  We then stuff the range into the location_t when building trees
(maybe putting a src_range into c_expr to further delay populating
location_adhoc_data_map).

That way we avoid bloating the location_adhoc_data_map during lexing,
whilst preserving the range information, and we can stuff the ranges
into the 32-bit location_t within tree/gimple etc (albeit paying a cost
within the location_adhoc_data_map).

Thoughts?  Hope this sounds sane.
Dave
David Malcolm Sept. 17, 2015, 4:49 p.m. UTC | #12
On Wed, 2015-09-16 at 16:21 -0400, David Malcolm wrote:
> On Tue, 2015-09-15 at 12:48 +0200, Jakub Jelinek wrote:
> > On Tue, Sep 15, 2015 at 12:33:58PM +0200, Richard Biener wrote:
> > > On Tue, Sep 15, 2015 at 12:20 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> > > > On Tue, Sep 15, 2015 at 12:14:22PM +0200, Richard Biener wrote:
> > > >> > diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
> > > >> > index 760467c..c7558a0 100644
> > > >> > --- a/gcc/cp/parser.h
> > > >> > +++ b/gcc/cp/parser.h
> > > >> > @@ -61,6 +61,8 @@ struct GTY (()) cp_token {
> > > >> >    BOOL_BITFIELD purged_p : 1;
> > > >> >    /* The location at which this token was found.  */
> > > >> >    location_t location;
> > > >> > +  /* The source range at which this token was found.  */
> > > >> > +  source_range range;
> > > >>
> > > >> Is it just me or does location now feel somewhat redundant with range?  Can't we
> > > >> compress that somehow?
> > > >
> > > > For a token I'd expect it is redundant, I don't see how it would be useful
> > > > for a single preprocessing token to have more than start and end locations.
> > > > But generally, for expressions, 3 locations make sense.
> > > > If you have
> > > > abc + def
> > > > ~~~~^~~~~
> > > > then having a range is useful.  In any case, I'm surprised that the ranges aren't encoded in
> > > > location_t (the data structures behind it, where we already stick also
> > > > BLOCK pointer).
> > > 
> > > Probably lack of encoding space ... I suppose upping location_t to
> > > 64bits coud solve
> > > some of that (with its own drawback on increasing size of core structures).
> > 
> > What I had in mind was just add
> >   source_location start, end;
> > to location_adhoc_data struct and use !IS_ADHOC_LOC locations to represent
> > just plain locations without block and without range (including the cases
> > where the range has both start and end equal to the locus) and IS_ADHOC_LOC
> > locations for the cases where either we have non-NULL block, or we have
> > some other range, or both.  But I haven't spent any time on that, so just
> > wondering if such an encoding has been considered.
> 
> I've been attempting to implement that.
> 
> Am I right in thinking that the ad-hoc locations never get purged? i.e.
> that once we've registered an ad-hoc location, then is that slot within
> location_adhoc_data_map is forever associated with that (locus, block)
> pair?  [or in the proposed model, the (locus, src_range, block)
> triple?].
> 
> If so, it may make more sense to put the ranges into ad-hoc locations,
> but only *after tokenization*: in this approach, the src_range would be
> a field within the tokens (like in patch 07/22), in the hope that the
> tokens are short-lived  (which AIUI is the case for libcpp and C, though
> not for C++), presumably also killing the "location" field within
> tokens.  We then stuff the range into the location_t when building trees
> (maybe putting a src_range into c_expr to further delay populating
> location_adhoc_data_map).
> 
> That way we avoid bloating the location_adhoc_data_map during lexing,
> whilst preserving the range information, and we can stuff the ranges
> into the 32-bit location_t within tree/gimple etc (albeit paying a cost
> within the location_adhoc_data_map).
> 
> Thoughts?  Hope this sounds sane.
> Dave

FWIW, I have a (very messy) implementation of this working for the C
frontend, which gives us source ranges on expressions without needing to
add any new tree nodes, or add any fields to existing tree structs.

The approach I'm using:

* ranges are stored directly as fields within cpp_token and c_token
(maybe we can ignore cp_token for now)

* ranges are stashed in the C FE, both (a) within the "struct c_expr"
and (b) within the location_t of each tree expression node as a new
field in the adhoc map.

Doing it twice may seem slightly redundant, but I think both are needed:
  (a) doing it within c_expr allows us to support ranges for constants
and VAR_DECL etc during parsing, without needing any kind of new tree
wrapper node
  (b) doing it in the ad-hoc map allows the ranges for expressions to
survive the parse and be usable in diagnostics later.

So this gives us (in the C FE): ranges for everything during parsing,
and ranges for expressions afterwards, with no new tree nodes or new
fields within tree nodes.

I'm working on cleaning it up into a much more minimal set of patches
that I hope are reviewable.

Hopefully this sounds like a good approach

I've also been looking at ways to mitigate bloat of the ad-hoc map, by
using some extra bits of location_t for representing short ranges
directly.

Dave
Jeff Law Sept. 17, 2015, 7:10 p.m. UTC | #13
On 09/15/2015 06:54 AM, Manuel López-Ibáñez wrote:
> On 15 September 2015 at 14:18, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> Of course this boils down to "uses" of a VAR_DECL using the shared tree
>> node.  On GIMPLE some stmt kinds have separate locations for each operand
>> (PHI nodes), on GENERIC we'd have to invent a no-op expr tree code to
>> wrap such uses to be able to give them distinct locations (can't use sth
>> existing as frontends would need to ignore them in a different way than say
>> NOP_EXPRs or NON_LVALUE_EXPRs).
>>
>
> The problem with that approach (besides the waste of memory implied by
> a whole tree node just to store one location_t) is keeping those
> wrappers in place while making them transparent for most of the
> compiler. According to Arnaud, folding made this approach infeasible:
> https://gcc.gnu.org/ml/gcc-patches/2012-09/msg01222.html
>
> The other two alternatives are to store the location of the operands
> on the expressions themselves or to store them as on-the-side
> data-structure, but they come with their own drawbacks. I was
> initially more in favour of the wrapper solution, but after dealing
> with NOP_EXPRs, having to deal also with LOC_EXPR would be a nightmare
> (as you say, they will have to be ignored in a different way). The
> other alternatives seem less invasive and the problems mentioned here
> https://gcc.gnu.org/ml/gcc-patches/2012-11/msg00164.html do not seem
> as serious as I thought (passing down the location of the operand is
> becoming  the norm anyway).
I suspect any on-the-side data structure to handle this is ultimately 
doomed to failure.  Storing the location info for the operands in the 
expression means that anything that modifies an operand would have to 
have access to the expression so that location information could be 
updated.  Ugh.

As painful as it will be, the right way is to stop using DECL nodes like 
this and instead be using another node that isn't shared.  That allows 
atttaching location information.  David and I kicked this around before 
he posted his patch and didn't come up with anything better IIRC.

These wrapper nodes are definitely going to get in the way of folders 
and all kinds of things.  So it's not something that's going to be easy 
to add without digging into and modifying a lot of code.

I've always considered this a wart, but fixing that wart hasn't seemed 
worth the effort until recently.

Jeff
Jeff Law Sept. 17, 2015, 7:11 p.m. UTC | #14
On 09/15/2015 06:18 AM, Richard Biener wrote:

>
> Of course this boils down to "uses" of a VAR_DECL using the shared tree
> node.  On GIMPLE some stmt kinds have separate locations for each operand
> (PHI nodes), on GENERIC we'd have to invent a no-op expr tree code to
> wrap such uses to be able to give them distinct locations (can't use sth
> existing as frontends would need to ignore them in a different way than say
> NOP_EXPRs or NON_LVALUE_EXPRs).
Exactly.

Jeff
Jeff Law Sept. 17, 2015, 7:14 p.m. UTC | #15
On 09/17/2015 10:49 AM, David Malcolm wrote:

> FWIW, I have a (very messy) implementation of this working for the C
> frontend, which gives us source ranges on expressions without needing to
> add any new tree nodes, or add any fields to existing tree structs.
>
> The approach I'm using:
>
> * ranges are stored directly as fields within cpp_token and c_token
> (maybe we can ignore cp_token for now)
>
> * ranges are stashed in the C FE, both (a) within the "struct c_expr"
> and (b) within the location_t of each tree expression node as a new
> field in the adhoc map.
>
> Doing it twice may seem slightly redundant, but I think both are needed:
>    (a) doing it within c_expr allows us to support ranges for constants
> and VAR_DECL etc during parsing, without needing any kind of new tree
> wrapper node
>    (b) doing it in the ad-hoc map allows the ranges for expressions to
> survive the parse and be usable in diagnostics later.
>
> So this gives us (in the C FE): ranges for everything during parsing,
> and ranges for expressions afterwards, with no new tree nodes or new
> fields within tree nodes.
>
> I'm working on cleaning it up into a much more minimal set of patches
> that I hope are reviewable.
>
> Hopefully this sounds like a good approach
So is the assumption here that the location information is or is not 
supposed to survive through the gimple optimizers?   If I understand 
what you're doing correctly, I think the location information gets 
invalidated by const/copy propagations.

Though perhaps that's not a major problem because we're typically 
propagating an SSA_NAME, not a _DECL node.  Hmm.

jeff
Jeff Law Sept. 17, 2015, 7:19 p.m. UTC | #16
On 09/15/2015 04:33 AM, Richard Biener wrote:
> On Tue, Sep 15, 2015 at 12:20 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>> On Tue, Sep 15, 2015 at 12:14:22PM +0200, Richard Biener wrote:
>>>> diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
>>>> index 760467c..c7558a0 100644
>>>> --- a/gcc/cp/parser.h
>>>> +++ b/gcc/cp/parser.h
>>>> @@ -61,6 +61,8 @@ struct GTY (()) cp_token {
>>>>     BOOL_BITFIELD purged_p : 1;
>>>>     /* The location at which this token was found.  */
>>>>     location_t location;
>>>> +  /* The source range at which this token was found.  */
>>>> +  source_range range;
>>>
>>> Is it just me or does location now feel somewhat redundant with range?  Can't we
>>> compress that somehow?
>>
>> For a token I'd expect it is redundant, I don't see how it would be useful
>> for a single preprocessing token to have more than start and end locations.
>> But generally, for expressions, 3 locations make sense.
>> If you have
>> abc + def
>> ~~~~^~~~~
>> then having a range is useful.  In any case, I'm surprised that the ranges aren't encoded in
>> location_t (the data structures behind it, where we already stick also
>> BLOCK pointer).
>
> Probably lack of encoding space ... I suppose upping location_t to
> 64bits coud solve
> some of that (with its own drawback on increasing size of core structures).
If we're going to 64 bits, then we might consider making it a pointer so 
that we don't have to spend so much time building up an encoding scheme.

Jeff
David Malcolm Sept. 17, 2015, 8:05 p.m. UTC | #17
On Thu, 2015-09-17 at 13:14 -0600, Jeff Law wrote:
> On 09/17/2015 10:49 AM, David Malcolm wrote:
> 
> > FWIW, I have a (very messy) implementation of this working for the C
> > frontend, which gives us source ranges on expressions without needing to
> > add any new tree nodes, or add any fields to existing tree structs.
> >
> > The approach I'm using:
> >
> > * ranges are stored directly as fields within cpp_token and c_token
> > (maybe we can ignore cp_token for now)
> >
> > * ranges are stashed in the C FE, both (a) within the "struct c_expr"
> > and (b) within the location_t of each tree expression node as a new
> > field in the adhoc map.
> >
> > Doing it twice may seem slightly redundant, but I think both are needed:
> >    (a) doing it within c_expr allows us to support ranges for constants
> > and VAR_DECL etc during parsing, without needing any kind of new tree
> > wrapper node
> >    (b) doing it in the ad-hoc map allows the ranges for expressions to
> > survive the parse and be usable in diagnostics later.
> >
> > So this gives us (in the C FE): ranges for everything during parsing,
> > and ranges for expressions afterwards, with no new tree nodes or new
> > fields within tree nodes.
> >
> > I'm working on cleaning it up into a much more minimal set of patches
> > that I hope are reviewable.
> >
> > Hopefully this sounds like a good approach
> So is the assumption here that the location information is or is not 
> supposed to survive through the gimple optimizers?

To be honest I hadn't given much thought to that stage; my hope is that
most of the diagnostics have been issued by the time we're optimizing.

In the proposed implementation the range information is "baked in" to
the location_t (via the ad-hoc lookaside), so it's carried along
wherever the location_t goes, and ought to have the same chances of
survival within the gimple optimizers as the existing location
information does.

>    If I understand 
> what you're doing correctly, I think the location information gets 
> invalidated by const/copy propagations.
> 
> Though perhaps that's not a major problem because we're typically 
> propagating an SSA_NAME, not a _DECL node.  Hmm.

Well, if the location_t is being invalidated by an optimization, we're
already losing source *point* information: file/line/column.  Given
that, losing range information as well seems like no great loss.

Or am I missing something?

(I am attempting to preserve the source_range data when block ptrs are
baked in to the ad-hoc locations, if that's what you're referring to)

Dave
diff mbox

Patch

diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c
index 55ceb20..1334994 100644
--- a/gcc/c-family/c-lex.c
+++ b/gcc/c-family/c-lex.c
@@ -380,11 +380,13 @@  c_common_has_attribute (cpp_reader *pfile)
 }
 
 /* Read a token and return its type.  Fill *VALUE with its value, if
-   applicable.  Fill *CPP_FLAGS with the token's flags, if it is
+   applicable.  Fill *LOC and *RANGE with the source location and range
+   of the token.  Fill *CPP_FLAGS with the token's flags, if it is
    non-NULL.  */
 
 enum cpp_ttype
-c_lex_with_flags (tree *value, location_t *loc, unsigned char *cpp_flags,
+c_lex_with_flags (tree *value, location_t *loc, source_range *range,
+		  unsigned char *cpp_flags,
 		  int lex_flags)
 {
   static bool no_more_pch;
@@ -397,6 +399,7 @@  c_lex_with_flags (tree *value, location_t *loc, unsigned char *cpp_flags,
  retry:
   tok = cpp_get_token_with_location (parse_in, loc);
   type = tok->type;
+  *range = tok->src_range;
 
  retry_after_at:
   switch (type)
diff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h
index aa2b471..05df543 100644
--- a/gcc/c-family/c-pragma.h
+++ b/gcc/c-family/c-pragma.h
@@ -225,8 +225,8 @@  extern enum cpp_ttype pragma_lex (tree *);
 /* This is not actually available to pragma parsers.  It's merely a
    convenient location to declare this function for c-lex, after
    having enum cpp_ttype declared.  */
-extern enum cpp_ttype c_lex_with_flags (tree *, location_t *, unsigned char *,
-					int);
+extern enum cpp_ttype c_lex_with_flags (tree *, location_t *, source_range *,
+					unsigned char *, int);
 
 extern void c_pp_lookup_pragma (unsigned int, const char **, const char **);
 
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 11a2b0f..5d822ee 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -170,6 +170,8 @@  struct GTY (()) c_token {
   ENUM_BITFIELD (pragma_kind) pragma_kind : 8;
   /* The location at which this token was found.  */
   location_t location;
+  /* The source range at which this token was found.  */
+  source_range range;
   /* The value associated with this token, if any.  */
   tree value;
 };
@@ -239,7 +241,9 @@  c_lex_one_token (c_parser *parser, c_token *token)
 {
   timevar_push (TV_LEX);
 
-  token->type = c_lex_with_flags (&token->value, &token->location, NULL,
+  token->type = c_lex_with_flags (&token->value, &token->location,
+				  &token->range,
+				  NULL,
 				  (parser->lex_untranslated_string
 				   ? C_LEX_STRING_NO_TRANSLATE : 0));
   token->id_kind = C_ID_NONE;
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index 67fbcda..7c59c58 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -58,7 +58,7 @@  along with GCC; see the file COPYING3.  If not see
 
 static cp_token eof_token =
 {
-  CPP_EOF, RID_MAX, 0, PRAGMA_NONE, false, false, false, 0, { NULL }
+  CPP_EOF, RID_MAX, 0, PRAGMA_NONE, false, false, false, 0, {0, 0}, { NULL }
 };
 
 /* The various kinds of non integral constant we encounter. */
@@ -764,7 +764,8 @@  cp_lexer_get_preprocessor_token (cp_lexer *lexer, cp_token *token)
 
    /* Get a new token from the preprocessor.  */
   token->type
-    = c_lex_with_flags (&token->u.value, &token->location, &token->flags,
+    = c_lex_with_flags (&token->u.value, &token->location,
+                        &token->range, &token->flags,
 			lexer == NULL ? 0 : C_LEX_STRING_NO_JOIN);
   token->keyword = RID_MAX;
   token->pragma_kind = PRAGMA_NONE;
diff --git a/gcc/cp/parser.h b/gcc/cp/parser.h
index 760467c..c7558a0 100644
--- a/gcc/cp/parser.h
+++ b/gcc/cp/parser.h
@@ -61,6 +61,8 @@  struct GTY (()) cp_token {
   BOOL_BITFIELD purged_p : 1;
   /* The location at which this token was found.  */
   location_t location;
+  /* The source range at which this token was found.  */
+  source_range range;
   /* The value associated with this token, if any.  */
   union cp_token_value {
     /* Used for CPP_NESTED_NAME_SPECIFIER and CPP_TEMPLATE_ID.  */
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index a2bdfa0..0b1a403 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -235,9 +235,11 @@  struct GTY(()) cpp_identifier {
 };
 
 /* A preprocessing token.  This has been carefully packed and should
-   occupy 16 bytes on 32-bit hosts and 24 bytes on 64-bit hosts.  */
+   occupy 16 bytes on 32-bit hosts and 24 bytes on 64-bit hosts.
+   FIXME: the above comment is no longer true with this patch.  */
 struct GTY(()) cpp_token {
   source_location src_loc;	/* Location of first char of token.  */
+  source_range src_range;	/* Source range covered by the token.  */
   ENUM_BITFIELD(cpp_ttype) type : CHAR_BIT;  /* token type */
   unsigned short flags;		/* flags - see above */
 
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 0aa1090..a84a8c0 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -2365,6 +2365,9 @@  _cpp_lex_direct (cpp_reader *pfile)
     result->src_loc = linemap_position_for_column (pfile->line_table,
 					  CPP_BUF_COLUMN (buffer, buffer->cur));
 
+  /* The token's src_range begins here.  */
+  result->src_range.m_start = result->src_loc;
+
   switch (c)
     {
     case ' ': case '\t': case '\f': case '\v': case '\0':
@@ -2723,6 +2726,11 @@  _cpp_lex_direct (cpp_reader *pfile)
       break;
     }
 
+  /* The token's src_range ends here.  */
+  result->src_range.m_finish =
+    linemap_position_for_column (pfile->line_table,
+				 CPP_BUF_COLUMN (buffer, buffer->cur));
+
   return result;
 }