Patchwork [1/2] gcc symbol database

login
register
mail settings
Submitter Yunfeng ZHANG
Date May 28, 2012, 8:40 a.m.
Message ID <CA+dUcj2us9Uw=UNiJpY14W-e4XCQWOnjpADR9mRojTNcGn=38Q@mail.gmail.com>
Download mbox | patch
Permalink /patch/161595/
State New
Headers show

Comments

Yunfeng ZHANG - May 28, 2012, 8:40 a.m.
Description (gccsymdb https://gccsymdb.googlecode.com/svn/trunk):
The patch submitted here is based on one idea: collecting gcc internal
data (definitions, file dependencies, etc.) and writing it into a database for
later use, much like cscope. I published the idea in an earlier mail thread
and have signed a legal contract with gcc; the last mail is
http://gcc.gnu.org/ml/gcc-patches/2010-06/msg00844.html. Now I want to
announce my new progress here: with the patches below, my plugin can collect
extern definitions and function call relationships into an sqlite database. The
attachment is for 4.7.0, my site is for 4.6.2-4.6.3; see attachment/doc.txt
to get started.  Here are some statistics: the cscope database on 4.6.3 is
about 130M, my database is 23M, and my database is more accurate, without any
macro/ifdef interference. I also hope gcc can accept my plugin as a standard
gcc plugin.
The following steps show how to try my plugin on gcc-4.7.0.
1) ./configure --prefix=/home/zyf/root/ --with-mpc=/home/zyf/root/
--with-gmp=/home/zyf/root/ --with-mpfr=/home/zyf/root/
2) cp -a /home/zyf/src/symdb.gcc/gcc.patches/ patches
3) quilt push -a
4) make all-stage1
5) cd /home/zyf/src/symdb.gcc/ && make
6) cp gs init.sql helper.vim /home/zyf/src/gcc-4.7.0
7) cd /home/zyf/src/gcc-4.7.0
8) ./gs initdb ./
9) # My plugin only works on C, not C++, but gcc uses stage1/g++ to
   # compile the whole of stage2, so open Makefile, search for
   # `STAGE2_CXXFLAGS = $(STAGE2_CFLAGS)', and append `-xc' to the end of
   # that line.
   make STAGE2_CFLAGS="-fplugin=/home/zyf/src/symdb.gcc/symdb.so
   -fplugin-arg-symdb-dbfile=/home/zyf/src/gcc-4.7.0/gccsym.db" all-stage2
   # Ignore the error when linking cc1.
10) ./gs vacuumdb ./
11) vim
12) `:source helper.vim'
Use `CTRL-]' to search for a definition.
Use `CTRL-[' to search for which functions call the function.
Use `CTRL-T' to jump back.
Use `Gs def yoursymbol' to search for a definition.
Use `Gs callee yourfunction' to search for function call relationships.

Testcases:
My plugin has a suite of testcases. Use bash+sqlite and test/run.sh to run
them.

The ChangeLog mainly covers three directories: libcpp/, gcc/ and
gcc/c-family/. The patch is sent in two parts, one for libcpp/ and another
for gcc/.
ChangeLog
-------------------------
libcpp/
2012-05-24  Yunfeng ZHANG <zyf.zeroos@gmail.com>

      * include/cpplib.h (struct cpp_callbacks): Add new callbacks
      macro_start_expand, macro_end_arg, macro_end_expand, start_directive,
      end_directive, directive_token.
      (cpp_token): Add file_offset field for every token.
      * internal.h (struct _cpp_line_note): Add adjust_offset field to adjust
      new cpp_token::file_offset.
      * directives.c (_cpp_handle_directive): Implement end_directive.
      * macro.c (enter_macro_context): Implement macro_end_arg.
      (cpp_get_token_1): Implement macro_start_expand and macro_end_expand.
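      * lex.c (add_line_note): Add offset argument for file offset adjustment.
      (_cpp_clean_line): Use new add_line_note declaration.
      (_cpp_lex_token): Invoke start_directive.
      (_cpp_lex_direct): Set cpp_token::file_offset; invoke directive_token.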

gcc/
2012-05-24  Yunfeng ZHANG <zyf.zeroos@gmail.com>

     * plugin.def: New plugin events: one broadcasts that a new token has
     arrived, another notifies of a new definition.
     * c-parser.c: Move c_id_kind/c_token definitions to c-family/c-common.h.
     (c_lex_one_token): Invoke PLUGIN_C_TOKEN callback.
     (c_parser_external_declaration): Invoke PLUGIN_EXTERN_DECL callback.
     (c_parser_declaration_or_fndef): Invoke PLUGIN_EXTERN_DECLSPECS,
     PLUGIN_EXTERN_VAR, PLUGIN_EXTERN_FUNC_OLD_PARAM, PLUGIN_EXTERN_FUNC
     callbacks.
     (c_parser_enum_specifier): Invoke PLUGIN_ENUM_SPECIFIER callback.
     (c_parser_postfix_expression_after_primary): Invoke PLUGIN_CALL_FUNCTION
     callback.
     * plugin.c (register_callback): Handle new plugin events.
     (invoke_plugin_callbacks_full): Likewise.
     * doc/plugins.texi: Document new plugin events.

gcc/c-family/
2012-05-24  Yunfeng ZHANG <zyf.zeroos@gmail.com>
     * c-lex.c: Include plugin.h.
     (c_lex_with_flags): Invoke PLUGIN_CPP_TOKEN callback.
     * c-common.h: Include c-pragma.h. Move c_id_kind/c_token definitions from
     c-parser.c.

Patch 1 of 2 on libcpp/
--------------------------------
diff -upr .pc/symdb_enhance_libcpp/libcpp/directives.c libcpp/directives.c
--- .pc/symdb_enhance_libcpp/libcpp/directives.c    2011-10-17
17:59:12.000000000 +0800
+++ libcpp/directives.c    2012-05-25 14:56:56.751134781 +0800
@@ -492,6 +492,8 @@ _cpp_handle_directive (cpp_reader *pfile
   else if (skip == 0)
     _cpp_backup_tokens (pfile, 1);

+  if (pfile->cb.end_directive)
+    pfile->cb.end_directive (pfile);
   end_directive (pfile, skip);
   if (was_parsing_args && !pfile->state.in_deferred_pragma)
     {
diff -upr .pc/symdb_enhance_libcpp/libcpp/include/cpplib.h
libcpp/include/cpplib.h
--- .pc/symdb_enhance_libcpp/libcpp/include/cpplib.h    2011-12-21
04:44:13.000000000 +0800
+++ libcpp/include/cpplib.h    2012-05-25 14:56:56.745507332 +0800
@@ -218,10 +218,10 @@ struct GTY(()) cpp_identifier {
        node;
 };

-/* A preprocessing token.  This has been carefully packed and should
-   occupy 16 bytes on 32-bit hosts and 24 bytes on 64-bit hosts.  */
+/* A preprocessing token. */
 struct GTY(()) cpp_token {
   source_location src_loc;    /* Location of first char of token.  */
+  int file_offset;
   ENUM_BITFIELD(cpp_ttype) type : CHAR_BIT;  /* token type */
   unsigned short flags;        /* flags - see above */

@@ -522,6 +522,24 @@ struct cpp_callbacks
      be expanded.  */
   cpp_hashnode * (*macro_to_expand) (cpp_reader *, const cpp_token *);

+  /* macro_{start/end}_expand are called when gcc starts to expand macro, note
+   * if A macro includes B macro, the pair is called multiple times. */
+  void (*macro_start_expand) (cpp_reader *, const cpp_token *,
+          const cpp_hashnode *);
+  void (*macro_end_expand) (cpp_reader *);
+  /* Called when a function-like macro stops collecting macro parameters,
+   * cancel = true, macro expansion is canceled. */
+  void (*macro_end_arg) (cpp_reader *, bool cancel);
+  /* The pair is called when cpp directive (starting from `#', such as
+   * `#define', `#endif' etc) is encountered and reaches end. */
+  void (*start_directive) (cpp_reader *, const cpp_token*);
+  void (*end_directive) (cpp_reader *);
+  /* The more powerful function getting token than cpp_get_token. Here, name
+   * directive_token maybe makes you confused, it's named from
+   * libcpp/lex.c:_cpp_lex_direct, there isn't relationship between
+   * directive_token and {start, end}_directive above. */
+  void (*directive_token) (cpp_reader *, const cpp_token*);
+
   /* Called to emit a diagnostic.  This callback receives the
      translated message.  */
   bool (*error) (cpp_reader *, int, int, source_location, unsigned int,
diff -upr .pc/symdb_enhance_libcpp/libcpp/internal.h libcpp/internal.h
--- .pc/symdb_enhance_libcpp/libcpp/internal.h    2012-01-09
16:48:43.000000000 +0800
+++ libcpp/internal.h    2012-05-25 14:56:56.752132995 +0800
@@ -291,6 +291,11 @@ struct _cpp_line_note
      intervening space, 0 represents a note that has already been handled,
      and anything else is invalid.  */
   unsigned int type;
+
+  /* file offset adjustment is recorded by add_line_note to adjust
+   * cpp_token::file_offset. The case is when some spaces are left after an
+   * escaped newline `\', cpp_token::file_offset becomes inexact. */
+  const unsigned char *adjust_offset;
 };

 /* Represents the contents of a file cpplib has read in.  */
diff -upr .pc/symdb_enhance_libcpp/libcpp/macro.c libcpp/macro.c
--- .pc/symdb_enhance_libcpp/libcpp/macro.c    2012-01-09
22:15:25.000000000 +0800
+++ libcpp/macro.c    2012-05-25 14:56:56.749508416 +0800
@@ -1029,9 +1029,13 @@ enter_macro_context (cpp_reader *pfile,
           if (pragma_buff)
         _cpp_release_buff (pfile, pragma_buff);

+          if (pfile->cb.macro_end_arg)
+        pfile->cb.macro_end_arg (pfile, true);
           return 0;
         }

+      if (pfile->cb.macro_end_arg)
+        pfile->cb.macro_end_arg (pfile, false);
       if (macro->paramc > 0)
         replace_args (pfile, node, macro,
               (macro_arg *) buff->base,
@@ -2263,6 +2267,8 @@ cpp_get_token_1 (cpp_reader *pfile, sour
       if (pfile->context->c.macro)
         ++num_expanded_macros_counter;
       _cpp_pop_context (pfile);
+      if (pfile->cb.macro_end_expand)
+        pfile->cb.macro_end_expand (pfile);
       if (pfile->state.in_directive)
         continue;
       result = &pfile->avoid_paste;
@@ -2321,8 +2327,14 @@ cpp_get_token_1 (cpp_reader *pfile, sour
         }
         }
       else
-        ret = enter_macro_context (pfile, node, result,
-                       virt_loc);
+          {
+           if (pfile->cb.macro_start_expand)
+         pfile->cb.macro_start_expand (pfile, result, node);
+       ret = enter_macro_context (pfile, node, result, virt_loc);
+     if (ret == 0 && pfile->cb.macro_end_expand)
+       /* macro expansion is canceled. */
+       pfile->cb.macro_end_expand (pfile);
+         }
       if (ret)
          {
           if (pfile->state.in_directive || ret == 2)

Patch

diff -upr .pc/symdb_enhance_libcpp/libcpp/lex.c libcpp/lex.c
--- .pc/symdb_enhance_libcpp/libcpp/lex.c    2011-12-08 06:05:59.000000000 +0800
+++ libcpp/lex.c    2012-05-25 14:56:56.747508973 +0800
@@ -51,7 +51,8 @@  static const struct token_spelling token
 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)

-static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
+static void add_line_note (cpp_buffer *, const uchar *, unsigned int,
+               const uchar *);
 static int skip_line_comment (cpp_reader *);
 static void skip_whitespace (cpp_reader *, cppchar_t);
 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
@@ -82,7 +83,8 @@  cpp_ideq (const cpp_token *token, const
 /* Record a note TYPE at byte POS into the current cleaned logical
    line.  */
 static void
-add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
+add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type,
+           const uchar * offset)
 {
   if (buffer->notes_used == buffer->notes_cap)
     {
@@ -93,6 +95,7 @@  add_line_note (cpp_buffer *buffer, const

   buffer->notes[buffer->notes_used].pos = pos;
   buffer->notes[buffer->notes_used].type = type;
+  buffer->notes[buffer->notes_used].adjust_offset = offset;
   buffer->notes_used++;
 }

@@ -689,7 +692,7 @@  _cpp_clean_line (cpp_reader *pfile)
         {
           /* Have a trigraph.  We may or may not have to convert
              it.  Add a line note regardless, for -Wtrigraphs.  */
-          add_line_note (buffer, s, s[2]);
+          add_line_note (buffer, s, s[2], 0);
           if (CPP_OPTION (pfile, trigraphs))
             {
               /* We do, and that means we have to switch to the
@@ -734,7 +737,7 @@  _cpp_clean_line (cpp_reader *pfile)

       /* Have an escaped newline; process it and proceed to
      the slow path.  */
-      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
+      add_line_note (buffer, p - 1, p != d ? ' ' : '\\', s + 1);
       d = p - 2;
       buffer->next_line = p - 1;

@@ -759,14 +762,14 @@  _cpp_clean_line (cpp_reader *pfile)
           if (p == buffer->next_line || p[-1] != '\\')
         break;

-          add_line_note (buffer, p - 1, p != d ? ' ': '\\');
+          add_line_note (buffer, p - 1, p != d ? ' ': '\\', s + 1);
           d = p - 2;
           buffer->next_line = p - 1;
         }
       else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
         {
           /* Add a note regardless, for the benefit of -Wtrigraphs.  */
-          add_line_note (buffer, d, s[2]);
+          add_line_note (buffer, d, s[2], 0);
           if (CPP_OPTION (pfile, trigraphs))
         {
           *d = _cpp_trigraph_map[s[2]];
@@ -789,7 +792,7 @@  _cpp_clean_line (cpp_reader *pfile)
  done:
   *d = '\n';
   /* A sentinel note that should never be processed.  */
-  add_line_note (buffer, d + 1, '\n');
+  add_line_note (buffer, d + 1, '\n', s + 1);
   buffer->next_line = s + 1;
 }

@@ -1886,6 +1889,8 @@  _cpp_lex_token (cpp_reader *pfile)
          handles the directive as normal.  */
           && pfile->state.parsing_args != 1)
         {
+          if (pfile->cb.start_directive)
+        pfile->cb.start_directive (pfile, result);
           if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
         {
           if (pfile->directive_result.type == CPP_PADDING)
@@ -2032,6 +2037,17 @@  _cpp_lex_direct (cpp_reader *pfile)
       _cpp_process_line_notes (pfile, false);
       result->src_loc = pfile->line_table->highest_line;
     }
+  if (buffer->cur_note != 0)
+    {
+      int index = buffer->cur_note - 1;
+      result->file_offset = buffer->cur - buffer->buf;
+      result->file_offset +=
+    buffer->notes[index].adjust_offset - buffer->notes[index].pos;
+    }
+  else
+    {
+      result->file_offset = buffer->cur - buffer->buf;
+    }
   c = *buffer->cur++;

   if (pfile->forced_token_location_p)
@@ -2346,6 +2362,8 @@  _cpp_lex_direct (cpp_reader *pfile)
       break;
     }

+  if (pfile->cb.directive_token)
+    pfile->cb.directive_token (pfile, result);
   return result;
 }