Patchwork do not require lookahead in json-lexer.c if not necessary

login
register
mail settings
Submitter Paolo Bonzini
Date May 21, 2010, 9:08 a.m.
Message ID <1274432933-15542-1-git-send-email-pbonzini@redhat.com>
Download mbox | patch
Permalink /patch/53116/
State New
Headers show

Comments

Paolo Bonzini - May 21, 2010, 9:08 a.m.
> Having look ahead operate differently for different states
> really complicates the lexer.  I don't see this as a big
> problem in practice. 

I also don't see this as a big problem in practice, except maybe
for a quit command.  However, WRT the complication of the lexer,
it is really not so bad.  The trick is to split

   lexer->state = json_lexer[IN_START][(uint8_t)ch];

in two phases, namely transitioning to IN_START and (only if
lookahead was used) doing another iteration of the loop.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
        The patch is on top of Luiz's changes.

 json-lexer.c |   85 +++++++++++++++++++++++++++++----------------------------
 1 files changed, 43 insertions(+), 42 deletions(-)

Patch

diff --git a/json-lexer.c b/json-lexer.c
index d1d8033..b9250c1 100644
--- a/json-lexer.c
+++ b/json-lexer.c
@@ -29,7 +29,6 @@ 
 
 enum json_lexer_state {
     ERROR = 0,
-    IN_DONE_STRING,
     IN_DQ_UCODE3,
     IN_DQ_UCODE2,
     IN_DQ_UCODE1,
@@ -59,17 +58,18 @@  enum json_lexer_state {
     IN_ESCAPE_I64,
     IN_ESCAPE_DONE,
     IN_WHITESPACE,
-    IN_OPERATOR_DONE,
     IN_START,
 };
 
 #define TERMINAL(state) [0 ... 0x7F] = (state)
 
-static const uint8_t json_lexer[][256] =  {
-    [IN_DONE_STRING] = {
-        TERMINAL(JSON_STRING),
-    },
+/* Return whether TERMINAL is a terminal state and the transition to it
+   from OLD_STATE required lookahead.  This can happens whenever the table
+   below uses the TERMINAL macro.  */
+#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
+            (json_lexer[(old_state)][0] == (terminal))
 
+static const uint8_t json_lexer[][256] =  {
     /* double quote string */
     [IN_DQ_UCODE3] = {
         ['0' ... '9'] = IN_DQ_STRING,
@@ -104,7 +104,7 @@  static const uint8_t json_lexer[][256] =  {
     [IN_DQ_STRING] = {
         [1 ... 0xFF] = IN_DQ_STRING,
         ['\\'] = IN_DQ_STRING_ESCAPE,
-        ['"'] = IN_DONE_STRING,
+        ['"'] = JSON_STRING,
     },
 
     /* single quote string */
@@ -141,7 +141,7 @@  static const uint8_t json_lexer[][256] =  {
     [IN_SQ_STRING] = {
         [1 ... 0xFF] = IN_SQ_STRING,
         ['\\'] = IN_SQ_STRING_ESCAPE,
-        ['\''] = IN_DONE_STRING,
+        ['\''] = JSON_STRING,
     },
 
     /* Zero */
@@ -207,11 +207,6 @@  static const uint8_t json_lexer[][256] =  {
         ['\n'] = IN_WHITESPACE,
     },        
 
-    /* operator */
-    [IN_OPERATOR_DONE] = {
-        TERMINAL(JSON_OPERATOR),
-    },
-
     /* escape */
     [IN_ESCAPE_DONE] = {
         TERMINAL(JSON_ESCAPE),
@@ -255,12 +250,12 @@  static const uint8_t json_lexer[][256] =  {
         ['0'] = IN_ZERO,
         ['1' ... '9'] = IN_NONZERO_NUMBER,
         ['-'] = IN_NEG_NONZERO_NUMBER,
-        ['{'] = IN_OPERATOR_DONE,
-        ['}'] = IN_OPERATOR_DONE,
-        ['['] = IN_OPERATOR_DONE,
-        [']'] = IN_OPERATOR_DONE,
-        [','] = IN_OPERATOR_DONE,
-        [':'] = IN_OPERATOR_DONE,
+        ['{'] = JSON_OPERATOR,
+        ['}'] = JSON_OPERATOR,
+        ['['] = JSON_OPERATOR,
+        [']'] = JSON_OPERATOR,
+        [','] = JSON_OPERATOR,
+        [':'] = JSON_OPERATOR,
         ['a' ... 'z'] = IN_KEYWORD,
         ['%'] = IN_ESCAPE,
         [' '] = IN_WHITESPACE,
@@ -279,35 +274,41 @@  void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
 
 static int json_lexer_feed_char(JSONLexer *lexer, char ch)
 {
+    int char_consumed, new_state;
+
     lexer->x++;
     if (ch == '\n') {
         lexer->x = 0;
         lexer->y++;
     }
 
-    lexer->state = json_lexer[lexer->state][(uint8_t)ch];
-
-    switch (lexer->state) {
-    case JSON_OPERATOR:
-    case JSON_ESCAPE:
-    case JSON_INTEGER:
-    case JSON_FLOAT:
-    case JSON_KEYWORD:
-    case JSON_STRING:
-        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
-    case JSON_SKIP:
-        lexer->state = json_lexer[IN_START][(uint8_t)ch];
-        QDECREF(lexer->token);
-        lexer->token = qstring_new();
-        break;
-    case ERROR:
-        return -EINVAL;
-    default:
-        break;
-    }
-
-    qstring_append_chr(lexer->token, ch);
+    do {
+        new_state = json_lexer[lexer->state][(uint8_t)ch];
+        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
+        if (char_consumed) {
+            qstring_append_chr(lexer->token, ch);
+        }
 
+        switch (new_state) {
+        case JSON_OPERATOR:
+        case JSON_ESCAPE:
+        case JSON_INTEGER:
+        case JSON_FLOAT:
+        case JSON_KEYWORD:
+        case JSON_STRING:
+            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
+        case JSON_SKIP:
+            QDECREF(lexer->token);
+            lexer->token = qstring_new();
+            new_state = IN_START;
+            break;
+        case ERROR:
+            return -EINVAL;
+        default:
+            break;
+        }
+        lexer->state = new_state;
+    } while (!char_consumed);
     return 0;
 }
 
@@ -329,7 +330,7 @@  int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
 
 int json_lexer_flush(JSONLexer *lexer)
 {
-    return json_lexer_feed_char(lexer, 0);
+    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0);
 }
 
 void json_lexer_destroy(JSONLexer *lexer)