From patchwork Thu Oct 4 17:33:37 2012
X-Patchwork-Submitter: Michael Roth <mdroth@linux.vnet.ibm.com>
X-Patchwork-Id: 189249
From: Michael Roth <mdroth@linux.vnet.ibm.com>
To: qemu-devel@nongnu.org
Date: Thu, 4 Oct 2012 12:33:37 -0500
Message-Id: <1349372021-31212-19-git-send-email-mdroth@linux.vnet.ibm.com>
X-Mailer: git-send-email 1.7.9.5
In-Reply-To: <1349372021-31212-1-git-send-email-mdroth@linux.vnet.ibm.com>
References: <1349372021-31212-1-git-send-email-mdroth@linux.vnet.ibm.com>
Cc: kwolf@redhat.com, peter.maydell@linaro.org, aliguori@us.ibm.com, blauwirbel@gmail.com, pbonzini@redhat.com, eblake@redhat.com
Subject: [Qemu-devel] [PATCH v3 18/22] qidl: add lexer library (based on QC parser)

Adds an abstract Lexer class to handle tokenization via a
peek/pop/peek_line/pop_line interface, along with an implementation for C
based on the lexer from qc.git.

Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
---
 scripts/lexer.py | 306 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 306 insertions(+)
 create mode 100644 scripts/lexer.py

diff --git a/scripts/lexer.py b/scripts/lexer.py
new file mode 100644
index 0000000..96c6c1a
--- /dev/null
+++ b/scripts/lexer.py
@@ -0,0 +1,306 @@
+#
+# QEMU Lexer Library
+#
+# Copyright IBM, Corp. 2012
+#
+# Authors:
+#  Anthony Liguori <aliguori@us.ibm.com>
+#  Michael Roth    <mdroth@linux.vnet.ibm.com>
+#
+# This work is licensed under the terms of the GNU GPLv2 or later.
+# See the COPYING file in the top-level directory.
+#
+# The lexer code is based off of:
+#   http://www.lysator.liu.se/c/ANSI-C-grammar-l.html
+
+class Input(object):
+    def __init__(self, fp):
+        self.fp = fp
+        self.line = None
+        self.offset = 0
+        self.is_eof = False
+        self.__fill_buf()
+
+    def __fill_buf(self):
+        if not self.line and not self.is_eof:
+            self.line = self.fp.readline()
+            if not self.line:
+                self.is_eof = True
+
+    def peek(self):
+        if self.is_eof:
+            return ""
+        return self.line[self.offset]
+
+    def pop(self):
+        if self.is_eof:
+            return ""
+        ch = self.line[self.offset]
+        self.offset += 1
+        if self.offset == len(self.line):
+            self.offset = 0
+            self.line = None
+            self.__fill_buf()
+        return ch
+
+    def peek_line(self):
+        return self.line
+
+    def pop_line(self):
+        line = self.line
+        self.line = None
+        self.offset = 0
+        self.__fill_buf()
+        return line
+
+    def eof(self):
+        return self.is_eof
+
+class Lexer(object):
+    def __init__(self, input, ignored_types=[]):
+        self.input = input
+        self.ignored_types = ignored_types
+        self.current_type = None
+        self.current_value = None
+
+    def get_token(self):
+        raise NotImplementedError("derived classes must implement this method")
+
+    def __ensure_token(self):
+        while self.current_type == None and not self.input.eof():
+            t, v = self.get_token()
+            if t not in self.ignored_types:
+                self.current_type = t
+                self.current_value = v
+
+    def peek(self):
+        self.__ensure_token()
+        return self.current_value
+
+    def peek_line(self):
+        self.__ensure_token()
+        return self.input.peek_line()
+
+    def peek_type(self):
+        self.__ensure_token()
+        return self.current_type
+
+    def pop(self):
+        self.__ensure_token()
+        v = self.current_value
+        self.current_type = None
+        self.current_value = None
+        return v
+
+    def pop_line(self):
+        self.__ensure_token()
+        self.current_type = None
+        self.current_value = None
+        return self.input.pop_line()
+
+    def pop_expected(self, type_expected=None, value_expected=None):
+        self.__ensure_token()
+        if self.current_type != type_expected:
+            raise Exception("expected '%s', got %s %s" %
+                            (type_expected, self.current_type, self.current_value))
+        if value_expected != None:
+            if self.current_value != value_expected:
+                raise Exception("expected '%s', got %s" %
+                                (value_expected, self.current_value))
+        return self.pop()
+
+    def check_token(self, type_expected, value_expected=None):
+        self.__ensure_token()
+        if self.current_type != type_expected:
+            return False
+        if value_expected != None:
+            if self.current_value != value_expected:
+                return False
+        return True
+
+    def eof(self):
+        self.__ensure_token()
+        return self.current_type == None
+
+def in_range(ch, start, end):
+    if ch >= start and ch <= end:
+        return True
+    return False
+
+# D          [0-9]
+# L          [a-zA-Z_]
+# H          [a-fA-F0-9]
+# E          [Ee][+-]?{D}+
+# FS         (f|F|l|L)
+# IS         (u|U|l|L)*
+
+def is_D(ch):
+    return in_range(ch, '0', '9')
+
+def is_L(ch):
+    return in_range(ch, 'a', 'z') or in_range(ch, 'A', 'Z') or ch == '_'
+
+def is_H(ch):
+    return in_range(ch, 'a', 'f') or in_range(ch, 'A', 'F') or is_D(ch)
+
+def is_FS(ch):
+    return ch in 'fFlL'
+
+def is_IS(ch):
+    return ch in 'uUlL'
+
+class CLexer(Lexer):
+    def __init__(self, input, ignored_types=[]):
+        super(CLexer, self).__init__(input, ignored_types)
+
+    # used internally, external users should use
+    # CLexer.peek()/peek_type()/pop() instead
+    def get_token(self):
+        token = ''
+        while not self.input.eof():
+            ch = self.input.peek()
+
+            if is_L(ch):
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                while is_L(ch) or is_D(ch):
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                if token in [ 'auto', 'break', 'case', 'const', 'continue',
+                              'default', 'do', 'else', 'enum', 'extern',
+                              'for', 'goto', 'if', 'register', 'return',
+                              'signed', 'sizeof',
+                              'static', 'struct', 'typedef', 'union',
+                              'unsigned', 'volatile', 'while' ]:
+                    return (token, token)
+                else:
+                    return ('symbol', token)
+            elif ch == "'":
+                token += ch
+                self.input.pop()
+
+                ch = self.input.peek()
+                if ch == '\\':
+                    token += ch
+                    self.input.pop()
+                    token += self.input.pop()
+                else:
+                    token += self.input.pop()
+                token += self.input.pop()
+                return ('literal', token)
+            elif ch == '"':
+                token += ch
+                self.input.pop()
+
+                ch = self.input.peek()
+                while ch not in ['', '"']:
+                    token += ch
+                    self.input.pop()
+                    if ch == '\\':
+                        token += self.input.pop()
+                    ch = self.input.peek()
+                token += ch
+                self.input.pop()
+                return ('literal', token)
+            elif ch in '.><+-*/%&^|!;{},:=()[]~?':
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                tmp_token = token + ch
+                if tmp_token in ['<:']:
+                    return ('operator', '[')
+                elif tmp_token in [':>']:
+                    return ('operator', ']')
+                elif tmp_token in ['<%']:
+                    return ('operator', '{')
+                elif tmp_token in ['%>']:
+                    return ('operator', '}')
+                elif tmp_token == '//':
+                    token = tmp_token
+                    ch = self.input.peek()
+                    while ch != '\n' and ch != '':
+                        token += ch
+                        self.input.pop()
+                        ch = self.input.peek()
+                    return ('comment', token)
+                elif tmp_token == '/*':
+                    token = tmp_token
+                    self.input.pop()
+
+                    ch = self.input.peek()
+                    while True:
+                        while ch != '*':
+                            token += ch
+                            self.input.pop()
+                            ch = self.input.peek()
+                        token += ch
+                        self.input.pop()
+                        ch = self.input.peek()
+                        if ch == '/':
+                            token += ch
+                            self.input.pop()
+                            break
+                    return ('comment', token)
+                elif tmp_token in [ '+=', '-=', '*=', '/=', '%=', '&=', '^=',
+                                    '|=', '>>', '<<', '++', '--', '->', '&&',
+                                    '||', '<=', '>=', '==', '!=' ]:
+                    return ('operator', tmp_token)
+                else:
+                    return ('operator', token)
+            elif ch == '0':
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                if ch in 'xX':
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                    while is_H(ch):
+                        token += ch
+                        self.input.pop()
+                        ch = self.input.peek()
+                    while is_IS(ch):
+                        token += ch
+                        self.input.pop()
+                        ch = self.input.peek()
+                elif is_D(ch):
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                    while is_D(ch):
+                        token += ch
+                        self.input.pop()
+                        ch = self.input.peek()
+                return ('literal', token)
+            elif is_D(ch):
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                while is_D(ch):
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                return ('literal', token)
+            elif ch in ' \t\v\n\f':
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                while len(ch) and ch in ' \t\v\n\f':
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                return ('whitespace', token)
+            elif ch in '#':
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                while len(ch) and ch != '\n':
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                return ('directive', token)
+            else:
+                return ('unknown', ch)
+        return (None, None)
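
For anyone who wants to poke at this interactively, here is a rough usage
sketch (Python 2, matching the rest of the scripts; not part of the patch).
It drives CLexer over a small C snippet through the peek_type()/pop()
interface, with whitespace and comments in ignored_types; the snippet and the
assumption that scripts/ is on sys.path are just illustrative:

    # Usage sketch (not part of this patch): tokenize a C snippet with CLexer,
    # skipping whitespace and comment tokens.
    import StringIO
    from lexer import Input, CLexer

    source = "struct Foo { int x; /* counter */ char *name; };"
    lexer = CLexer(Input(StringIO.StringIO(source)),
                   ignored_types=['whitespace', 'comment'])

    while not lexer.eof():
        # peek_type() reports the token class; pop() consumes and returns it
        print lexer.peek_type(), lexer.pop()

Keywords come back as (keyword, keyword) pairs, everything else as
('symbol', ...), ('operator', ...) or ('literal', ...), which is what the
QIDL parser patches later in this series key off of.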