From patchwork Wed Oct 31 22:36:08 2012
X-Patchwork-Submitter: Michael Roth
X-Patchwork-Id: 196051
From: Michael Roth
To: qemu-devel@nongnu.org
Cc: kwolf@redhat.com, peter.maydell@linaro.org, aliguori@us.ibm.com,
    blauwirbel@gmail.com, pbonzini@redhat.com
Date: Wed, 31 Oct 2012 17:36:08 -0500
Message-Id: <1351722972-17801-25-git-send-email-mdroth@linux.vnet.ibm.com>
In-Reply-To: <1351722972-17801-1-git-send-email-mdroth@linux.vnet.ibm.com>
References: <1351722972-17801-1-git-send-email-mdroth@linux.vnet.ibm.com>
X-Mailer: git-send-email 1.7.9.5
Subject: [Qemu-devel] [PATCH 24/28] qidl: add lexer library (based on QC parser)

Adds an abstract Lexer class that handles tokenization via a
peek/pop/peek_line/pop_line interface, along with an implementation for C
based on the lexer from qc.git.

Reviewed-by: Paolo Bonzini
Signed-off-by: Michael Roth
---
 scripts/lexer.py |  325 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 325 insertions(+)
 create mode 100644 scripts/lexer.py

diff --git a/scripts/lexer.py b/scripts/lexer.py
new file mode 100644
index 0000000..23cdc0e
--- /dev/null
+++ b/scripts/lexer.py
@@ -0,0 +1,325 @@
+#
+# QEMU Lexer Library
+#
+# Copyright IBM, Corp. 2012
+#
+# Authors:
+#  Anthony Liguori
+#  Michael Roth
+#
+# This work is licensed under the terms of the GNU GPLv2 or later.
+# See the COPYING file in the top-level directory.
+#
+# The lexer code is based off of:
+#   http://www.lysator.liu.se/c/ANSI-C-grammar-l.html
+
+class Input(object):
+    def __init__(self, fp):
+        self.fp = fp
+        self.filename = fp.name
+        self.lineno = 0
+        self.line = None
+        self.offset = 0
+        self.is_eof = False
+        self.__fill_buf()
+
+    def __repr__(self):
+        return "%s:%d" % (str(self.filename), self.lineno)
+
+    def __fill_buf(self):
+        if not self.line and not self.is_eof:
+            self.line = self.fp.readline()
+            if not self.line:
+                self.is_eof = True
+            else:
+                self.lineno = self.lineno + 1
+
+    def set_next_line(self, filename, lineno):
+        self.filename = filename
+        self.lineno = lineno - 1
+
+    def peek(self):
+        if self.is_eof:
+            return ""
+        return self.line[self.offset]
+
+    def pop(self):
+        if self.is_eof:
+            return ""
+        ch = self.line[self.offset]
+        self.offset += 1
+        if self.offset == len(self.line):
+            self.offset = 0
+            self.line = None
+            self.__fill_buf()
+        return ch
+
+    def peek_line(self):
+        return self.line
+
+    def pop_line(self):
+        line = self.line
+        self.line = None
+        self.offset = 0
+        self.__fill_buf()
+        return line
+
+    def eof(self):
+        return self.is_eof
+
+class Lexer(object):
+    def __init__(self, input, ignored_types=[]):
+        self.input = input
+        self.ignored_types = ignored_types
+        self.current_type = None
+        self.current_value = None
+
+    def get_token(self):
+        raise NotImplementedError("derived classes must implement this method")
+
+    def __repr__(self):
+        return repr(self.input)
+
+    def __ensure_token(self):
+        while self.current_type == None and not self.input.eof():
+            t, v = self.get_token()
+            if t not in self.ignored_types:
+                self.current_type = t
+                self.current_value = v
+
+    def peek(self):
+        self.__ensure_token()
+        return self.current_value
+
+    def peek_line(self):
+        self.__ensure_token()
+        return self.input.peek_line()
+
+    def peek_type(self):
+        self.__ensure_token()
+        return self.current_type
+
+    def pop(self):
+        self.__ensure_token()
+        v = self.current_value
+        self.current_type = None
+        self.current_value = None
+        return v
+
+    def pop_line(self):
+        self.__ensure_token()
+        self.current_type = None
+        self.current_value = None
+        return self.input.pop_line()
+
+    def pop_expected(self, type_expected=None, value_expected=None):
+        self.__ensure_token()
+        if self.current_type != type_expected:
+            raise Exception("%s: expected '%s', got %s %s" %
+                            (self, type_expected, self.current_type, self.current_value))
+        if value_expected != None:
+            if self.current_value != value_expected:
+                raise Exception("%s: expected '%s', got %s" %
+                                (self, value_expected, self.current_value))
+        return self.pop()
+
+    def check_token(self, type_expected, value_expected=None):
+        self.__ensure_token()
+        if self.current_type != type_expected:
+            return False
+        if value_expected != None:
+            if self.current_value != value_expected:
+                return False
+        return True
+
+    def eof(self):
+        self.__ensure_token()
+        return self.current_type == None
+
+def in_range(ch, start, end):
+    if ch >= start and ch <= end:
+        return True
+    return False
+
+# D   [0-9]
+# L   [a-zA-Z_]
+# H   [a-fA-F0-9]
+# E   [Ee][+-]?{D}+
+# FS  (f|F|l|L)
+# IS  (u|U|l|L)*
+
+def is_D(ch):
+    return in_range(ch, '0', '9')
+
+def is_L(ch):
+    return in_range(ch, 'a', 'z') or in_range(ch, 'A', 'Z') or ch == '_'
+
+def is_H(ch):
+    return in_range(ch, 'a', 'f') or in_range(ch, 'A', 'F') or is_D(ch)
+
+def is_FS(ch):
+    return ch in 'fFlL'
+
+def is_IS(ch):
+    return ch in 'uUlL'
+
+class CLexer(Lexer):
+    def __init__(self, input, ignored_types=[]):
+        super(CLexer, self).__init__(input, ignored_types)
+
+    # used internally, external users should use
+    # CLexer.peek()/peek_type()/pop() instead
+    def get_token(self):
+        token = ''
+        while not self.input.eof():
+            ch = self.input.peek()
+
+            if is_L(ch):
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                while is_L(ch) or is_D(ch):
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                if token in [ 'auto', 'break', 'case', 'const', 'continue',
+                              'default', 'do', 'else', 'enum', 'extern',
+                              'for', 'goto', 'if', 'register', 'return',
+                              'signed', 'sizeof', 'static', 'struct',
+                              'typedef', 'union', 'unsigned', 'volatile',
+                              'while' ]:
+                    return (token, token)
+                else:
+                    return ('symbol', token)
+            elif ch == "'":
+                token += ch
+                self.input.pop()
+
+                ch = self.input.peek()
+                if ch == '\\':
+                    token += ch
+                    self.input.pop()
+                    token += self.input.pop()
+                else:
+                    token += self.input.pop()
+                token += self.input.pop()
+                return ('literal', token)
+            elif ch == '"':
+                token += ch
+                self.input.pop()
+
+                ch = self.input.peek()
+                while ch not in ['', '"']:
+                    token += ch
+                    self.input.pop()
+                    if ch == '\\':
+                        token += self.input.pop()
+                    ch = self.input.peek()
+                token += ch
+                self.input.pop()
+                return ('literal', token)
+            elif ch in '.><+-*/%&^|!;{},:=()[]~?':
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                tmp_token = token + ch
+                if tmp_token in ['<:']:
+                    return ('operator', '[')
+                elif tmp_token in [':>']:
+                    return ('operator', ']')
+                elif tmp_token in ['<%']:
+                    return ('operator', '{')
+                elif tmp_token in ['%>']:
+                    return ('operator', '}')
+                elif tmp_token == '//':
+                    token = tmp_token
+                    ch = self.input.peek()
+                    while ch != '\n' and ch != '':
+                        token += ch
+                        self.input.pop()
+                        ch = self.input.peek()
+                    return ('comment', token)
+                elif tmp_token == '/*':
+                    token = tmp_token
+                    self.input.pop()
+
+                    ch = self.input.peek()
+                    while True:
+                        while ch != '*':
+                            token += ch
+                            self.input.pop()
+                            ch = self.input.peek()
+                        token += ch
+                        self.input.pop()
+                        ch = self.input.peek()
+                        if ch == '/':
+                            token += ch
+                            self.input.pop()
+                            break
+                    return ('comment', token)
+                elif tmp_token in [ '+=', '-=', '*=', '/=', '%=', '&=', '^=',
+                                    '|=', '>>', '<<', '++', '--', '->', '&&',
+                                    '||', '<=', '>=', '==', '!=' ]:
+                    return ('operator', tmp_token)
+                else:
+                    return ('operator', token)
+            elif ch == '0':
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                if ch in 'xX':
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                    while is_H(ch):
+                        token += ch
+                        self.input.pop()
+                        ch = self.input.peek()
+                    while is_IS(ch):
+                        token += ch
+                        self.input.pop()
+                        ch = self.input.peek()
+                elif is_D(ch):
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                    while is_D(ch):
+                        token += ch
+                        self.input.pop()
+                        ch = self.input.peek()
+                return ('literal', token)
+            elif is_D(ch):
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                while is_D(ch):
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                return ('literal', token)
+            elif ch in ' \t\v\n\f':
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                while len(ch) and ch in ' \t\v\n\f':
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                return ('whitespace', token)
+            elif ch in '#':
+                token += ch
+                self.input.pop()
+                ch = self.input.peek()
+                while len(ch) and ch != '\n':
+                    token += ch
+                    self.input.pop()
+                    ch = self.input.peek()
+                # parse lineno directives to set Input filename/lineno
+                tokens = token.split()
+                if len(tokens) > 2 and tokens[0] == "#" and tokens[1].isdigit():
+                    lineno, filename = int(tokens[1]), tokens[2][1:-1]
+                    self.input.set_next_line(filename, lineno)
+                return ('directive', token)
+            else:
+                return ('unknown', ch)
+        return (None, None)
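
For context, here is a minimal usage sketch of the interface added above
(illustrative only, not part of the patch): it runs a small C snippet through
CLexer with whitespace and comments filtered out via ignored_types. The
temporary file is just a convenient way to hand Input a file-like object with
a .name attribute, the snippet and the "example.h" name are made up, and it
assumes scripts/ is on the Python path.

import tempfile

from lexer import Input, CLexer   # assumes scripts/lexer.py is importable

# Small C snippet, including a line directive that the '#' branch above
# feeds back into Input's filename/lineno tracking.
src = '# 1 "example.h"\nstruct Foo { int x; /* counter */ };\n'

with tempfile.NamedTemporaryFile(mode='w+', suffix='.h') as fp:
    fp.write(src)
    fp.seek(0)

    lexer = CLexer(Input(fp), ignored_types=['whitespace', 'comment'])
    while not lexer.eof():
        # peek_type()/peek() inspect the current token; pop() consumes it
        print("%s %r" % (lexer.peek_type(), lexer.pop()))

With whitespace and comments suppressed by ignored_types, the loop should
print the line directive followed by the keyword, symbol, and operator tokens
of the struct declaration; a parser built on top of this would drive the same
peek/pop/pop_expected interface.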