From patchwork Fri Sep 21 14:07:41 2012 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Michael Roth X-Patchwork-Id: 185784 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [208.118.235.17]) (using TLSv1 with cipher AES256-SHA (256/256 bits)) (Client did not present a certificate) by ozlabs.org (Postfix) with ESMTPS id 1B9952C0088 for ; Sat, 22 Sep 2012 01:25:01 +1000 (EST) Received: from localhost ([::1]:36130 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1TF56B-0001ra-68 for incoming@patchwork.ozlabs.org; Fri, 21 Sep 2012 11:24:59 -0400 Received: from eggs.gnu.org ([208.118.235.92]:41352) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1TF3uU-0002D3-Da for qemu-devel@nongnu.org; Fri, 21 Sep 2012 10:08:57 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1TF3uL-0007Lz-Eu for qemu-devel@nongnu.org; Fri, 21 Sep 2012 10:08:50 -0400 Received: from mail-ie0-f173.google.com ([209.85.223.173]:34534) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1TF3uL-0007Cw-9R for qemu-devel@nongnu.org; Fri, 21 Sep 2012 10:08:41 -0400 Received: by mail-ie0-f173.google.com with SMTP id 17so990551iea.4 for ; Fri, 21 Sep 2012 07:08:41 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=sender:from:to:cc:subject:date:message-id:x-mailer:in-reply-to :references; bh=RRw5DmRP504QIe4GvExyc2lk4+6gyiIzzY2cnVBpf5E=; b=PjBa1e+ddQqSWfcm5TE+GQ9EtRTYgeCQ2mEtc7rr9iMbKEeO0Oge350Bf+DbPimX+W MgKAplhJ3guyXh68Sv1szHxIhNYzmpd1ESjUVBWC9LFZ90pKW5OfBXhQvsUSmhkvagD4 9Y/H98F5QjEDIlKoR52WUK7tagEcnrKfuEIpQGxLjMR8W9l2VFYp2aFELRkwVO85vM24 FVcVYj3ryD3HhgBODvNol1MXTDCVGh/vEGB4jX/gp7miMDiHlNp6jqe8SqcZP4RAirr3 4+DJqkkuRrG12uY7rNMJq2TZq6XaVDL9WHmGHm7rYzu9K36F1ExF1EKF1zDaO4sTZDLD 7FhA== Received: by 
10.50.157.227 with SMTP id wp3mr1771755igb.29.1348236521119; Fri, 21 Sep 2012 07:08:41 -0700 (PDT) Received: from loki.morrigu.org (cpe-72-179-62-111.austin.res.rr.com. [72.179.62.111]) by mx.google.com with ESMTPS id ua5sm17301156igb.10.2012.09.21.07.08.39 (version=TLSv1/SSLv3 cipher=OTHER); Fri, 21 Sep 2012 07:08:40 -0700 (PDT) From: Michael Roth To: qemu-devel@nongnu.org Date: Fri, 21 Sep 2012 09:07:41 -0500 Message-Id: <1348236465-23124-19-git-send-email-mdroth@linux.vnet.ibm.com> X-Mailer: git-send-email 1.7.9.5 In-Reply-To: <1348236465-23124-1-git-send-email-mdroth@linux.vnet.ibm.com> References: <1348236465-23124-1-git-send-email-mdroth@linux.vnet.ibm.com> X-detected-operating-system: by eggs.gnu.org: Genre and OS details not recognized. X-Received-From: 209.85.223.173 Cc: blauwirbel@gmail.com, peter.maydell@linaro.org, aliguori@us.ibm.com, eblake@redhat.com Subject: [Qemu-devel] [PATCH 18/22] qidl: add lexer library (based on QC parser) X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Adds an abstract Lexer class to handle tokenizer via a peek/pop/peekline/popline interface, along with an implementation for C based on the lexer from qc.git Signed-off-by: Michael Roth --- scripts/lexer.py | 306 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 scripts/lexer.py diff --git a/scripts/lexer.py b/scripts/lexer.py new file mode 100644 index 0000000..e740e5c --- /dev/null +++ b/scripts/lexer.py @@ -0,0 +1,306 @@ +# +# QEMU Lexer Library +# +# Copyright IBM, Corp. 2012 +# +# Authors: +# Anthony Liguori +# Michael Roth +# +# This work is licensed under the terms of the GNU GPLv2. +# See the COPYING file in the top-level directory. 
#
# The lexer code is based off of:
# http://www.lysator.liu.se/c/ANSI-C-grammar-l.html


class Input(object):
    """Character-at-a-time reader on top of a file-like object.

    One line is buffered at a time via ``fp.readline()``; ``offset`` is the
    index of the next unread character in that line.  Once ``readline()``
    returns an empty string the input is flagged as EOF, after which
    ``peek()``/``pop()`` return ``""``.
    """

    def __init__(self, fp):
        self.fp = fp
        self.line = None
        self.offset = 0
        self.is_eof = False
        self.__fill_buf()

    def __fill_buf(self):
        # Only read when the current line has been released and we have not
        # already seen EOF.
        if not self.line and not self.is_eof:
            self.line = self.fp.readline()
            if not self.line:
                self.is_eof = True

    def peek(self):
        """Return the next character without consuming it ("" at EOF)."""
        if self.is_eof:
            return ""
        return self.line[self.offset]

    def pop(self):
        """Consume and return the next character ("" at EOF)."""
        if self.is_eof:
            return ""
        ch = self.line[self.offset]
        self.offset += 1
        if self.offset == len(self.line):
            # Line exhausted: release it and buffer the next one.
            self.offset = 0
            self.line = None
            self.__fill_buf()
        return ch

    def peek_line(self):
        """Return the whole buffered line, including characters already
        consumed from it."""
        return self.line

    def pop_line(self):
        """Drop the buffered line, buffer the next one, and return the
        dropped line in full (including characters already consumed)."""
        line = self.line
        self.line = None
        self.offset = 0
        self.__fill_buf()
        return line

    def eof(self):
        """Return True once the underlying file has been exhausted."""
        return self.is_eof


class Lexer(object):
    """Abstract tokenizer with a peek/pop interface and one token of
    lookahead.

    Derived classes implement ``get_token()``, returning a
    ``(type, value)`` tuple.  Tokens whose type is listed in
    ``ignored_types`` are skipped silently.
    """

    def __init__(self, input, ignored_types=None):
        self.input = input
        # A fresh list per instance; a shared mutable default would leak
        # state between lexers.
        self.ignored_types = [] if ignored_types is None else ignored_types
        self.current_type = None
        self.current_value = None

    def get_token(self):
        """Read one raw token from ``self.input``; must be overridden."""
        # NotImplemented is a comparison sentinel, not an exception class;
        # calling it raises an unrelated TypeError.
        raise NotImplementedError("derived classes must implement this method")

    def __ensure_token(self):
        # Fill the lookahead slot with the next non-ignored token, if any.
        while self.current_type is None and not self.input.eof():
            t, v = self.get_token()
            if t not in self.ignored_types:
                self.current_type = t
                self.current_value = v

    def peek(self):
        """Return the value of the next token without consuming it."""
        self.__ensure_token()
        return self.current_value

    def peek_line(self):
        """Buffer the next token, then return the input's current line."""
        self.__ensure_token()
        return self.input.peek_line()

    def peek_type(self):
        """Return the type of the next token without consuming it."""
        self.__ensure_token()
        return self.current_type

    def pop(self):
        """Consume the next token and return its value (None at EOF)."""
        self.__ensure_token()
        v = self.current_value
        self.current_type = None
        self.current_value = None
        return v

    def pop_line(self):
        """Discard the buffered token, then drop and return the input's
        current line."""
        self.__ensure_token()
        self.current_type = None
        self.current_value = None
        return self.input.pop_line()

    def pop_expected(self, type_expected=None, value_expected=None):
        """Consume the next token, raising Exception if its type (and,
        when given, its value) is not the expected one."""
        self.__ensure_token()
        if self.current_type != type_expected:
            raise Exception("expected '%s', got %s %s" %
                            (type_expected, self.current_type,
                             self.current_value))
        if value_expected is not None:
            if self.current_value != value_expected:
                raise Exception("expected '%s', got %s" %
                                (value_expected, self.current_value))
        return self.pop()

    def check_token(self, type_expected, value_expected=None):
        """Return True if the next token has the given type (and value,
        when one is given); the token is not consumed."""
        self.__ensure_token()
        if self.current_type != type_expected:
            return False
        if value_expected is not None:
            if self.current_value != value_expected:
                return False
        return True

    def eof(self):
        """Return True when no further non-ignored token is available."""
        self.__ensure_token()
        return self.current_type is None


def in_range(ch, start, end):
    """Return True if ``start <= ch <= end``."""
    return start <= ch <= end

# D                       [0-9]
# L                       [a-zA-Z_]
# H                       [a-fA-F0-9]
# E                       [Ee][+-]?{D}+
# FS                      (f|F|l|L)
# IS                      (u|U|l|L)*


def is_D(ch):
    """Decimal digit."""
    return in_range(ch, '0', '9')


def is_L(ch):
    """Letter or underscore (valid identifier start)."""
    return in_range(ch, 'a', 'z') or in_range(ch, 'A', 'Z') or ch == '_'


def is_H(ch):
    """Hexadecimal digit."""
    return in_range(ch, 'a', 'f') or in_range(ch, 'A', 'F') or is_D(ch)


def is_FS(ch):
    """Floating-point suffix character."""
    # The length check matters: '' is "in" every string, and '' is what
    # Input.peek() returns at EOF.
    return len(ch) == 1 and ch in 'fFlL'


def is_IS(ch):
    """Integer suffix character."""
    # See is_FS() for why the empty string must be rejected explicitly.
    return len(ch) == 1 and ch in 'uUlL'


class CLexer(Lexer):
    """Lexer for C source.

    Token types produced: the keyword itself for recognised keywords,
    'symbol', 'literal', 'operator', 'comment', 'whitespace', 'directive'
    and 'unknown'.
    """

    _KEYWORDS = frozenset([
        'auto', 'break', 'case', 'const', 'continue',
        'default', 'do', 'else', 'enum', 'extern',
        'for', 'goto', 'if', 'register', 'return',
        'signed', 'sizeof',
        'static', 'struct', 'typedef', 'union',
        'unsigned', 'volatile', 'while',
    ])
    _PUNCTUATORS = '.><+-*/%&^|!;{},:=()[]~?'
    _WHITESPACE = ' \t\v\n\f'
    # Digraph spelling -> the operator it stands for.
    _DIGRAPHS = {'<:': '[', ':>': ']', '<%': '{', '%>': '}'}
    _TWO_CHAR_OPERATORS = frozenset([
        '+=', '-=', '*=', '/=', '%=', '&=', '^=',
        '|=', '>>', '<<', '++', '--', '->', '&&',
        '||', '<=', '>=', '==', '!=',
    ])

    def __init__(self, input, ignored_types=None):
        super(CLexer, self).__init__(input, ignored_types)

    def _take_while(self, token, accept):
        """Append characters to token while accept(ch) holds; always stops
        at EOF so no predicate can make the scan spin forever."""
        ch = self.input.peek()
        while ch != '' and accept(ch):
            token += self.input.pop()
            ch = self.input.peek()
        return token

    def _lex_word(self):
        """Lex an identifier or keyword."""
        token = self._take_while('', lambda c: is_L(c) or is_D(c))
        if token in self._KEYWORDS:
            return (token, token)
        return ('symbol', token)

    def _lex_quoted(self, quote):
        """Lex a string or character literal delimited by quote."""
        token = self.input.pop()                # opening quote
        ch = self.input.peek()
        while ch != '' and ch != quote:
            token += self.input.pop()
            if ch == '\\':
                # Keep the escaped character so an escaped quote does not
                # terminate the literal.
                token += self.input.pop()
            ch = self.input.peek()
        token += self.input.pop()               # closing quote, '' at EOF
        return ('literal', token)

    def _lex_block_comment(self, token):
        """Lex a /* ... */ comment; token holds the already-consumed '/'.
        An unterminated comment ends at EOF."""
        token += self.input.pop()               # the '*' of the opener
        while not self.input.eof():
            ch = self.input.pop()
            token += ch
            if ch == '*' and self.input.peek() == '/':
                token += self.input.pop()
                break
        return token

    def _lex_punctuation(self):
        """Lex an operator, digraph or comment."""
        token = self.input.pop()
        pair = token + self.input.peek()
        if pair in self._DIGRAPHS:
            self.input.pop()
            return ('operator', self._DIGRAPHS[pair])
        if pair == '//':
            # The second '/' is still unread; the scan below picks it up
            # together with the rest of the line.
            return ('comment', self._take_while(token, lambda c: c != '\n'))
        if pair == '/*':
            return ('comment', self._lex_block_comment(token))
        if pair in self._TWO_CHAR_OPERATORS:
            # Consume the second character so it is not lexed again as a
            # separate operator.
            self.input.pop()
            return ('operator', pair)
        return ('operator', token)

    def _lex_number(self):
        """Lex a decimal, octal or hexadecimal integer literal."""
        token = self.input.pop()
        # Tuple membership on purpose: "'' in 'xX'" is True at EOF.
        if token == '0' and self.input.peek() in ('x', 'X'):
            token += self.input.pop()
            token = self._take_while(token, is_H)
            token = self._take_while(token, is_IS)
        else:
            token = self._take_while(token, is_D)
        return ('literal', token)

    # used internally, external users should use
    # CLexer.peek()/peek_type()/pop() instead
    def get_token(self):
        """Read one raw token; returns (type, value), or (None, None) at
        EOF."""
        if self.input.eof():
            return (None, None)

        ch = self.input.peek()
        if is_L(ch):
            return self._lex_word()
        if ch == "'" or ch == '"':
            return self._lex_quoted(ch)
        if ch in self._PUNCTUATORS:
            return self._lex_punctuation()
        if is_D(ch):
            return self._lex_number()
        if ch in self._WHITESPACE:
            return ('whitespace',
                    self._take_while('', lambda c: c in self._WHITESPACE))
        if ch == '#':
            return ('directive',
                    self._take_while('', lambda c: c != '\n'))
        # Consume the character; otherwise every later call would return
        # the same 'unknown' token and callers could never advance.
        return ('unknown', self.input.pop())