From 04364418c981a32d647e368c806bdd7ea030d8a1 Mon Sep 17 00:00:00 2001
From: Joey Payne
Date: Sun, 22 Jan 2017 16:31:51 -0700
Subject: [PATCH] Add initial lexer and test

---
Review notes (ignored by git am):
 - an identifier at the end of the input no longer loops forever
 - the character following an indent or dedent is no longer dropped
 - an empty line is no longer taken for the end of the input
 - identifiers, numbers and comments stop at the end of their line
 - indentation is measured for lines reached while skipping whitespace
 - token positions are taken after leading whitespace is skipped
 - the command line driver closes the file it opens
 The index hashes of atheris/lexer.py and tests/test_lexer.py are those
 of the original submission.

 atheris/__init__.py |   0
 atheris/lexer.py    | 214 ++++++++++++++++++++++++++++++++++++++++++++
 atheris/token.py    |  67 ++++++++++++++
 tests/__init__.py   |   3 +
 tests/example.ath   |   6 ++
 tests/indents.ath   |   3 +
 tests/test_lexer.py |  66 ++++++++++++++
 7 files changed, 359 insertions(+)
 create mode 100755 atheris/__init__.py
 create mode 100755 atheris/lexer.py
 create mode 100755 atheris/token.py
 create mode 100755 tests/__init__.py
 create mode 100755 tests/example.ath
 create mode 100755 tests/indents.ath
 create mode 100755 tests/test_lexer.py

diff --git a/atheris/__init__.py b/atheris/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/atheris/lexer.py b/atheris/lexer.py
new file mode 100755
index 0000000..ce22b10
--- /dev/null
+++ b/atheris/lexer.py
@@ -0,0 +1,214 @@
+from io import StringIO
+from atheris.token import TokenType, Token
+
+
+class Scanner(object):
+    """Uniform reader over either a string or a file-like object."""
+
+    def __init__(self, obj):
+        # Wrap plain strings so both kinds of input share one interface.
+        self.buf = StringIO(obj) if isinstance(obj, str) else obj
+
+    def readline(self):
+        """Return the next line of the input ('' once it is exhausted)."""
+        return self.buf.readline()
+
+    def read(self):
+        """Return the next character of the input ('' once exhausted)."""
+        return self.buf.read(1)
+
+    def unread(self):
+        """Move the read position of the input back by one."""
+        self.buf.seek(self.buf.tell()-1)
+
+
+class Lexer(object):
+    """Tokenizer that reads its input one line at a time.
+
+    A change in leading whitespace at the start of a line is reported as
+    an INDENT or DEDENT token whose content is the width of the change.
+    """
+
+    def __init__(self, obj):
+        self.scanner = Scanner(obj)
+        self.pos = 0            # index of cur_char within line_buf
+        self.col = 0            # column of cur_char, counted from 1
+        self.line = 0           # number of the buffered line, from 1
+        self.line_buf = ''
+        self.cur_char = ''      # '' once the input is exhausted
+        self.indent_stack = []  # widths of the indents still open
+        self.advance()
+
+    def fill_buffer(self):
+        """Read the next line into line_buf, minus its line terminator.
+
+        Returns False once the input is exhausted and True otherwise, so
+        that a blank line can be told apart from the end of the input.
+        """
+        raw = self.scanner.readline()
+        self.line_buf = raw.rstrip('\r\n')
+        return raw != ''
+
+    def advance(self, return_on_newline=False):
+        """Step to the next character, moving to the next line if needed.
+
+        Returns True when the step ran off the end of the current line.
+        With return_on_newline set nothing more is read in that case and
+        cur_char keeps its old value; otherwise cur_char becomes the first
+        character of the next non-empty line, or '' at the end of input.
+        """
+        self.pos += 1
+        self.col += 1
+
+        if self.pos < len(self.line_buf):
+            self.cur_char = self.line_buf[self.pos]
+            return False
+
+        if return_on_newline:
+            return True
+
+        # Empty lines hold no tokens: count them and keep reading, so that
+        # only a failed read is treated as the end of the input.
+        while self.fill_buffer():
+            self.line += 1
+            if self.line_buf:
+                self.pos = 0
+                self.col = 1
+                self.cur_char = self.line_buf[0]
+                return True
+
+        # no more characters
+        self.cur_char = ''
+        return True
+
+    def get_indent(self):
+        """Measure the leading whitespace of the line starting at cur_char.
+
+        Returns an INDENT or DEDENT token when the width differs from the
+        innermost open indent, otherwise None.
+        """
+        indent = 0
+
+        end_of_line = False
+        while self.cur_char.isspace() and not end_of_line:
+            indent += 1
+            end_of_line = self.advance(True)
+
+        # self.col is now one past the whitespace that was measured.
+        col = self.col - 1
+        token = None
+
+        if self.indent_stack:
+            last_indent = self.indent_stack[-1]
+            if indent < last_indent:
+                # NOTE(review): only one level is closed per call, so an
+                # indented line closing several levels gets one DEDENT.
+                self.indent_stack.pop()
+                token = Token(TokenType.DEDENT, last_indent - indent,
+                              self.line, col)
+            elif indent > last_indent:
+                self.indent_stack.append(indent)
+                size = indent - last_indent
+                token = Token(TokenType.INDENT, size, self.line, col - size)
+        elif indent > 0:
+            self.indent_stack.append(indent)
+            token = Token(TokenType.INDENT, indent, self.line, 1)
+
+        if token is not None and end_of_line:
+            # The line was all whitespace, so step on to the next one.  A
+            # scan that stopped on a real character must leave it current.
+            self.col -= 1
+            self.advance()
+
+        return token
+
+    def _read_while(self, accept):
+        """Consume characters of the current line while accept() is true.
+
+        Stops at the end of the line or of the input and returns the
+        consumed text.
+        """
+        start_line = self.line
+        chars = []
+        while (self.cur_char and self.line == start_line and
+               accept(self.cur_char)):
+            chars.append(self.cur_char)
+            self.advance()
+        return ''.join(chars)
+
+    def next_token(self):
+        """Return the next token of the input.
+
+        At the end of the input one DEDENT is returned for every indent
+        still open, and EOF on each call after that.
+        """
+        # Skip whitespace and comments, measuring the indentation of every
+        # line that is started on the way.
+        while self.cur_char:
+            if self.pos == 0:
+                indent = self.get_indent()
+                if indent is not None:
+                    return indent
+
+            if self.cur_char == '#':
+                # a comment runs to the end of its own line
+                self._read_while(lambda char: True)
+            elif self.cur_char.isspace():
+                self.advance()
+            else:
+                break
+
+        # Taken after skipping so that they locate the token itself rather
+        # than the whitespace in front of it.
+        cur_col = self.col
+        cur_line = self.line
+
+        if self.cur_char:
+            if self.cur_char.isdigit() or self.cur_char == '.':
+                num_str = self._read_while(
+                    lambda char: char.isdigit() or char == '.')
+                return Token(TokenType.NUMBER, num_str, cur_line, cur_col)
+
+            if self.cur_char in Token.keyword_map:
+                last = self.cur_char
+                self.advance()
+                return Token(Token.keyword_map[last], last,
+                             cur_line, cur_col)
+
+            id_str = self._read_while(
+                lambda char: not char.isspace() and
+                char not in Token.keyword_map)
+            # multi-character keywords take precedence over identifiers
+            token_type = Token.keyword_map.get(id_str, TokenType.IDENT)
+            return Token(token_type, id_str, cur_line, cur_col)
+
+        if self.indent_stack:
+            # Extra indents that have not been taken care of (ie: at the
+            # end of an indented file)
+            indent = self.indent_stack.pop()
+
+            content = indent
+
+            if self.indent_stack:
+                previous_indent = self.indent_stack[-1]
+                content = indent - previous_indent
+
+            return Token(TokenType.DEDENT, content, cur_line+1, 1)
+
+        return Token(TokenType.EOF, '', cur_line+1, 1)
+
+    def tokens(self):
+        """Yield every token of the input, the final EOF included."""
+        token = self.next_token()
+        while token.token_type != TokenType.EOF:
+            yield token
+            token = self.next_token()
+        yield token
+
+if __name__ == '__main__':
+    import sys
+
+    # close the source file even if lexing fails
+    with open(sys.argv[1]) as source:
+        for token in Lexer(source).tokens():
+            print(token)
diff --git a/atheris/token.py b/atheris/token.py
new file mode 100755
index 0000000..0681922
--- /dev/null
+++ b/atheris/token.py
@@ -0,0 +1,67 @@
+from enum import Enum
+
+class TokenType(Enum):
+    EOF = 1
+    WS = 2
+
+    IDENT = 3
+
+    LET = 4
+    VAR = 5
+
+    INT = 6
+    FLOAT = 7
+    STRING = 8
+    CHAR = 9
+
+    LEFT_BRACKET = 10
+    RIGHT_BRACKET = 11
+    EQUALS = 12
+    COLON = 13
+
+    DEF = 14
+    CLASS = 15
+
+    INDENT = 16
+    DEDENT = 17
+
+    FOR = 18
+    IN = 19
+
+    NUMBER = 20
+    QUOTE = 21
+    DB_QUOTE = 22
+
+
+class Token(object):
+
+    keyword_map = {
+        'def': TokenType.DEF,
+        'var': TokenType.VAR,
+        'class': TokenType.CLASS,
+        'let': TokenType.LET,
+        'int': TokenType.INT,
+        'float': TokenType.FLOAT,
+        'string': TokenType.STRING,
+        'char': TokenType.CHAR,
+        'for': TokenType.FOR,
+        'in': TokenType.IN,
+        '(': TokenType.LEFT_BRACKET,
+        ')': TokenType.RIGHT_BRACKET,
+        '=': TokenType.EQUALS,
+        ':': TokenType.COLON,
+        '\'': TokenType.QUOTE,
+        '"': TokenType.DB_QUOTE
+    }
+
+    def __init__(self, token_type=None, content=None, line=0, col=0):
+        self.token_type = token_type
+        self.content = content
+
+        self.line = line
+        self.col = col
+
+    def __str__(self):
+        return ('Token(type: {}, content: {}, '
+                'line: {}, col: {})'.format(self.token_type, self.content,
+                                            self.line, self.col))
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100755
index 0000000..dcebf64
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,3 @@
+import sys, os
+myPath = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, myPath + '/../')
diff --git a/tests/example.ath b/tests/example.ath
new file mode 100755
index 0000000..009d70d
--- /dev/null
+++ b/tests/example.ath
@@ -0,0 +1,6 @@
+def stuff(vars: int): string =
+    let x: float = 3.4
+    var
+        t: int = 4
+        junk: ClassName = ClassName()
+
diff --git a/tests/indents.ath b/tests/indents.ath
new file mode 100755
index 0000000..2aa16cd
--- /dev/null
+++ b/tests/indents.ath
@@ -0,0 +1,3 @@
+x
+    
+        
diff --git a/tests/test_lexer.py b/tests/test_lexer.py
new file mode 100755
index 0000000..7d390a7
--- /dev/null
+++ b/tests/test_lexer.py
@@ -0,0 +1,66 @@
+import pytest
+from atheris.lexer import Lexer
+from atheris.token import TokenType
+
+@pytest.fixture(scope="module")
+def lexer():
+    return Lexer(open('example.ath'))
+
+def test_hanging_indents():
+    lexer = Lexer('x\n    \n        \n')
+    tokens = list(lexer.tokens())
+
+    assert tokens[0].token_type == TokenType.IDENT
+    assert tokens[0].content == 'x'
+    assert tokens[0].line == 1
+    assert tokens[0].col == 1
+
+    assert tokens[1].token_type == TokenType.INDENT
+    assert tokens[1].content == 4
+    assert tokens[1].line == 2
+    assert tokens[1].col == 1
+
+    assert tokens[2].token_type == TokenType.INDENT
+    assert tokens[2].content == 4
+    assert tokens[2].line == 3
+    assert tokens[2].col == 4
+
+    assert tokens[3].token_type == TokenType.DEDENT
+    assert tokens[3].content == 4
+    assert tokens[3].line == 4
+    assert tokens[3].col == 1
+
+    assert tokens[4].token_type == TokenType.DEDENT
+    assert tokens[4].content == 4
+    assert tokens[4].line == 4
+    assert tokens[4].col == 1
+
+    assert tokens[5].token_type == TokenType.EOF
+    assert tokens[5].content == ''
+    assert tokens[5].line == 4
+    assert tokens[5].col == 1
+
+def test_identifier_at_end_of_input():
+    tokens = list(Lexer('x').tokens())
+
+    assert [t.token_type for t in tokens] == [TokenType.IDENT, TokenType.EOF]
+    assert tokens[0].content == 'x'
+
+def test_blank_line_is_not_end_of_input():
+    tokens = list(Lexer('x\n\ny\n').tokens())
+
+    found = [(t.content, t.line) for t in tokens]
+    assert found == [('x', 1), ('y', 3), ('', 4)]
+
+def test_indented_code_keeps_its_first_character():
+    tokens = list(Lexer('x\n    let y\nz\n').tokens())
+
+    assert [t.content for t in tokens] == ['x', 4, 'let', 'y', 4, 'z', '']
+    assert tokens[2].token_type == TokenType.LET
+    assert tokens[2].col == 5
+
+def test_comment_ends_with_its_line():
+    tokens = list(Lexer('# note\nx # more\ny\n').tokens())
+
+    found = [(t.content, t.line) for t in tokens]
+    assert found == [('x', 2), ('y', 3), ('', 4)]