Add initial lexer and test

This commit is contained in:
Joey Payne 2017-01-22 16:31:51 -07:00
commit 04364418c9
7 changed files with 301 additions and 0 deletions

0
atheris/__init__.py Executable file
View file

181
atheris/lexer.py Executable file
View file

@ -0,0 +1,181 @@
from io import StringIO
from atheris.token import TokenType, Token
class Scanner(object):
    """Thin reader over a string or a file-like object.

    A ``str`` is wrapped in ``StringIO``; any other object is used as-is and
    must provide the ``readline``, ``read``, ``seek`` and ``tell`` methods
    that the methods below call.
    """

    def __init__(self, obj):
        """Store *obj* (wrapping plain strings in ``StringIO``) as ``buf``."""
        self.buf = StringIO(obj) if isinstance(obj, str) else obj

    def readline(self):
        """Return the next line from the buffer (``''`` when exhausted)."""
        return self.buf.readline()

    def read(self):
        """Return the next single character (``''`` when exhausted)."""
        return self.buf.read(1)

    def unread(self):
        """Step the read position back by one.

        At the very start of the buffer there is nothing to un-read, so the
        call is a no-op instead of attempting a negative seek (which raises
        ``ValueError`` on ``StringIO``).
        """
        # NOTE(review): on text-mode files ``tell()`` returns an opaque
        # cookie, so ``tell() - 1`` is only dependable for StringIO-like
        # buffers -- confirm before using unread() on real text files.
        position = self.buf.tell()
        if position > 0:
            self.buf.seek(position - 1)
class Lexer(object):
    """Indentation-aware, line-buffered tokenizer.

    The source is read one line at a time into ``line_buf`` (without its
    line terminator); ``cur_char`` is the character at ``pos`` in that
    buffer and is ``''`` once the input is exhausted.  ``line`` and ``col``
    are 1-based.  Leading whitespace on a line (including whitespace-only
    lines) is compared with ``indent_stack`` and reported as INDENT /
    DEDENT tokens; completely empty lines are skipped.
    """

    def __init__(self, obj):
        """Create a lexer over *obj*, a string or file-like object."""
        self.scanner = Scanner(obj)
        self.pos = 0
        self.col = 0
        self.line = 0
        self.line_buf = ''
        self.cur_char = ''
        self.indent_stack = []
        # True once the scanner has returned '' (real end of input), which
        # is how an empty *line* is told apart from the end of the file.
        self.at_eof = False
        # DEDENT tokens still owed when one line closes several levels.
        self.pending = []
        self.advance()

    def fill_buffer(self):
        """Read one raw line into ``line_buf`` and record end of input."""
        raw = self.scanner.readline()
        self.at_eof = raw == ''
        self.line_buf = raw.rstrip('\r\n')

    def advance(self, return_on_newline=False):
        """Move to the next character.

        Returns ``True`` when the end of the current line was reached.  With
        ``return_on_newline`` the lexer then stays put (``cur_char`` is left
        unchanged); otherwise the next non-empty line is loaded, or
        ``cur_char`` becomes ``''`` if the input is exhausted.
        """
        self.pos += 1
        self.col += 1
        if self.pos < len(self.line_buf):
            self.cur_char = self.line_buf[self.pos]
            return False
        if return_on_newline:
            return True

        # End of line: fill the buffer again, skipping (but counting) lines
        # that are completely empty so they do not look like end of input.
        self.fill_buffer()
        while not self.line_buf and not self.at_eof:
            self.line += 1
            self.fill_buffer()

        if self.line_buf:
            self.pos = 0
            self.line += 1
            self.col = 1
            self.cur_char = self.line_buf[0]
        else:
            # no more characters
            self.cur_char = ''
        return True

    def get_indent(self):
        """Consume leading whitespace and return an INDENT/DEDENT token.

        Each whitespace character counts as one unit of indentation.
        Returns ``None`` when the indentation equals the current level.
        When a line closes several levels at once, the first DEDENT is
        returned and the remaining ones are queued in ``pending``.
        """
        indent = 0
        end_of_line = False
        while self.cur_char.isspace() and not end_of_line:
            indent += 1
            end_of_line = self.advance(True)

        # ``col`` now sits one past the indentation that was consumed.
        indent_col = self.col - 1
        previous = self.indent_stack[-1] if self.indent_stack else 0
        found = []
        if indent > previous:
            size = indent - previous
            # A first-level indent is reported at column 1, a nested one at
            # the column where the enclosing level ended.
            col = indent_col - size if self.indent_stack else 1
            self.indent_stack.append(indent)
            found.append(Token(TokenType.INDENT, size, self.line, col))
        elif indent < previous:
            # Close every level deeper than this line, one token per level.
            while self.indent_stack and self.indent_stack[-1] > indent:
                level = self.indent_stack.pop()
                floor = self.indent_stack[-1] if self.indent_stack else 0
                found.append(Token(TokenType.DEDENT,
                                   level - max(indent, floor),
                                   self.line, indent_col))

        if not found:
            return None
        if end_of_line:
            # Whitespace-only line: nothing else to lex on it, so move on to
            # the next line.  On a line with content the lexer must stay on
            # the first non-blank character so it is not skipped.
            self.col -= 1
            self.advance()
        self.pending.extend(found[1:])
        return found[0]

    def next_token(self):
        """Return the next token; an EOF token once the input is consumed.

        Before EOF, one DEDENT is returned for every indentation level that
        is still open.
        """
        if self.pending:
            return self.pending.pop(0)

        while self.cur_char:
            if self.pos == 0:
                indent = self.get_indent()
                if indent is not None:
                    return indent

            # Skip blanks, but stop at a line boundary so that the next
            # line's indentation is measured rather than thrown away.
            crossed_line = False
            while self.cur_char.isspace():
                if self.advance():
                    crossed_line = True
                    break
            if crossed_line:
                continue

            # Record the position where the token itself starts.
            cur_line = self.line
            cur_col = self.col
            first = self.cur_char

            if first == '#':
                # Comment: discard the remainder of this line only.
                while not self.advance():
                    pass
                continue

            if first.isdigit() or first == '.':
                num_str = ''
                while self.cur_char.isdigit() or self.cur_char == '.':
                    num_str += self.cur_char
                    if self.advance():
                        break  # a number never continues on the next line
                return Token(TokenType.NUMBER, num_str, cur_line, cur_col)

            if first in Token.keyword_map:
                # Single-character punctuation.
                self.advance()
                return Token(Token.keyword_map[first], first,
                             cur_line, cur_col)

            id_str = ''
            while (self.cur_char
                   and not self.cur_char.isspace()
                   and self.cur_char not in Token.keyword_map):
                id_str += self.cur_char
                if self.advance():
                    break  # a word never continues on the next line
            if id_str in Token.keyword_map:
                return Token(Token.keyword_map[id_str], id_str,
                             cur_line, cur_col)
            return Token(TokenType.IDENT, id_str, cur_line, cur_col)

        if self.indent_stack:
            # Extra indents that have not been taken care of (ie: at the
            # end of an indented file)
            indent = self.indent_stack.pop()
            floor = self.indent_stack[-1] if self.indent_stack else 0
            return Token(TokenType.DEDENT, indent - floor, self.line + 1, 1)
        return Token(TokenType.EOF, '', self.line + 1, 1)

    def tokens(self):
        """Yield every token in order, ending with the EOF token."""
        while True:
            token = self.next_token()
            yield token
            if token.token_type == TokenType.EOF:
                return
if __name__ == '__main__':
    import sys

    # Print every token of the file named by the first command-line
    # argument.  The file stays open while the lexer reads it line by line
    # and is closed as soon as the token stream is exhausted.
    with open(sys.argv[1]) as source:
        for token in Lexer(source).tokens():
            print(token)

67
atheris/token.py Executable file
View file

@ -0,0 +1,67 @@
from enum import Enum
class TokenType(Enum):
    """Enumeration of the token kinds used by the lexer and ``Token``."""

    EOF = 1
    WS = 2
    IDENT = 3
    # Reserved words and type names.
    LET = 4
    VAR = 5
    INT = 6
    FLOAT = 7
    STRING = 8
    CHAR = 9
    # Punctuation.
    LEFT_BRACKET = 10
    RIGHT_BRACKET = 11
    EQUALS = 12
    COLON = 13
    DEF = 14
    CLASS = 15
    # Changes in leading whitespace between lines.
    INDENT = 16
    DEDENT = 17
    FOR = 18
    IN = 19
    NUMBER = 20
    # Quote characters.
    QUOTE = 21
    DB_QUOTE = 22
class Token(object):
    """A single lexed token: its type, its content and its position."""

    # Reserved words and single-character punctuation, mapped to the token
    # type reported for them.
    keyword_map = {
        'def': TokenType.DEF,
        'var': TokenType.VAR,
        'class': TokenType.CLASS,
        'let': TokenType.LET,
        'int': TokenType.INT,
        'float': TokenType.FLOAT,
        'string': TokenType.STRING,
        'char': TokenType.CHAR,
        'for': TokenType.FOR,
        'in': TokenType.IN,
        '(': TokenType.LEFT_BRACKET,
        ')': TokenType.RIGHT_BRACKET,
        '=': TokenType.EQUALS,
        ':': TokenType.COLON,
        '\'': TokenType.QUOTE,
        '"': TokenType.DB_QUOTE,
    }

    def __init__(self, token_type=None, content=None, line=0, col=0):
        """Record the token's type, content and source position."""
        self.token_type, self.content = token_type, content
        self.line, self.col = line, col

    def __str__(self):
        """Return a one-line, human-readable description of the token."""
        template = 'Token(type: {}, content: {}, line: {}, col: {})'
        return template.format(self.token_type, self.content,
                               self.line, self.col)

3
tests/__init__.py Executable file
View file

@ -0,0 +1,3 @@
import sys, os
# Put the parent of this directory at the front of sys.path so that modules
# living next to the tests directory can be imported by the test modules.
myPath = os.path.dirname(os.path.abspath(__file__))
_parent_dir = myPath + '/../'
sys.path.insert(0, _parent_dir)

6
tests/example.ath Executable file
View file

@ -0,0 +1,6 @@
def stuff(vars: int): string =
let x: float = 3.4
var
t: int = 4
junk: ClassName = ClassName()

3
tests/indents.ath Executable file
View file

@ -0,0 +1,3 @@
x

41
tests/test_lexer.py Executable file
View file

@ -0,0 +1,41 @@
import pytest
from atheris.lexer import Lexer
from atheris.token import TokenType
@pytest.fixture(scope="module")
def lexer():
    """Yield a Lexer over ``example.ath`` and close the file afterwards.

    The sample is located relative to this test module, so the fixture does
    not depend on the directory pytest is started from.  The file has to
    stay open while the fixture is in use because the lexer reads it line
    by line on demand.
    """
    import os

    here = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(here, 'example.ath')) as source:
        yield Lexer(source)
def test_hanging_indents():
    """Whitespace-only lines still produce INDENT tokens.

    The input is ``x`` followed by a line of four spaces and a line of
    eight spaces; both open a level of size 4, and both levels are closed
    by DEDENT tokens just before EOF.
    """
    lex = Lexer('x\n    \n        \n')
    actual = [(token.token_type, token.content, token.line, token.col)
              for token in lex.tokens()]
    expected = [
        (TokenType.IDENT, 'x', 1, 1),
        (TokenType.INDENT, 4, 2, 1),
        (TokenType.INDENT, 4, 3, 4),
        (TokenType.DEDENT, 4, 4, 1),
        (TokenType.DEDENT, 4, 4, 1),
        (TokenType.EOF, '', 4, 1),
    ]
    assert actual == expected