Atheris/atheris/lexer.py
2017-02-12 10:46:33 -07:00

215 lines
6.1 KiB
Python

from io import StringIO
from atheris.token import TokenType, Token
class Scanner(object):
def __init__(self, obj):
buf = None
if isinstance(obj, str):
buf = StringIO(obj)
else:
buf = obj
self.buf = buf
def readline(self):
return self.buf.readline()
def read(self):
return self.buf.read(1)
def unread(self):
self.buf.seek(self.buf.tell()-1)
class Lexer(object):
def __init__(self, obj):
self.scanner = Scanner(obj)
self.pos = 0
self.col = 0
self.line = 0
self.line_buf = ''
self.cur_char = ''
self.indent_stack = []
self.advance()
def fill_buffer(self):
self.line_buf = self.scanner.readline().rstrip('\n').rstrip('\r\n')
def advance(self):
self.pos += 1
self.col += 1
if self.pos < len(self.line_buf):
self.cur_char = self.line_buf[self.pos]
else:
# if we've reached the end of the line, fill er up again
self.fill_buffer()
if self.line_buf:
self.pos = 0
self.line += 1
self.col = 1
self.cur_char = self.line_buf[self.pos]
else:
# no more characters
self.cur_char = ''
def get_indent(self):
indent = 0
while self.cur_char.isspace():
indent += 1
self.pos += 1
self.col += 1
if self.pos < len(self.line_buf):
self.cur_char = self.line_buf[self.pos]
else:
break
# If the current character is a non-space and we read an indent
# level, that means we over-read the indent and need to back up
# one space
if not self.cur_char.isspace() and self.cur_char and indent > 0:
self.pos -= 1
self.col -= 1
token = None
if self.indent_stack:
last_indent = self.indent_stack[-1]
if indent < last_indent:
self.indent_stack.pop()
size = last_indent - indent
self.col -= 1
token = Token(TokenType.DEDENT, size, self.line, self.col)
self.advance()
elif indent > last_indent:
self.indent_stack.append(indent)
size = indent - last_indent
self.col -= 1
token = Token(TokenType.INDENT, size, self.line, self.col-size)
self.advance()
elif indent > 0:
self.indent_stack.append(indent)
self.col -= 1
token = Token(TokenType.INDENT, indent, self.line, 1)
self.advance()
return token
def handle_number(self, cur_line, cur_col):
num_str = ''
while ((self.cur_char.isdigit() or self.cur_char == '.') and
self.cur_char):
num_str += self.cur_char
self.advance()
return Token(TokenType.NUMBER, num_str, cur_line, cur_col)
def skip_whitespace(self):
while self.cur_char.isspace() and self.cur_char:
self.advance()
def handle_comments(self, cur_line, cur_col):
self.advance()
comment_str = ''
while self.cur_char and self.cur_char not in set(['\n', '\r']):
comment_str += self.cur_char
self.advance()
return Token(TokenType.COMMENT, comment_str, cur_line, cur_col)
def handle_single_keyword(self, cur_line, cur_col):
last = self.cur_char
self.advance()
return Token(Token.keyword_map[last], last,
cur_line, cur_col)
def handle_ident(self, cur_line, cur_col):
id_str = ''
while (not self.cur_char.isspace() and
not self.cur_char in Token.keyword_map and
self.cur_char):
id_str += self.cur_char
self.advance()
if self.cur_char in Token.keyword_map and not id_str:
token = Token(Token.keyword_map[self.cur_char],
self.cur_char,
cur_line, cur_col)
self.advance()
return token
if id_str in Token.keyword_map:
return Token(Token.keyword_map[id_str], id_str,
cur_line, cur_col)
else:
return Token(TokenType.IDENT, id_str, cur_line, cur_col)
def get_trailing_dedent(self, cur_line):
# Extra indents that have not been taken care of (ie: at the
# end of an indented file)
indent = self.indent_stack.pop()
content = indent
if self.indent_stack:
previous_indent = self.indent_stack[-1]
content = indent - previous_indent
return Token(TokenType.DEDENT, content, cur_line+1, 1)
def get_token(self, cur_col, cur_line):
if self.cur_char.isdigit() or self.cur_char == '.':
return self.handle_number(cur_line, cur_col)
elif self.cur_char == '#':
return self.handle_comments(cur_line, cur_col)
elif self.cur_char in Token.keyword_map:
return self.handle_single_keyword(cur_line, cur_col)
elif not self.cur_char.isspace():
return self.handle_ident(cur_line, cur_col)
def next_token(self):
cur_col = self.col
cur_line = self.line
if self.cur_char:
if self.pos == 0:
indent = self.get_indent()
if indent is not None:
return indent
self.skip_whitespace()
token = self.get_token(cur_line, cur_col)
if token:
return token
if self.indent_stack:
return self.get_trailing_dedent(cur_line)
return Token(TokenType.EOF, '', cur_line+1, 1)
def tokens(self):
token = self.next_token()
while token.token_type != TokenType.EOF:
yield token
token = self.next_token()
yield token
if __name__ == '__main__':
import sys
lexer = Lexer(open(sys.argv[1]))
for token in lexer.tokens():
print(token)