diff --git a/lint.bat b/lint.bat new file mode 100644 index 0000000..11e9c96 --- /dev/null +++ b/lint.bat @@ -0,0 +1,2 @@ +venv\Scripts\activate.bat +black prototype diff --git a/prototype/README.md b/prototype/README.md new file mode 100644 index 0000000..4a090c1 --- /dev/null +++ b/prototype/README.md @@ -0,0 +1,28 @@ +# Light Operational Language + +Formerly `Light Object Language`, but then I decided that I wasn't going to use +OOP patterns. + +## Intent + +Create a transpiler that can rewrite a modern language to C. + +There are many limiting things in C89 and C99 that can be made by a more +intelligent preprocessing step. This is it. + +## Eventual Features + +In no particular order, here are some fun features that I may add. + +1. Generics. +2. Traits/Interfaces. +3. Lambdas/Closure. (???) +4. Borrow checker. (???) +5. Closures. (???) + +## Ecosystem + +1. VS Code extension with syntax highlighting. +2. Bootstrap this into its own language. What's nice is that once we write it, +it will generate C code so we can bootstrap it somewhat continuously (if you +know what I mean... I wasn't very clear). \ No newline at end of file diff --git a/prototype/__init__.py b/prototype/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/prototype/analyzer/deprecated/builtins.py b/prototype/analyzer/deprecated/builtins.py new file mode 100644 index 0000000..cb613a0 --- /dev/null +++ b/prototype/analyzer/deprecated/builtins.py @@ -0,0 +1,28 @@ +# from typing import Dict, Union +# +# from analyzer.lol_analyzer_types import TypeDef, FunctionDef +# +# +# def create_builtin_type(): +# namespace_type = TypeDef("namespace") +# int_type = TypeDef("int") +# bool_type = TypeDef("bool") +# float_type = TypeDef("float") +# str_type = TypeDef("str") +# +# # Create int ops +# int_type.add_binop("+", int_type, int_type) +# int_type.add_binop("-", int_type, int_type) +# int_type.add_binop("*", int_type, int_type) +# int_type.add_binop("/", int_type, int_type) +# +# int_type.add_binop("<", int_type, bool_type) +# int_type.add_binop("<=", int_type, bool_type) +# int_type.add_binop("==", int_type, bool_type) +# int_type.add_binop("!=", int_type, bool_type) +# int_type.add_binop(">=", int_type, bool_type) +# int_type.add_binop(">", int_type, bool_type) +# +# BUILTINS: Dict[str, Union[TypeDef, FunctionDef]] = { +# "int": TypeDef("int") +# } diff --git a/prototype/analyzer/deprecated/c_builtins/libc.py b/prototype/analyzer/deprecated/c_builtins/libc.py new file mode 100644 index 0000000..55c3144 --- /dev/null +++ b/prototype/analyzer/deprecated/c_builtins/libc.py @@ -0,0 +1,8 @@ +"""This file aids with the libc libraries.""" + + +class CLibrary: + def __init__(self, name): + pass + + diff --git a/prototype/analyzer/deprecated/c_keywords.py b/prototype/analyzer/deprecated/c_keywords.py new file mode 100644 index 0000000..9460881 --- /dev/null +++ b/prototype/analyzer/deprecated/c_keywords.py @@ -0,0 +1,198 @@ +""" +Add to list of used symbols (up to and including C99 standard). + +TODO +---- + +1. Each symbol from C should have: + a. its source, + b. its type, + c. etc. + + attached to make it easy to add to the symbol table. 
+""" +from enum import Enum, auto, unique +from typing import Dict + + +@unique +class SymbolSource(Enum): + # C Language + C_BUILTIN = auto() + C_STDLIB = auto() + C_STDIO = auto() + # LOL + LOL_BUILTIN = auto() + # User + USER = auto() + + +@unique +class CSymbolType(Enum): + MACRO = auto() + TYPE = auto() + CONSTANT = auto() + FUNCTION = auto() + OTHER = auto() + + +################################################################################ +### LANGUAGE KEYWORDS +################################################################################ +_C89_KEYWORDS: Dict[str, SymbolSource] = { + "auto": SymbolSource.C_BUILTIN, + "break": SymbolSource.C_BUILTIN, + "case": SymbolSource.C_BUILTIN, + "char": SymbolSource.C_BUILTIN, + "const": SymbolSource.C_BUILTIN, + "continue": SymbolSource.C_BUILTIN, + "default": SymbolSource.C_BUILTIN, + "do": SymbolSource.C_BUILTIN, + "double": SymbolSource.C_BUILTIN, + "else": SymbolSource.C_BUILTIN, + "enum": SymbolSource.C_BUILTIN, + "extern": SymbolSource.C_BUILTIN, + "float": SymbolSource.C_BUILTIN, + "for": SymbolSource.C_BUILTIN, + "goto": SymbolSource.C_BUILTIN, + "if": SymbolSource.C_BUILTIN, + "int": SymbolSource.C_BUILTIN, + "long": SymbolSource.C_BUILTIN, + "register": SymbolSource.C_BUILTIN, + "return": SymbolSource.C_BUILTIN, + "short": SymbolSource.C_BUILTIN, + "signed": SymbolSource.C_BUILTIN, + "sizeof": SymbolSource.C_BUILTIN, + "static": SymbolSource.C_BUILTIN, + "struct": SymbolSource.C_BUILTIN, + "switch": SymbolSource.C_BUILTIN, + "typedef": SymbolSource.C_BUILTIN, + "union": SymbolSource.C_BUILTIN, + "unsigned": SymbolSource.C_BUILTIN, + "void": SymbolSource.C_BUILTIN, + "volatile": SymbolSource.C_BUILTIN, + "while": SymbolSource.C_BUILTIN, +} +_C99_KEYWORDS: Dict[str, SymbolSource] = { + "inline": SymbolSource.C_BUILTIN, + "restrict": SymbolSource.C_BUILTIN, + "_Bool": SymbolSource.C_BUILTIN, + "_Complex": SymbolSource.C_BUILTIN, + "_Imaginary": SymbolSource.C_BUILTIN, +} +C_KEYWORDS = {**_C89_KEYWORDS, **_C99_KEYWORDS} + + +################################################################################ +### LANGUAGE KEYWORDS +################################################################################ +C_STDIO_KEYWORDS: Dict[str, SymbolSource] = { + # According to https://www.tutorialspoint.com/c_standard_library/stdio_h.htm + # Types + "size_t": SymbolSource.C_STDIO, + "FILE": SymbolSource.C_STDIO, + "fpost_t": SymbolSource.C_STDIO, + # Macros + "NULL": SymbolSource.C_STDIO, + "_IOFBF": SymbolSource.C_STDIO, + "_IOLBF": SymbolSource.C_STDIO, + "_IONBF": SymbolSource.C_STDIO, + "BUFSIZ": SymbolSource.C_STDIO, + "EOF": SymbolSource.C_STDIO, + "FOPEN_MAX": SymbolSource.C_STDIO, + "FILENAME_MAX": SymbolSource.C_STDIO, + "L_tmpnam": SymbolSource.C_STDIO, + "SEEK_CUR": SymbolSource.C_STDIO, + "SEEK_END": SymbolSource.C_STDIO, + "SEEK_SET": SymbolSource.C_STDIO, + "TMP_MAX": SymbolSource.C_STDIO, + "stderr": SymbolSource.C_STDIO, + "stdin": SymbolSource.C_STDIO, + "stdout": SymbolSource.C_STDIO, + # Functions + "fclose": SymbolSource.C_STDIO, + "clearerr": SymbolSource.C_STDIO, + "feof": SymbolSource.C_STDIO, + "ferror": SymbolSource.C_STDIO, + "fflush": SymbolSource.C_STDIO, + "fgetpos": SymbolSource.C_STDIO, + "fopen": SymbolSource.C_STDIO, + "fread": SymbolSource.C_STDIO, + "freopen": SymbolSource.C_STDIO, + "fseek": SymbolSource.C_STDIO, + "fsetpos": SymbolSource.C_STDIO, + "ftell": SymbolSource.C_STDIO, + "fwrite": SymbolSource.C_STDIO, + "remove": SymbolSource.C_STDIO, + "rename": SymbolSource.C_STDIO, + "rewind": 
SymbolSource.C_STDIO, + "setbuf": SymbolSource.C_STDIO, + "setvbuf": SymbolSource.C_STDIO, + "tmpfile": SymbolSource.C_STDIO, + "tmpnam": SymbolSource.C_STDIO, + "fprintf": SymbolSource.C_STDIO, + "printf": SymbolSource.C_STDIO, + "sprintf": SymbolSource.C_STDIO, + "vfprintf": SymbolSource.C_STDIO, + "vprintf": SymbolSource.C_STDIO, + "vsprintf": SymbolSource.C_STDIO, + "fscanf": SymbolSource.C_STDIO, + "scanf": SymbolSource.C_STDIO, + "sscanf": SymbolSource.C_STDIO, + "fgetc": SymbolSource.C_STDIO, + "fgets": SymbolSource.C_STDIO, + "fputc": SymbolSource.C_STDIO, + "fputs": SymbolSource.C_STDIO, + "getc": SymbolSource.C_STDIO, + "getchar": SymbolSource.C_STDIO, + "gets": SymbolSource.C_STDIO, # NOTE: very dangerous function! + "putc": SymbolSource.C_STDIO, + "putchar": SymbolSource.C_STDIO, + "puts": SymbolSource.C_STDIO, + "ungetc": SymbolSource.C_STDIO, + "perror": SymbolSource.C_STDIO, +} + +C_STDLIB_KEYWORDS = { + # According to https://www.tutorialspoint.com/c_standard_library/stdlib_h.htm + # Types + "size_t": SymbolSource.C_STDLIB, + "wchar_t": SymbolSource.C_STDLIB, + "div_t": SymbolSource.C_STDLIB, + "ldiv_t": SymbolSource.C_STDLIB, + # Macros + "NULL": SymbolSource.C_STDLIB, + "EXIT_FAILURE": SymbolSource.C_STDLIB, + "EXIT_SUCCESS": SymbolSource.C_STDLIB, + "RAND_MAX": SymbolSource.C_STDLIB, + "MB_CUR_MAX": SymbolSource.C_STDLIB, + # Functions + "atof": SymbolSource.C_STDLIB, + "atoi": SymbolSource.C_STDLIB, + "atol": SymbolSource.C_STDLIB, + "strtod": SymbolSource.C_STDLIB, + "strtol": SymbolSource.C_STDLIB, + "strtoul": SymbolSource.C_STDLIB, + "calloc": SymbolSource.C_STDLIB, + "free": SymbolSource.C_STDLIB, + "malloc": SymbolSource.C_STDLIB, + "realloc": SymbolSource.C_STDLIB, + "abort": SymbolSource.C_STDLIB, + "atexit": SymbolSource.C_STDLIB, + "exit": SymbolSource.C_STDLIB, + "getenv": SymbolSource.C_STDLIB, + "system": SymbolSource.C_STDLIB, + "bsearch": SymbolSource.C_STDLIB, + "qsort": SymbolSource.C_STDLIB, + "abs": SymbolSource.C_STDLIB, + "div": SymbolSource.C_STDLIB, + "labs": SymbolSource.C_STDLIB, + "ldiv": SymbolSource.C_STDLIB, + "rand": SymbolSource.C_STDLIB, + "srand": SymbolSource.C_STDLIB, + "mblen": SymbolSource.C_STDLIB, + "mbstowcs": SymbolSource.C_STDLIB, + "mbtowc": SymbolSource.C_STDLIB, + "wcstombs": SymbolSource.C_STDLIB, + "wctomb": SymbolSource.C_STDLIB, +} diff --git a/prototype/analyzer/deprecated/lol_analyzer.py b/prototype/analyzer/deprecated/lol_analyzer.py new file mode 100644 index 0000000..c5c148e --- /dev/null +++ b/prototype/analyzer/deprecated/lol_analyzer.py @@ -0,0 +1,77 @@ +from typing import Dict, List + +from prototype.analyzer.lol_analyzer_types import ( + LolAnalysisObj, + LolModule, +) +from prototype.parser.lol_parser import ( + ASTNode, + FunctionDefinitionNode, + ImportModuleNode, + VariableDefinitionNode, +) + + +### HELPER FUNCTIONS +def extract_names_in_module( + ast_nodes: List[ASTNode], raw_text: str +) -> LolModule: + """ + Extract names (only) of function definitions, global definitions, and + imports. + + TODO + ---- + 1. Add struct/enum/monad + """ + module = LolModule("", raw_text) + + for i, node in enumerate(ast_nodes): + if isinstance(node, FunctionDefinitionNode): + module.add_function_name(node) + elif isinstance(node, VariableDefinitionNode): + module.add_variable_definition_name(node) + elif isinstance(node, ImportModuleNode): + # TODO(dchu) - recursively add members to this submodule! + module.add_submodule(node) + # TODO(dchu): accept data structures + else: + # We will ignore anything outside of functions! 
This is an error + raise ValueError(f"{node} cannot be outside of functions!") + return module + + +def get_prototypes(module: LolModule, ast_nodes: List[ASTNode], raw_text: str): + """Get function and variable prototypes.""" + for i, node in enumerate(ast_nodes): + if isinstance(node, FunctionDefinitionNode): + module.add_function_prototype(node) + elif isinstance(node, VariableDefinitionNode): + module.add_variable_definition_prototype(node) + elif isinstance(node, ImportModuleNode): + pass + else: + # We will ignore anything outside of functions! This is an error + raise ValueError(f"{node} cannot be outside of functions!") + + +def get_bodies(module: LolModule, ast_nodes: List[ASTNode], raw_text: str): + for i, node in enumerate(ast_nodes): + if isinstance(node, FunctionDefinitionNode): + module.add_function_body(node) + elif isinstance(node, VariableDefinitionNode): + module.add_variable_definition_body(node) + elif isinstance(node, ImportModuleNode): + pass + else: + # We will ignore anything outside of functions! This is an error + raise ValueError(f"{node} cannot be outside of functions!") + + +def analyze(asts: List[ASTNode], raw_text: str) -> Dict[str, LolAnalysisObj]: + # Get names for functions, etc + module: LolModule = extract_names_in_module(asts, raw_text) + # Get prototypes for functions + get_prototypes(module, asts, raw_text) + + return module diff --git a/prototype/analyzer/deprecated/lol_analyzer_reserved_names.py b/prototype/analyzer/deprecated/lol_analyzer_reserved_names.py new file mode 100644 index 0000000..332e39b --- /dev/null +++ b/prototype/analyzer/deprecated/lol_analyzer_reserved_names.py @@ -0,0 +1,118 @@ +# TODO(dchu): we should exclude names that match either "__.*" and "_[A-Z].*". +from typing import Set + + +# Names reserved by the C standard library +STDIO_H_NAMES: Set[str] = { + # According to https://www.tutorialspoint.com/c_standard_library/stdio_h.htm + # Types + "size_t", + "FILE", + "fpost_t", + # Macros + "NULL", + "_IOFBF", + "_IOLBF", + "_IONBF", + "BUFSIZ", + "EOF", + "FOPEN_MAX", + "FILENAME_MAX", + "L_tmpnam", + "SEEK_CUR", + "SEEK_END", + "SEEK_SET", + "TMP_MAX", + "stderr", + "stdin", + "stdout", + # Functions + "fclose", + "clearerr", + "feof", + "ferror", + "fflush", + "fgetpos", + "fopen", + "fread", + "freopen", + "fseek", + "fsetpos", + "ftell", + "fwrite", + "remove", + "rename", + "rewind", + "setbuf", + "setvbuf", + "tmpfile", + "tmpnam", + "fprintf", + "printf", + "sprintf", + "vfprintf", + "vprintf", + "vsprintf", + "fscanf", + "scanf", + "sscanf", + "fgetc", + "fgets", + "fputc", + "fputs", + "getc", + "getchar", + "gets", # NOTE: very dangerous function! + "putc", + "putchar", + "puts", + "ungetc", + "perror", +} + +STDLIB_H_NAMES: Set[str] = { + # According to https://www.tutorialspoint.com/c_standard_library/stdlib_h.htm + # Types + "size_t", + "wchar_t", + "div_t", + "ldiv_t", + # Macros + "NULL", + "EXIT_FAILURE", + "EXIT_SUCCESS", + "RAND_MAX", + "MB_CUR_MAX", + # Functions + "atof", + "atoi", + "atol", + "strtod", + "strtol", + "strtoul", + "calloc", + "free", + "malloc", + "realloc", + "abort", + "atexit", + "exit", + "getenv", + "system", + "bsearch", + "qsort", + "abs", + "div", + "labs", + "ldiv", + "rand", + "srand", + "mblen", + "mbstowcs", + "mbtowc", + "wcstombs", + "wctomb", +} + +# TODO(dchu): include full set of C standard library names in here. 
+C_STANDARD_LIBRARY_NAMES: Set[str] = set().union(STDIO_H_NAMES, STDLIB_H_NAMES) diff --git a/prototype/analyzer/deprecated/lol_analyzer_types.py b/prototype/analyzer/deprecated/lol_analyzer_types.py new file mode 100644 index 0000000..e23c693 --- /dev/null +++ b/prototype/analyzer/deprecated/lol_analyzer_types.py @@ -0,0 +1,733 @@ +from abc import ABCMeta +from enum import Enum, auto, unique +from typing import Any, Dict, List, Set, Union + +import prototype.parser.lol_parser_types as parser_types +from prototype.analyzer.c_keywords import SymbolSource, C_KEYWORDS + + +@unique +class COpType(Enum): + CALL = auto() # (, ...), e.g. func_name(x, y, z) + ACCESS = auto() # [], e.g. array_name[x] + PREFIX = auto() # , e.g. +x + INFIX = auto() # , e.g. x+y + SUFFIX = auto() # , e.g. x++ + + +def parse_type_expression( + scope: "LolScope", type_expression: parser_types.TypeExpression +) -> "LolDataType": + if isinstance(type_expression, parser_types.Identifier): + name = type_expression.token.lexeme + if name not in scope: + raise ValueError( + f"type '{name}' not in symbol table '{scope}'" + ) + result = scope.search_scope(name) + assert isinstance(result, LolDataType) + return result + else: + raise ValueError(f"type '{type(type_expression)}' is unsupported") + + +def parse_value_expression( + scope: "LolScope", value_expression: parser_types.ValueExpression +) -> "ValueExpression": + if isinstance(value_expression, parser_types.Literal): + data_type = { + parser_types.StringLiteral: scope.search_scope("str"), + parser_types.DecimalLiteral: scope.search_scope("int32"), + }.get(type(value_expression)) + assert data_type is not None + return LiteralExpression(value_expression.value, data_type) + elif isinstance(value_expression, parser_types.VariableCallNode): + data_var_name = value_expression.get_name_as_str() + data_var = scope.search_scope(data_var_name) + assert isinstance(data_var, LolDataVariable) + return VariableCallExpression(data_var) + elif isinstance(value_expression, parser_types.OperatorValueExpression): + # Get operator + op_str = value_expression.get_operator_as_str() + # NOTE: operators for int/float are overloaded. How do we decide + # which one to get? + operator = scope.search_scope(op_str) + assert isinstance(operator, LolOperator) + # Get operands + analysis_args = [] + for parser_args in value_expression.get_operands(): + arg = parse_value_expression(parser_args) + analysis_args.append(arg) + return OperatorValueExpression(operator, analysis_args) + elif isinstance(value_expression, parser_types.FunctionCallNode): + # Get function + func_name = value_expression.get_name_as_str() + func = scope.search_scope(func_name) + assert isinstance(func, LolFunction) + # Get operands + analysis_args = [] + for parser_args in value_expression.get_arguments(): + arg = parse_value_expression(parser_args) + analysis_args.append(arg) + return FunctionCallExpression(func, analysis_args) + + +################################################################################ +### BUILDING BLOCKS +################################################################################ +class LolAnalysisObj(metaclass=ABCMeta): + """More like, 'named C object'. i.e. data type, variable, function.""" + + def __init__(self, name: str, alt_c_name: Union[str, None] = None): + self.name = name + self.c_name = alt_c_name if alt_c_name is not None else name + + +class LolDataType(LolAnalysisObj): + """ + Represent a data type in LOL. E.g. struct, enum, monad, or builtin type. 
+ """ + + def __init__( + self, + name: str, + alt_c_name: str = None, + ast_node: parser_types.ASTNode = None, + ): + super().__init__(name, alt_c_name) + self._ast_node = ast_node + self.functions: Dict[str, "LolFunction"] = {} + + def __repr__(self): + return f"{self.name}" + + def add_function(self, func: "LolFunction"): + self.functions[func.name] = func + + +class LolDataVariable(LolAnalysisObj): + def __init__( + self, + name: str, + ast_node: parser_types.ASTNode = None, + data_type: LolDataType = None, # Is unknown at instantiation + init_value: Any = None, + *, + alt_c_name: str = None, # Use name unless otherwise specified + is_mut: bool = False, # Opposite of C's const + is_unrestricted: bool = False, # Opposite of C's restrict + is_volatile: bool = False, # Equivalent to C's volatile + ): + super().__init__(name, alt_c_name) + self._ast_node = ast_node + self.data_type = data_type + self.init_value = init_value + + self.is_mut = is_mut + self.is_unrestricted = is_unrestricted + self.is_volatile = is_volatile + + def __repr__(self): + return f"{self.name}: {self.data_type} = {self.init_value}" + + def add_data_type(self, data_type: LolDataType): + self.data_type = data_type + + def add_init_value(self, init_value: Any): + self.init_value = init_value + + def get_return_type(self): + return self.data_type + + +class LolFunction(LolAnalysisObj): + def __init__( + self, + name: str, + ast_node: parser_types.ASTNode = None, # Only for user-defined func + params: List[LolDataVariable] = None, # DEPRECATED? + return_type: LolDataType = None, # DEPRECATED? + *, + alt_c_name: str = None, + c_op_type: COpType = COpType.CALL, + is_builtin_c: bool = None, # If so, do not perform checks yet... + is_pure: bool = None, + is_public: bool = None, + ): + super().__init__(name, alt_c_name) + + self._parameters = params + self._return_type = return_type + self._body = None + + # C stuff + self.c_op_type = c_op_type + self.is_builtin_c = is_builtin_c + + # Optimization + self.is_pure = is_pure + self.is_public = is_public + + def __repr__(self): + name = self.name + params = tuple( + self._parameters + ) if self._parameters is not None else "(?)" + ret_t = self._return_type if self._return_type is not None else "?" 
+ return f"function {name}{params} -> {ret_t}" + + ############################################################################ + ### FUNCTION PROTOTYPE + ############################################################################ + def add_prototype( + self, scope: "LolScope", func_node: parser_types.FunctionDefinitionNode + ): + parser_params = func_node.get_parameters() + self._add_parameters(scope, parser_params) + + parser_ret_t = func_node.get_return_type() + self._add_return_type(scope, parser_ret_t) + + def _add_parameters( + self, scope: "LolScope", + parser_params: List[parser_types.VariableDefinitionNode] + ): + if self._parameters is not None: + raise ValueError("trying to overwrite params") + params = [] + for var_def_node in parser_params: + name = var_def_node.get_name_as_str() + data_t = parse_type_expression(scope, var_def_node.get_data_type()) + p = LolDataVariable( + name=name, + data_type=data_t, + init_value=None, # We don't support default val functions + ) + params.append(p) + self._parameters = params + + def _add_return_type( + self, scope: "LolScope", return_type: parser_types.TypeExpression + ): + if self._return_type is not None: + raise ValueError("trying to overwrite return_type") + ret_t = parse_type_expression(scope, return_type) + self._return_type = ret_t + + ############################################################################ + ### FUNCTION BODY + ############################################################################ + def add_body( + self, scope: "LolScope", func_node: parser_types.FunctionDefinitionNode + ): + if self.is_builtin_c: + raise ValueError("trying to add body to built in function!") + # Type check + func_name = func_node.get_name_as_str() + func = scope.search_scope(func_name) + assert isinstance(func, LolFunction) + + analyzer_body: List[Statement] = [] + anon_var_counter = 0 + # NOTE: does not support function definitions or module imports here. + for i, statement in func_node.get_body(): + if isinstance(statement, parser_types.VariableDefinitionNode): + r = self._add_variable_definition_statement(scope, statement) + analyzer_body.append(r) + elif isinstance(statement, parser_types.VariableModificationNode): + r = self._add_variable_modification_statement(scope, statement) + analyzer_body.append(r) + elif isinstance(statement, parser_types.FunctionCallNode): + r = self._add_function_call_statement(scope, statement) + analyzer_body.append(r) + elif isinstance(statement, parser_types.ReturnNode): + raise NotImplementedError + # Allowing expressions means that we expect operators to potentially + # have side-effects! + elif isinstance( + statement, parser_types.OperatorValueExpression + ): + raise NotImplementedError + # TODO(dchu): ignore solitary literals and identifiers that have no + # TODO(dchu): ... side effects or observable actions. 
+ else: + raise ValueError(f"unsupported statement {statement}") + + def _add_variable_definition_statement( + self, scope: "LolScope", var_def: parser_types.VariableDefinitionNode + ): + lvalue = var_def.get_name_as_str() + parser_data_type = var_def.get_data_type() + parser_value = var_def.get_value() + + data_t = parse_type_expression(scope, parser_data_type) + value = parse_value_expression(scope, parser_value) + + data_var = LolDataVariable( + lvalue, var_def, data_type=data_t, init_value=value + ) + analyzer_statement = VariableDefinitionStatement( + lvalue=data_var, + data_type=data_t, + expression=value, + ) + return analyzer_statement + + def _add_variable_modification_statement( + self, scope: "LolScope", var_mod: parser_types.VariableModificationNode + ): + lvalue = var_mod.get_name_as_str() + parser_value = var_mod.get_value() + + value = parse_value_expression(scope, parser_value) + + data_var = scope.search_scope(lvalue) + analyzer_statement = VariableModificationStatement( + lvalue=data_var, + expression=value, + ) + return analyzer_statement + + def _add_function_call_statement( + self, scope: "LolScope", func_call: parser_types.FunctionCallNode + ) -> "ExpressionWithSideEffectStatement": + name = func_call.get_name_as_str() + args = func_call.get_arguments() + func = scope.search_scope(name) + analyzer_args = [parse_value_expression(scope, expr) for expr in args] + func_statement = FunctionCallExpression(func, analyzer_args) + analyzer_statement = ExpressionWithSideEffectStatement(func_statement) + return analyzer_statement + + def _add_return_statement( + self, scope: "LolScope", ret_stmt: parser_types.ReturnNode + ): + expr = ret_stmt.get_expression() + analyzer_expr = parse_value_expression(scope, expr) + analyzer_statement = ReturnStatement(analyzer_expr) + return analyzer_statement + + ############################################################################ + ### SEARCHING + ############################################################################ + def get_return_type(self): + return self._return_type + + +class LolOperator(LolAnalysisObj): + """TODO: maybe this could be derived from LolFunction?""" + + def __init__( + self, + operator: str, + operator_type: parser_types.OperatorType, + operand_types: List[LolDataType], + return_type: LolDataType, + ): + # This implies that the LOL operators directly shadow their C + # counterparts. That is, the c_alt_name == the name. + super().__init__(operator, operator) + self._operator = operator + self._operator_type = operator_type + self._operand_types = operand_types + self._return_type = return_type + + def get_return_type(self): + return self._return_type + + +################################################################################ +### VALUE EXPRESSIONS +################################################################################ +class ValueExpression(metaclass=ABCMeta): + """ + Abstract class for expressions with side effects. + + N.B. Expressions without side-effects may be safely removed. 
+ """ + + pass + + +class FunctionCallExpression(ValueExpression): + def __init__(self, function: LolFunction, arguments: List[ValueExpression]): + super().__init__() + self._function = function + self._arguments = arguments + + def get_return_type(self) -> LolDataType: + return self._function.get_return_type() + + +class OperatorValueExpression(ValueExpression): + def __init__( + self, + operator: LolOperator, + arguments: List[ValueExpression], + ): + super().__init__() + self._operator = operator + self._arguments = arguments + + def get_return_type(self) -> LolDataType: + return self._operator.get_return_type() + + +class LiteralExpression(ValueExpression): + """ + Assign a literal to a variable for easier debugging. + + N.B. corner case: if we are already assigning a literal to a named variable, + the we do not need to assign it to an unnamed variable. + + E.g. `named_var: int = 10;` does not need + `unnamed_var: int = 10; named_var: int = unnamed_var;` + """ + + def __init__( + self, + literal: parser_types.Literal, + data_type: LolDataType, + ): + super().__init__() + self._literal = literal + self._data_type = data_type + + def get_return_type(self) -> LolDataType: + return self._data_type + + +class VariableCallExpression(ValueExpression): + def __init__(self, data_variable: LolDataVariable): + self._data_variable = data_variable + + def get_return_type(self) -> LolDataType: + return self._data_variable.get_return_type() + + +################################################################################ +### INTERMEDIATE REPRESENTATION +################################################################################ +class Statement(metaclass=ABCMeta): + def __init__(self): + pass + + # @abstractmethod + # def emit(self): + # pass + + +class VariableDefinitionStatement(Statement): + """ = """ + + def __init__( + self, + data_type: LolDataType, + lvalue: LolDataVariable, + expression: ValueExpression, + ): + super().__init__() + self._data_type = data_type + self._lvalue = lvalue + self._expression = expression + + +class VariableModificationStatement(Statement): + def __init__( + self, + lvalue: LolDataVariable, + expression: ValueExpression, + ): + super().__init__() + self._lvalue = lvalue + self._expression = expression + + +class ExpressionWithSideEffectStatement(Statement): + """Expression with side-effects.""" + + def __init__( + self, + expression: ValueExpression, + ): + super().__init__() + self._expression = expression + + +class ReturnStatement(Statement): + def __init__(self, expr: ValueExpression): + super().__init__() + self.expr = expr + + +################################################################################ +### MODULE +################################################################################ +class LolScope(LolAnalysisObj): + def __init__( + self, name: str, outer_scope: Union["LolScope", None] + ): + """ + Params + ------ + * name: str - name of the scope (e.g. name of the function) + * outer_scope: LolScope | None - scope beyond this one. If None, then it + is the outermost scope (i.e. the global scope). + """ + super().__init__(name) + # Symbols in the C namespace (NOTE: the struct, function, and variable + # namespaces are all different in C, but we will keep them together). + # This namespace is just to ensure there are no collisions (since we do + # not allow name mangling). + # TODO(dchu): eventually, we may want the type of each symbol here! 
+ self._c_namespace: Dict[str, SymbolSource] = {**C_KEYWORDS} + # The symbol table is the set of usable names. + self._symbol_table: Dict[str, LolAnalysisObj] = {} + self._outer_scope = outer_scope + + def __repr__(self): + return repr(self._symbol_table) + + def create_inner_scope(self, name: str) -> "LolScope": + """ + Create a scope that nests within the current scope. + + By just deleting the inner scope once we are done with it, we may lose + some valuable debugging information. + + E.g. + ``` + let var_0: int = 0; + if var_0 == 0 { + let var_1: int = 1; + } + print(var_1); // We want to tell the user that var_1 is _no longer_ + // in scope! + ``` + """ + return LolScope(name, self) + + def add_to_scope( + self, + name: str, + obj: Union[LolDataType, LolDataVariable, LolFunction], + *, + source: SymbolSource = SymbolSource.USER, + ): + """Adds to scope if the symbol isn't already used; otherwise, it raises + an error.""" + # Assert types are correct (just to make debugging easier). This is + # important because we might have a Python str and a Token that are both + # supposed to be the same thing, but due to the differing types, they + # appear to be different. N.B. Tokens are not allowed in here! + assert isinstance(name, str) + assert isinstance(obj, (LolDataType, LolDataVariable, LolFunction)) + # Ensure names are not already used + if name in self._c_namespace: + msg = f"'{name}' already in C namespace '{self._c_namespace}'" + raise ValueError(msg) + if name in self._symbol_table: + msg = f"'{name}' already in symbol table '{self._symbol_table}'" + raise ValueError(msg) + # Add to C and LOL namespaces + self._c_namespace[name] = source + self._symbol_table[name] = obj + + def add_function( + self, + name: str, + ast_node: parser_types.FunctionDefinitionNode, + *, + source: SymbolSource = SymbolSource.USER, + ): + func = LolFunction( + name=name, + ast_node=ast_node, + ) + self.add_to_scope(name, func, source=source) + + def add_variable( + self, + name: str, + ast_node: parser_types.VariableDefinitionNode, + *, + source: SymbolSource = SymbolSource.USER, + ): + var = LolDataVariable( + name=name, + ast_node=ast_node, + data_type=None, + ) + self.add_to_scope(name, var, source=source) + + def search_scope(self, name: str, *, recursion_depth: int = 0): + # If found, then return! + if name in self._symbol_table: + return name + # If not found, then recurse + if self._outer_scope is None: + raise ValueError(f"unable to find {name} in any scope!") + elif recursion_depth >= 32: + raise ValueError(f"recursion depth is >=32. Infinite loop?") + return self._outer_scope.search_scope( + name, recursion_depth=recursion_depth + 1 + ) + + def assert_contains(self, name: str): + assert name in self._symbol_table + + +class LolModule(LolAnalysisObj): + """ + NOTES + ----- + + 1. No name mangling + * Thus, no overloading + 2. No custom types + 3. No default values for functions + 4. No default values for custom types + """ + + def __init__(self, name: str, raw_text: str): + # TODO(dchu): figure this one out! What should the names of modules be? + # TODO(dchu): The package? + super().__init__(name, "") + # Raw text is simply for debugging! + self._raw_text: str = raw_text + # C Standard Library Headers to Include + # I will include the angled brackets or quotations since it is not + # immediately obvious whether to use brackets or quotes otherwise. 
+ self.c_includes: List[str] = [] + + # Module scope is the outermost scope + self._scope = LolScope(name, outer_scope=None) + + def __repr__(self): + return f"LolModule(scope={self._scope})" + + + ############################################################################ + ### Add names to module + ############################################################################ + def add_function_name( + self, function_node: parser_types.FunctionDefinitionNode + ): + name = function_node.get_name_as_str() + self._scope.assert_contains(name) + self._scope.add_function(name, function_node, source=SymbolSource.USER) + + def add_variable_definition_name( + self, var_def_node: parser_types.VariableDefinitionNode + ): + name = var_def_node.get_name_as_str() + self._scope.add_variable(name, var_def_node, source=SymbolSource.USER) + + def add_submodule(self, submodule_node: parser_types.ImportModuleNode): + # TODO - this should just import all recursively! + name = submodule_node.get_name_as_str() + submodule_name = submodule_node.get_library_as_str() + if submodule_name == "stdio.h": + self.include_stdio(name) + else: + # self.symbol_table[name] = Module(name, "") + raise ValueError("general imports not supported!") + + ############################################################################ + ### Add prototypes to module + ############################################################################ + + def add_function_prototype( + self, func_node: parser_types.FunctionDefinitionNode + ): + name = func_node.get_name_as_str() + func = self._scope.search_scope(name) + assert isinstance(func, LolFunction) + func.add_prototype(self._scope, func_node) + + def add_variable_definition_prototype( + self, variable_definition_node: parser_types.VariableDefinitionNode + ): + name = variable_definition_node.get_name_as_str() + var_def = self._scope.search_scope(name) + assert isinstance(var_def, LolDataVariable) + var_def.add_data_type( + parse_type_expression( + variable_definition_node.get_data_type() + ) + ) + + ############################################################################ + # Add bodies + ############################################################################ + def add_function_body(self, func_node: parser_types.FunctionDefinitionNode): + name = func_node.get_name_as_str() + func = self._scope.search_scope(name) + assert isinstance(func, LolFunction) + func.add_body(self._scope, func_node) + + def add_variable_definition_body( + self, variable_definition_node: parser_types.VariableDefinitionNode + ): + name = variable_definition_node.get_name_as_str() + var_def = self._scope.search_scope(name) + assert isinstance(var_def, LolDataVariable) + var_def.add_init_value(variable_definition_node.get_value()) + + + +################################################################################ +### BUILTINS +################################################################################ + +# TODO +# ==== +# 1. Move builtins to separate file +# 2. 
blah + +# Data Types +lol_int32 = LolDataType("int32", alt_c_name="int") +lol_str = LolDataType("str", alt_c_name="char *") +lol_bool = LolDataType("bool", alt_c_name="int") +lol_void = LolDataType("void") + +# Placeholder Variables +unnamed_lol_int32 = LolDataVariable(None, lol_int32, None) +unnamed_lol_str = LolDataVariable(None, lol_str, None) + +# Functions +lol_printf = LolFunction( + "printf", + [unnamed_lol_str], + lol_int32, + is_builtin_c=True, +) + +builtins: Dict[str, Any] = { + "int32": lol_int32, + "string": lol_str, + "bool": lol_bool, +} + +includes: Set[str] = {"stdio.h"} + +lol_int32.add_function( + LolFunction( + "+", + [unnamed_lol_int32, unnamed_lol_int32], + lol_int32, + alt_c_name="+", + is_builtin_c=True, + c_op_type=COpType.INFIX, + ) +) +lol_int32.add_function( + LolFunction( + "-", + [unnamed_lol_int32, unnamed_lol_int32], + lol_int32, + alt_c_name="-", + is_builtin_c=True, + c_op_type=COpType.INFIX, + ) +) diff --git a/prototype/analyzer/deprecated/lol_analyzer_user_types.py b/prototype/analyzer/deprecated/lol_analyzer_user_types.py new file mode 100644 index 0000000..9d53e98 --- /dev/null +++ b/prototype/analyzer/deprecated/lol_analyzer_user_types.py @@ -0,0 +1,67 @@ +from prototype.analyzer.lol_analyzer_types import LolModule + + +class LolUserModule(LolModule): + def __init__(self, name: str, raw_text: str): + super().__init__(name, raw_text) + + # Add C and LOL builtins to C namespace and symbol table + if name == "": + self.add_lol_builtins() + + def add_lol_builtins(self): + """ + NOTE: this adds these to only the top level module. We can only run + this function once--otherwise, there will be duplicate int32 objects. + """ + self._scope.add_to_scope( + "int32", LolDataType("int32", alt_c_name="int") + ) + self._scope.add_to_scope("str", LolDataType("str", alt_c_name="char *")) + + def add_builtin_func(self, name: str): + if ( + name in self.c_namespace + and self.c_namespace[name] != SymbolSource.C_STDLIB + ): + raise ValueError( + f"user-defined symbol '{name}' already in " + f"C namespace '{self.c_namespace}'" + ) + self.c_namespace[name] = SymbolSource.C_STDLIB + self.symbol_table[name] = LolFunction(name, is_builtin_c=True) + + def include_stdio(self, lol_alias: str): + from prototype.analyzer.lol_analyzer_reserved_names import STDIO_H_NAMES + + lib_name = "" + if lib_name in self.c_includes: + raise ValueError( + f"module '{lib_name}' already in " + f"C includes list '{self.c_includes}'" + ) + self.c_includes.append(lib_name) + # Ensure that all names within stdio.h are unique + # NOTE: some C standard library names overlap without problem. + # E.g. NULL, size_t, etc. We take care of that by checking the source + # as well. + for stdio_name in STDIO_H_NAMES: + if stdio_name in self.c_namespace: + if self.c_namespace[stdio_name] == SymbolSource.C_STDLIB: + continue + else: + raise ValueError( + f"name '{stdio_name}' already in " + f"C namespace '{self.c_namespace}'" + ) + else: + self.c_namespace[stdio_name] = SymbolSource.C_STDLIB + # Check that the alias is unique in the symbol table too! 
+ if lol_alias in self.symbol_table: + raise ValueError( + f"name '{lol_alias}' already in " + f"symbol table '{self.symbol_table}'" + ) + stdio_namespace = LolModule(lol_alias, self._raw_text) + stdio_namespace.add_builtin_func("printf") + self.symbol_table[lol_alias] = stdio_namespace \ No newline at end of file diff --git a/prototype/analyzer/new_lol_analyzer.py b/prototype/analyzer/new_lol_analyzer.py new file mode 100644 index 0000000..3319478 --- /dev/null +++ b/prototype/analyzer/new_lol_analyzer.py @@ -0,0 +1,444 @@ +from typing import Any, Dict, Tuple, List, Optional, Union +from enum import Enum, auto, unique + +from prototype.parser.lol_parser import ( + ASTNode, + FunctionDefinitionNode, + ImportModuleNode, + VariableDefinitionNode, +) +import prototype.parser.lol_parser_types as parser_types +import prototype.lexer.lol_lexer_types as lexer_types + + +################################################################################ +### LOL ANALYSIS INTERMEDIATE REPRESENTATION +################################################################################ +LolIRExpression = Union["LolIRFunctionCallExpression", "LolIROperatorExpression", "LolIRLiteralExpression", "LolAnalysisVariable"] +LolIRStatement = Union["LolIRDefinitionStatement", "LolIRSetStatement", "LolIRFunctionCallStatement", "LolIRIfStatement", "LolIRReturnStatement"] + + +### Expressions +class LolIRFunctionCallExpression: + def __init__(self, function: "LolAnalysisFunction", arguments: List["LolAnalysisVariable"]): + self.function = function + self.arguments = arguments + + +class LolIROperatorExpression: + def __init__(self, op: str, operands: List["LolAnalysisVariable"]): + self.op = op + self.operands: List["LolAnalysisVariable"] = operands + + +class LolIRLiteralExpression: + def __init__(self, literal: Any): + self.literal = literal + + +### Statements +class LolIRDefinitionStatement: + def __init__(self, name: str, type: "LolAnalysisDataType", value: LolIRExpression): + self.name: str = name + self.type: "LolAnalysisDataType" = type + self.value = value + + +class LolIRSetStatement: + def __init__(self, name: str, value: LolIRExpression): + self.name = name + self.value = value + + +class LolIRFunctionCallStatement: + def __init__(self, func_call: LolIRFunctionCallExpression): + self.func_call = func_call + +class LolIRIfStatement: + def __init__(self, if_cond: "LolAnalysisVariable", if_body: List[LolIRStatement], else_body: List[LolIRStatement]): + self.if_cond = if_cond + self.if_body = if_body + self.else_body = else_body + + +class LolIRReturnStatement: + def __init__(self, ret_var: "LolAnalysisVariable"): + self.ret_var = ret_var + + +################################################################################ +### LOL ANALYSIS TYPES +################################################################################ +LolAnalysisDataType = Union["LolAnalysisBuiltinType"] +LolAnalysisSymbol = Union[LolAnalysisDataType, "LolAnalysisFunction", "LolAnalysisVariable"] + + +def optional_to_dict(obj: Any): + if obj is None: + return None + else: + return obj.to_dict() + + +def recursive_to_dict(obj: Optional[Dict[str, LolAnalysisSymbol]]): + """This is like optional_names() but calls to_dict() on each value.""" + if obj is None: + return None + else: + return {key: val.to_dict() for key, val in obj.items()} + + +def optional_names(obj: Optional[Dict[str, LolAnalysisSymbol]]): + if obj is None: + return None + else: + # assert isinstance(obj, Dict[str, LolAnalysisSymbol]) + return {key: val.name for key, val 
in obj.items()} + + +class LolAnalysisBuiltinType: + def __init__(self, name: str, ops: Dict[str, "LolAnalysisBuiltinType"]): + self.name = name + self.ops = ops + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name, + ops={op: dt.name for op, dt in self.ops.items()}, + id=id(self), + ) + + +def get_type( + type_ast: parser_types.TypeExpression, module_symbol_table: Dict[str, LolAnalysisSymbol] +) -> LolAnalysisDataType: + # TODO: Change this in when we support multi-token TypeExpressions + assert isinstance(type_ast, parser_types.Identifier) + type_token: lexer_types.Token = type_ast.token + type_name: str = type_token.as_str() + if type_name not in module_symbol_table: + raise ValueError(f"module symbol table should contain name {type_name}") + type_symbol: LolAnalysisDataType = module_symbol_table[type_name] + # Python 3.10 should support this due to PIP 604 + # assert isinstance(type_symbol, LolAnalysisDataType) + return type_symbol + + +class LolAnalysisVariable: + def __init__(self, name: str, ast_definition_node: VariableDefinitionNode): + self.name = name + self.ast_definition_node = ast_definition_node + + self.type: Optional[LolAnalysisDataType] = None + + def complete_prototype(self, module_symbol_table: Dict[str, LolAnalysisSymbol]): + assert self.type is None + self.type = get_type(self.ast_definition_node.get_data_type(), module_symbol_table) + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name, + type=optional_to_dict(self.type), + ) + + +class LolAnalysisFunction: + def __init__( + self, + name: str, + ast_definition_node: Optional[FunctionDefinitionNode], + *, + # Function Prototype + return_types: Optional[LolAnalysisDataType] = None, + parameter_types: Optional[List[LolAnalysisDataType]] = None, + parameter_names: Optional[List[str]] = None, + # Function Body + symbol_table: Optional[Dict[str, LolAnalysisSymbol]] = None, + body: Optional[List[LolIRStatement]] = None, + ): + self.name = name + self.ast_definition_node = ast_definition_node + + self.return_types: Optional[LolAnalysisDataType] = return_types + self.parameter_types: Optional[List[LolAnalysisDataType]] = parameter_types + self.parameter_names: Optional[List[str]] = parameter_names + + self.symbol_table: Optional[Dict[str, LolAnalysisSymbol]] = symbol_table + self.body: Optional[List[LolIRStatement]] = body + + def complete_prototype(self, module_symbol_table: Dict[str, LolAnalysisSymbol]): + assert self.return_types is None + assert self.parameter_types is None + assert self.parameter_names is None + self.return_types = get_type(self.ast_definition_node.get_return_type(), module_symbol_table) + self.parameter_types = [ + get_type(t.get_data_type(), module_symbol_table) for t in self.ast_definition_node.get_parameters() + ] + self.parameter_names = [ + get_type(t.get_name_as_str(), module_symbol_table) for t in self.ast_definition_node.get_parameters() + ] + + def _get_temporary_variable_name(self) -> str: + # NOTE: this is a complete hack! 
+ if not hasattr(self, "tmp_cnt"): + self.tmp_cnt = 0 + tmp = self.tmp_cnt + self.tmp_cnt += 1 + return f"%{tmp}" + + def _get_symbol(self, module_symbol_table: Dict[str, LolAnalysisSymbol], name: str): + split_names = name.split("::") + first_name = split_names[0] + + if first_name in self.symbol_table: + module = self.symbol_table + for name in split_names[:-1]: + module = module[name].module_symbol_table + last_name = split_names[-1] + print(module, last_name) + return module[last_name] + elif first_name in module_symbol_table: + module = module_symbol_table + for name in split_names[:-1]: + module = module[name].module_symbol_table + last_name = split_names[-1] + print(module, last_name) + return module[last_name] + else: + raise ValueError(f"symbol {first_name} not found in either module") + + def _parse_expression_recursively(self, x: parser_types.ASTNode, module_symbol_table: Dict[str, LolAnalysisSymbol]) -> str: + if isinstance(x, parser_types.OperatorValueExpression): + op_name: str = x.get_operator_as_str() + operands: List["LolAnalysisVariable"] = [ + self._get_symbol(module_symbol_table, self._parse_expression_recursively(y)) + for y in x.get_operands() + ] + ret = self._get_temporary_variable_name() + stmt = LolIRDefinitionStatement(ret, LolIROperatorExpression(op_name, operands)) + self.body.append(stmt) + self.symbol_table[ret] = LolAnalysisVariable(ret, x) + return ret + elif isinstance(x, parser_types.Literal): + if isinstance(x, parser_types.DecimalLiteral): + ret = self._get_temporary_variable_name() + stmt = LolIRDefinitionStatement( + ret, module_symbol_table["i32"], LolIRLiteralExpression(x.value) + ) + self.body.append(stmt) + self.symbol_table[ret] = LolAnalysisVariable(ret, x) + return ret + elif isinstance(x, parser_types.StringLiteral): + ret = self._get_temporary_variable_name() + stmt = LolIRDefinitionStatement( + ret, module_symbol_table["cstr"], LolIRLiteralExpression(x.value) + ) + self.body.append(stmt) + self.symbol_table[ret] = LolAnalysisVariable(ret, x) + return ret + elif isinstance(x, parser_types.FunctionCallNode): + func_name: str = x.get_name_as_str() + func: LolAnalysisFunction = self._get_symbol(module_symbol_table, func_name) + assert isinstance(func, LolAnalysisFunction) + args: List["LolAnalysisVariable"] = [ + self._get_symbol( + module_symbol_table, + self._parse_expression_recursively(y, module_symbol_table)) + for y in x.get_arguments() + ] + ret: str = self._get_temporary_variable_name() + stmt = LolIRDefinitionStatement( + ret, func.return_types, LolIRFunctionCallExpression(func, args) + ) + self.body.append(stmt) + self.symbol_table[ret] = LolAnalysisVariable(ret, x) + return ret + elif isinstance(x, parser_types.ReturnNode): + ret = self._parse_expression_recursively(x.get_expression(), module_symbol_table) + stmt = LolIRReturnStatement(self._get_symbol(module_symbol_table, ret)) + self.body.append(stmt) + elif isinstance(x, parser_types.VariableCallNode): + return x.get_name_as_str() + else: + raise NotImplementedError("") + + def complete_body(self, module_symbol_table: Dict[str, LolAnalysisSymbol]): + assert self.symbol_table is None + assert self.body is None + self.symbol_table = {} + self.body = [] + for statement in self.ast_definition_node.get_body(): + self._parse_expression_recursively(statement, module_symbol_table) + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name, + return_types=None, + parameter_types=None, + parameter_names=self.parameter_names, + 
symbol_table=optional_names(self.symbol_table), + body="TODO", + ) + + +class LolAnalysisModule: + def __init__(self, name: str, caller_module: Optional["LolAnalysisModule"] = None): + self.name = name + self.intermediate_repr: List[Any] = [] + self.module_symbol_table: Dict[str, LolAnalysisSymbol] = {} + + self.add_builtin_types(caller_module) + + def add_to_module_symbol_table(self, name, symbol): + if name in self.module_symbol_table: + raise ValueError(f"name {name} already in module symbol table") + self.module_symbol_table[name] = symbol + + def add_builtin_types(self, caller_module: Optional["LolAnalysisModule"]): + if caller_module is None: + i32 = LolAnalysisBuiltinType("i32", {}) + i32.ops["+"] = i32 + i32.ops["-"] = i32 + i32.ops["*"] = i32 + i32.ops["/"] = i32 + cstr = LolAnalysisBuiltinType("cstr", {}) + void = LolAnalysisBuiltinType("void", {}) + else: + # We want all of the built-in objects to be identical objects with + # even the pointers matching (so module_a's i32 is module_b's i32) + i32 = caller_module.module_symbol_table["i32"] + cstr = caller_module.module_symbol_table["cstr"] + void = caller_module.module_symbol_table["void"] + self.add_to_module_symbol_table("i32", i32) + self.add_to_module_symbol_table("cstr", cstr) + self.add_to_module_symbol_table("void", void) + + def to_dict(self): + # NOTE: This could end up in an infinite loop of recursion if we + # have circular imports; however, it is useful to see the verbose + # printing of modules, especially leaf modules. + return dict( + metatype=self.__class__.__name__, + name=self.name, + module_symbol_table=recursive_to_dict(self.module_symbol_table), + ) + + ### NAME + def _add_function_name(self, ast_definition: FunctionDefinitionNode): + name = ast_definition.get_name_as_str() + symbol = LolAnalysisFunction(name, ast_definition) + self.add_to_module_symbol_table(name, symbol) + + def _add_variable_name(self, ast_definition: VariableDefinitionNode): + name = ast_definition.get_name_as_str() + symbol = LolAnalysisVariable(name, ast_definition) + self.add_to_module_symbol_table(name, symbol) + + # TODO: merge this into the variable! + def _add_import_name(self, ast_definition: ImportModuleNode): + name = ast_definition.get_name_as_str() + library = ast_definition.get_library_as_str() + if library == "stdio.h": + symbol = LolAnalysisModule(library, caller_module=self) + i32: LolAnalysisBuiltinType = self.module_symbol_table["i32"] + cstr: LolAnalysisBuiltinType = self.module_symbol_table["cstr"] + printf_func = LolAnalysisFunction( + "printf", + None, + return_types=i32, + parameter_types=[cstr], + parameter_names=["format"], + ) + symbol.add_to_module_symbol_table("printf", printf_func) + else: + raise NotImplementedError("only stdio.h library is supported!") + self.add_to_module_symbol_table(name, symbol) + + def get_module_names(self, ast_nodes: List[ASTNode]): + """ + Extract names (only) of function definitions, module definitions, and + imports. + + TODO + ---- + 1. Add struct/enum/monad + """ + for i, node in enumerate(ast_nodes): + if isinstance(node, FunctionDefinitionNode): + self._add_function_name(node) + elif isinstance(node, VariableDefinitionNode): + self._add_variable_name(node) + elif isinstance(node, ImportModuleNode): + # TODO(dchu) - recursively add members to this submodule! + self._add_import_name(node) + # TODO(dchu): accept data structures + else: + # We will ignore anything outside of functions! 
This is an error + raise ValueError(f"{node} cannot be outside of functions!") + + ### PROTOTYPES + def add_function_prototype(self, ast_definition: FunctionDefinitionNode): + name = ast_definition.get_name_as_str() + func: LolAnalysisFunction = self.module_symbol_table[name] + func.complete_prototype(self.module_symbol_table) + + def add_variable_prototype(self, ast_definition: VariableDefinitionNode): + name = ast_definition.get_name_as_str() + var: LolAnalysisVariable = self.module_symbol_table[name] + var.complete_prototype(self.module_symbol_table) + + def add_import_prototype(self, ast_definition: ImportModuleNode): + # Intentionally do nothing + pass + + def get_module_prototypes(self, ast_nodes: List[ASTNode]): + """Get function and variable prototypes.""" + for i, node in enumerate(ast_nodes): + if isinstance(node, FunctionDefinitionNode): + self.add_function_prototype(node) + elif isinstance(node, VariableDefinitionNode): + self.add_variable_prototype(node) + elif isinstance(node, ImportModuleNode): + self.add_import_prototype(node) + else: + # We will ignore anything outside of functions! This is an error + raise ValueError(f"{node} cannot be outside of functions!") + + ### BODIES + def add_function_body(self, ast_definition: FunctionDefinitionNode): + name = ast_definition.get_name_as_str() + func: LolAnalysisFunction = self.module_symbol_table[name] + func.complete_body(self.module_symbol_table) + + def add_variable_body(self, ast_definition: VariableDefinitionNode): + # Intentionally do nothing + pass + + def add_import_body(self, ast_definition: ImportModuleNode): + # Intentionally do nothing + pass + + def get_module_bodies(self, ast_nodes: List[ASTNode]): + for i, node in enumerate(ast_nodes): + if isinstance(node, FunctionDefinitionNode): + self.add_function_body(node) + elif isinstance(node, VariableDefinitionNode): + self.add_variable_body(node) + elif isinstance(node, ImportModuleNode): + self.add_import_body(node) + else: + # We will ignore anything outside of functions! This is an error + raise ValueError(f"{node} cannot be outside of functions!") + + +def analyze(asts: List[ASTNode], raw_text: str) -> LolAnalysisModule: + module = LolAnalysisModule("main") + module.get_module_names(asts) + module.get_module_prototypes(asts) + module.get_module_bodies(asts) + print(module.to_dict()) + + return module diff --git a/prototype/emitter/lol_emitter.py b/prototype/emitter/lol_emitter.py new file mode 100644 index 0000000..f63d844 --- /dev/null +++ b/prototype/emitter/lol_emitter.py @@ -0,0 +1,100 @@ +""" +Take the AST and emit C code. + +TODO +---- + +1. Minimal Viable Product +2. 
Correct indentation +""" +from prototype.analyzer.new_lol_analyzer import ( + LolAnalysisModule, LolAnalysisFunction, LolAnalysisBuiltinType, + LolIRReturnStatement, LolIRFunctionCallStatement, LolIRDefinitionStatement, LolIRSetStatement, + LolIRExpression, + LolIRFunctionCallExpression, LolIROperatorExpression, LolIRLiteralExpression, LolAnalysisVariable +) + +headers = """ +#include +#include +""" + +lol_to_c_types = {"cstr": "char *", "i32": "int", "void": "void"} + + +def mangle_var_name(var_name: str) -> str: + return var_name.replace("%", "LOLvar_") + + +def emit_expr(expr: LolIRExpression) -> str: + if isinstance(expr, LolIRFunctionCallExpression): + func_name = expr.function.name + func_args = [mangle_var_name(arg.name) for arg in expr.arguments] + return f"{func_name}({', '.join(func_args)})" + elif isinstance(expr, LolIROperatorExpression): + if len(expr.operands) == 1: + return f"{expr.op}{mangle_var_name(expr.operands[0].name)}" + elif len(expr.operands) == 2: + return f"{mangle_var_name(expr.operands[0].name)} {expr.op} {mangle_var_name(expr.operands[1].name)}" + else: + raise ValueError("only 1 or 2 operands accepted!") + elif isinstance(expr, LolIRLiteralExpression): + literal = expr.literal + if isinstance(literal, str): + return f"\"{literal}\"" + elif isinstance(literal, int): + return f"{expr.literal}" + elif isinstance(expr, LolAnalysisVariable): + return f"{mangle_var_name(expr.name)}" + + +def emit_function(func: LolAnalysisFunction): + prototype = ( + f"{lol_to_c_types[func.return_types.name]}\n" + f"{func.name}({', '.join((f'{lol_to_c_types[arg_type]} {arg_name.name}' for arg_type, arg_name in zip(func.parameter_names, func.parameter_types)))})\n" + ) + statements = [] + for stmt in func.body: + if isinstance(stmt, LolIRDefinitionStatement): + var_name = mangle_var_name(stmt.name) + var_type = lol_to_c_types[stmt.type.name] + var_value = emit_expr(stmt.value) + statements.append(f" {var_type} {var_name} = {var_value};") + elif isinstance(stmt, LolIRSetStatement): + var_name = mangle_var_name(stmt.name) + var_value = emit_expr(stmt.value) + statements.append(f" {var_name} = {var_value};") + elif isinstance(stmt, LolIRFunctionCallStatement): + code = emit_expr(stmt.func_call) + statements.append(f" {code};") + elif isinstance(stmt, LolIRReturnStatement): + name = mangle_var_name(stmt.ret_var.name) + statements.append(f" return {name};") + else: + raise ValueError("unrecognized statement type (maybe if statement?)") + return prototype + "{\n" + "\n".join(statements) + "\n}\n" + + +def emit_import(include: LolAnalysisModule): + return f"#include <{include.name}>" + + +def emit_c(analysis_module: LolAnalysisModule): + import_statements = [] + func_statements = [] + # Emit modules + for name, s in analysis_module.module_symbol_table.items(): + if isinstance(s, LolAnalysisModule): + import_statements.append(emit_import(s)) + elif isinstance(s, LolAnalysisFunction): + func_statements.append(emit_function(s)) + elif isinstance(s, LolAnalysisBuiltinType): + # Obviously, we don't need to define built-in types + continue + else: + raise ValueError("unrecognized statement type") + + statements = import_statements + func_statements + code = "\n".join(statements) + print(code) + return code diff --git a/prototype/error/__init__.py b/prototype/error/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/prototype/lol_error.py b/prototype/error/lol_error.py similarity index 62% rename from prototype/lol_error.py rename to prototype/error/lol_error.py index 
68918f1..bd61c9c 100644 --- a/prototype/lol_error.py +++ b/prototype/error/lol_error.py @@ -1,45 +1,73 @@ -from lol_lexer_types import Token -from lol_parser_helper import TokenStream - - -def print_error(text: str, line_number: int, column_number: int): - print(text.split("\n")[line_number - 1]) - print(" " * (column_number - 1) + "^") - - -def print_token_error(text: str, token: Token, error_msg: str) -> None: - print( - "--------------------------------------------------------------------------------" - ) - print( - f"Tokenization error on line " - f"{token.line_number}, column {token.column_number}" - ) - print(f"Error Message: {error_msg}") - print("```") - print_error(text, token.line_number, token.column_number) - print("```") - print( - "--------------------------------------------------------------------------------" - ) - return - - -def print_parser_error(stream: TokenStream, error_msg: str) -> None: - text = stream.get_text() - token = stream.get_token() - print( - "--------------------------------------------------------------------------------" - ) - print( - f"Parse error on line " - f"{token.line_number}, column {token.column_number}" - ) - print(f"Error Message: {error_msg}") - print("```") - print_error(text, token.line_number, token.column_number) - print("```") - print( - "--------------------------------------------------------------------------------" - ) - return +""" + +TODO +---- + +- [ ] Make this more user-friendly + - Pass token start position + length (we can work out the line number/column here) + - + +""" +from lexer.lol_lexer_types import Token +from parser.lol_parser_token_stream import TokenStream + + +def print_error(text: str, line_number: int, column_number: int): + print(text.split("\n")[line_number - 1]) + print(" " * (column_number - 1) + "^") + + +def print_token_error(text: str, token: Token, error_msg: str) -> None: + print( + "--------------------------------------------------------------------------------" + ) + print( + f"Tokenization error on line " + f"{token.line_number}, column {token.column_number}" + ) + print(f"Error Message: {error_msg}") + print("```") + print_error(text, token.line_number, token.column_number) + print("```") + print( + "--------------------------------------------------------------------------------" + ) + return + + +def print_parser_error(stream: TokenStream, error_msg: str) -> None: + text = stream.get_text() + token = stream.get_token() + print( + "--------------------------------------------------------------------------------" + ) + print( + f"Parse error on line " + f"{token.line_number}, column {token.column_number}" + ) + print(f"Error Message: {error_msg}") + print("```") + print_error(text, token.line_number, token.column_number) + print("```") + print( + "--------------------------------------------------------------------------------" + ) + return + + +def print_analyzer_error(text: str, token: Token, error_msg: str) -> None: + print( + "--------------------------------------------------------------------------------" + ) + print( + f"Analysis error on line " + f"{token.line_number}, column {token.column_number}" + ) + print(f"Error Message: {error_msg}") + print("```") + print_error(text, token.line_number, token.column_number) + print("```") + print( + "--------------------------------------------------------------------------------" + ) + return diff --git a/prototype/examples/fibonacci.lol b/prototype/examples/fibonacci.lol index d39d7e0..ae34c8b 100644 --- a/prototype/examples/fibonacci.lol +++ 
b/prototype/examples/fibonacci.lol @@ -1,5 +1,5 @@ ### Recursive Fibonacci Sequence -namespace io = import("io"); +module io = import("stdio.h"); function fibonacci(n: int64) -> int64 { return fibonacci(n - 1) + fibonacci(n - 2); diff --git a/prototype/examples/helloworld.lol b/prototype/examples/helloworld.lol index ac3783a..cc6a261 100644 --- a/prototype/examples/helloworld.lol +++ b/prototype/examples/helloworld.lol @@ -1,7 +1,7 @@ ### Basic hello world -namespace io = import("io"); +module io = import("stdio.h"); -function main() -> int32 { - io::stdout("Hello, World!\n"); +function main() -> i32 { + io::printf("Hello, World!\n"); return 0; } \ No newline at end of file diff --git a/prototype/examples/invalid/duplicate_function_names.lol b/prototype/examples/invalid/duplicate_function_names.lol new file mode 100644 index 0000000..7d08a23 --- /dev/null +++ b/prototype/examples/invalid/duplicate_function_names.lol @@ -0,0 +1,13 @@ +function duplicate() -> int32 { + return 0; +} + +# Duplicate function already defined. +function duplicate() -> int32 { + return 1; +} + +function main() -> int32 { + let r: int64 = duplicate(); + return 0; +} \ No newline at end of file diff --git a/prototype/lexer/__init__.py b/prototype/lexer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/prototype/lol_lexer.py b/prototype/lexer/lol_lexer.py similarity index 93% rename from prototype/lol_lexer.py rename to prototype/lexer/lol_lexer.py index 147e347..858f11e 100644 --- a/prototype/lol_lexer.py +++ b/prototype/lexer/lol_lexer.py @@ -1,253 +1,247 @@ -""" -# Parser - -## Language - -This parser will parse white-space separated tokens. It will output the results -into a text file in the form: - -``` -,,,, -``` - -We use white-space as the delimiter because CSVs are well formed. - -******************************************************************************** - -The accepted tokens are (ASCII): - -1. identifiers : [A-Za-z_][A-Za-z0-9_]* - - Keywords: if, else, while, function, return, let, namespace -2. decimal integers : [1-9][0-9]* -3. strings : ["](\"|[^"])*["] -4. parentheses : "(" or ")" -5. braces : "{" or "}" -6. brackets : "[" or "]" -7. dot : "." -8. comma : "," -9. equals : "=" -10. colon : ":" -11. comments : #.* -12. semicolon : ";" -13. arrow : "->" # N.B. NOT a binary op. This is only used in the context of functions - -******************************************************************************** - -Future tokens to accept in the future are: - -1. More types of numbers (binary, octal, hexadecimal, floats, scientific notation) -2. Differentiate keywords from identifiers (not really necessary, we can do this later) -3. Add support for other operators (!, %, ^, &, *, +, -, ==, <, >, <=, >=, ->, |, ~, /, //) -4. Add escaping for strings ("\n\t\\\v\a\b") -5. Add support for single quote strings? -6. Add multiline strings ('''multiline string''') -7. Add multiline comments -""" - -lexeme = """ -# My first program! 
-print("Hello, World!") -""" - -medium_text = """ -io = import("io") - -io.stdout("Hello, World!") - -function sum3(a: int, b: int, c: int): int { - return a.add(b).add(c) -} - -let a: int = 0 -let b: int = 1 -let c: int = 2 -let d: int = sum3(a, b, c) -io.stdout("Answer: ", d) -""" - -advanced_text = """ -io = import ( "io" ); - -# Fibonacci sequence -function fibonacci ( iterations : int ) -> int { - let result : int = 0 , prev_result : int = 0 - for _ in range ( 0 , 10 ) { - result = 0 - } -} - -let a : int = fibonacci ( 0 ) -let b : int = fibonacci ( 1 ) - -let a_str : str = str ( a ) -let b_str : str = str ( b ) - -io . stdout ( a_str ) -io . stdout ( b_str ) -""" - -from typing import Dict, Tuple, List -from enum import Enum, auto, unique - -from lol_lexer_types import TokenType, Token, CharacterStream -from lol_error import print_token_error - - -class Tokenizer: - def __init__(self, src: str): - self.stream = CharacterStream(src) - self.tokens = [] - - def get_identifier(self, stream: CharacterStream): - # Concatentation to a list is more efficient than to a string, since - # strings are immutable. - c, pos = stream.get_char(), stream.get_pos() - token = [] - while c.isalnum() or c == "_": - token.append(c) - stream.next_char() - c = stream.get_char() - - ident = "".join(token) - if ident == "if": - return Token(*pos, TokenType.IF, ident) - elif ident == "else": - return Token(*pos, TokenType.ELSE, ident) - elif ident == "while": - return Token(*pos, TokenType.WHILE, ident) - elif ident == "function": - return Token(*pos, TokenType.FUNCTION, ident) - elif ident == "return": - return Token(*pos, TokenType.RETURN, ident) - elif ident == "let": - return Token(*pos, TokenType.LET, ident) - elif ident == "namespace": - return Token(*pos, TokenType.NAMESPACE, ident) - else: - return Token(*pos, TokenType.IDENTIFIER, ident) - - def get_dec(self, stream: CharacterStream): - # NOTE(dchu): for now, we assume that the number is a base-10 integer. - c, pos = stream.get_char(), stream.get_pos() - # Concatentation to a list is more efficient than to a string, since - # strings are immutable. - token = [] - while c.isdecimal(): - token.append(c) - stream.next_char() - c = stream.get_char() - return Token(*pos, TokenType.DEC, "".join(token)) - - def get_string(self, stream: CharacterStream): - c, pos = stream.get_char(), stream.get_pos() - # Concatentation to a list is more efficient than to a string, since - # strings are immutable. - - # We need to do one iteration outside the loop, since the first - # character is the same as the stop character in a string. - token = [c] - stream.next_char() - c = stream.get_char() - while c != '"' and c is not None: - token.append(c) - stream.next_char() - c = stream.get_char() - # Add trailing quote - token.append(c) - stream.next_char() - return Token(*pos, TokenType.STRING, "".join(token)) - - def get_comment(self, stream: CharacterStream): - c, pos = stream.get_char(), stream.get_pos() - # Concatentation to a list is more efficient than to a string, since - # strings are immutable. 
- token = [] - while c != "\n" and c is not None: - token.append(c) - stream.next_char() - c = stream.get_char() - return Token(*pos, TokenType.COMMENT, "".join(token)) - - def get_symbol(self, stream: CharacterStream): - c, pos = stream.get_char(), stream.get_pos() - stream.next_char() - if c == "(": - return Token(*pos, TokenType.LPAREN, c) - elif c == ")": - return Token(*pos, TokenType.RPAREN, c) - elif c == "[": - return Token(*pos, TokenType.LSQB, c) - elif c == "]": - return Token(*pos, TokenType.RSQB, c) - elif c == "{": - return Token(*pos, TokenType.LBRACE, c) - elif c == "}": - return Token(*pos, TokenType.RBRACE, c) - elif c == ",": - return Token(*pos, TokenType.COMMA, c) - elif c == ".": - return Token(*pos, TokenType.DOT, c) - elif c == "=": - return Token(*pos, TokenType.EQUAL, c) - elif c == ":": - if stream.get_char() == ":": - stream.next_char() - return Token(*pos, TokenType.COLON_COLON, "::") - return Token(*pos, TokenType.COLON, c) - elif c == ";": - return Token(*pos, TokenType.SEMICOLON, c) - elif c == "+": - return Token(*pos, TokenType.PLUS, c) - elif c == "-": - if stream.get_char() == ">": - stream.next_char() - return Token(*pos, TokenType.ARROW, "->") - return Token(*pos, TokenType.MINUS, c) - else: - raise ValueError(f"character '{c}' not supported!") - - def tokenize(self): - while True: - c = self.stream.get_char() - pos = self.stream.get_pos() - - if c is None: - break - - if c.isspace(): - self.stream.next_char() - elif c.isalpha() or c == "_": - token = self.get_identifier(self.stream) - self.tokens.append(token) - elif c.isdecimal(): - token = self.get_dec(self.stream) - self.tokens.append(token) - elif c == '"': - token = self.get_string(self.stream) - self.tokens.append(token) - elif c == "#": - token = self.get_comment(self.stream) - # TODO(dchu): re-enable this once the AST supports comments. - # Right now, we skip comments. - # self.tokens.append(token) - elif c in "()[]{}.,=:;-+": - # TODO(dchu): '-' does not necessarily imply a punctuation mark. - # It can also be the start of a negative number, e.g. -10.3 - token = self.get_symbol(self.stream) - self.tokens.append(token) - else: - raise ValueError(f"character '{c}' not supported!") - - -def tokenize(text: str) -> List[Token]: - t = Tokenizer(text) - t.tokenize() - return t.tokens - - -if __name__ == "__main__": - tokens = tokenize() - for token in tokens: - print(repr(token)) - for token in tokens: - token.to_yaml() +""" +# Parser + +## Language + +This parser will parse white-space separated tokens. It will output the results +into a text file in the form: + +``` +,,,, +``` + +We use white-space as the delimiter because CSVs are well formed. + +******************************************************************************** + +The accepted tokens are (ASCII): + +1. identifiers : [A-Za-z_][A-Za-z0-9_]* + - Keywords: if, else, while, function, return, let, namespace +2. decimal integers : [1-9][0-9]* +3. strings : ["](\"|[^"])*["] +4. parentheses : "(" or ")" +5. braces : "{" or "}" +6. brackets : "[" or "]" +7. dot : "." +8. comma : "," +9. equals : "=" +10. colon : ":" +11. comments : #.* +12. semicolon : ";" +13. arrow : "->" # N.B. NOT a binary op. This is only used in the context of functions + +******************************************************************************** + +Future tokens to accept in the future are: + +1. More types of numbers (binary, octal, hexadecimal, floats, scientific notation) +2. 
Differentiate keywords from identifiers (not really necessary, we can do this later) +3. Add support for other operators (!, %, ^, &, *, +, -, ==, <, >, <=, >=, ->, |, ~, /, //) +4. Add escaping for strings ("\n\t\\\v\a\b") +5. Add support for single quote strings? +6. Add multiline strings ('''multiline string''') +7. Add multiline comments +""" + +lexeme = """ +# My first program! +print("Hello, World!") +""" + +medium_text = """ +io = import("io") + +io.stdout("Hello, World!") + +function sum3(a: int, b: int, c: int): int { + return a.add(b).add(c) +} + +let a: int = 0 +let b: int = 1 +let c: int = 2 +let d: int = sum3(a, b, c) +io.stdout("Answer: ", d) +""" + +advanced_text = """ +io = import ( "io" ); + +# Fibonacci sequence +function fibonacci ( iterations : int ) -> int { + let result : int = 0 , prev_result : int = 0 + for _ in range ( 0 , 10 ) { + result = 0 + } +} + +let a : int = fibonacci ( 0 ) +let b : int = fibonacci ( 1 ) + +let a_str : str = str ( a ) +let b_str : str = str ( b ) + +io . stdout ( a_str ) +io . stdout ( b_str ) +""" + +from typing import List + +from lexer.lol_lexer_types import TokenType, Token, CharacterStream + + +class Tokenizer: + def __init__(self, src: str): + self.stream = CharacterStream(src) + self.tokens = [] + + def get_identifier(self, stream: CharacterStream): + # Concatentation to a list is more efficient than to a string, since + # strings are immutable. + c, pos = stream.get_char(), stream.get_pos() + token = [] + while c.isalnum() or c == "_": + token.append(c) + stream.next_char() + c = stream.get_char() + + ident = "".join(token) + if ident == "if": + return Token(*pos, TokenType.IF, ident) + elif ident == "else": + return Token(*pos, TokenType.ELSE, ident) + elif ident == "while": + return Token(*pos, TokenType.WHILE, ident) + elif ident == "function": + return Token(*pos, TokenType.FUNCTION, ident) + elif ident == "return": + return Token(*pos, TokenType.RETURN, ident) + elif ident == "let": + return Token(*pos, TokenType.LET, ident) + elif ident == "namespace": + return Token(*pos, TokenType.NAMESPACE, ident) + elif ident == "module": + return Token(*pos, TokenType.MODULE, ident) + elif ident == "import": + return Token(*pos, TokenType.IMPORT, ident) + else: + return Token(*pos, TokenType.IDENTIFIER, ident) + + def get_dec(self, stream: CharacterStream): + # NOTE(dchu): for now, we assume that the number is a base-10 integer. + c, pos = stream.get_char(), stream.get_pos() + # Concatentation to a list is more efficient than to a string, since + # strings are immutable. + token = [] + while c.isdecimal(): + token.append(c) + stream.next_char() + c = stream.get_char() + return Token(*pos, TokenType.DEC, "".join(token)) + + def get_string(self, stream: CharacterStream): + c, pos = stream.get_char(), stream.get_pos() + # Concatentation to a list is more efficient than to a string, since + # strings are immutable. + + # We need to do one iteration outside the loop, since the first + # character is the same as the stop character in a string. + token = [c] + stream.next_char() + c = stream.get_char() + while c != '"' and c is not None: + token.append(c) + stream.next_char() + c = stream.get_char() + # Add trailing quote + token.append(c) + stream.next_char() + return Token(*pos, TokenType.STRING, "".join(token)) + + def get_comment(self, stream: CharacterStream): + c, pos = stream.get_char(), stream.get_pos() + # Concatentation to a list is more efficient than to a string, since + # strings are immutable. 
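        # Note: the comment lexeme runs up to (but does not include) the
        # end-of-line; the newline is left in the stream and is later
        # skipped as whitespace by tokenize().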
+ token = [] + while c != "\n" and c is not None: + token.append(c) + stream.next_char() + c = stream.get_char() + return Token(*pos, TokenType.COMMENT, "".join(token)) + + def get_symbol(self, stream: CharacterStream): + c, pos = stream.get_char(), stream.get_pos() + stream.next_char() + if c == "(": + return Token(*pos, TokenType.LPAREN, c) + elif c == ")": + return Token(*pos, TokenType.RPAREN, c) + elif c == "[": + return Token(*pos, TokenType.LSQB, c) + elif c == "]": + return Token(*pos, TokenType.RSQB, c) + elif c == "{": + return Token(*pos, TokenType.LBRACE, c) + elif c == "}": + return Token(*pos, TokenType.RBRACE, c) + elif c == ",": + return Token(*pos, TokenType.COMMA, c) + elif c == ".": + return Token(*pos, TokenType.DOT, c) + elif c == "=": + return Token(*pos, TokenType.EQUAL, c) + elif c == ":": + if stream.get_char() == ":": + stream.next_char() + return Token(*pos, TokenType.COLON_COLON, "::") + return Token(*pos, TokenType.COLON, c) + elif c == ";": + return Token(*pos, TokenType.SEMICOLON, c) + elif c == "+": + return Token(*pos, TokenType.PLUS, c) + elif c == "-": + if stream.get_char() == ">": + stream.next_char() + return Token(*pos, TokenType.ARROW, "->") + return Token(*pos, TokenType.MINUS, c) + else: + raise ValueError(f"character '{c}' not supported!") + + def tokenize(self): + while True: + c = self.stream.get_char() + pos = self.stream.get_pos() + + if c is None: + break + + if c.isspace(): + self.stream.next_char() + elif c.isalpha() or c == "_": + token = self.get_identifier(self.stream) + self.tokens.append(token) + elif c.isdecimal(): + token = self.get_dec(self.stream) + self.tokens.append(token) + elif c == '"': + token = self.get_string(self.stream) + self.tokens.append(token) + elif c == "#": + token = self.get_comment(self.stream) + # TODO(dchu): re-enable this once the AST supports comments. + # Right now, we skip comments. + # self.tokens.append(token) + elif c in "()[]{}.,=:;-+": + # TODO(dchu): '-' does not necessarily imply a punctuation mark. + # It can also be the start of a negative number, e.g. -10.3 + token = self.get_symbol(self.stream) + self.tokens.append(token) + else: + raise ValueError(f"character '{c}' not supported!") + + +def tokenize(text: str) -> List[Token]: + t = Tokenizer(text) + t.tokenize() + return t.tokens diff --git a/prototype/lol_lexer_types.py b/prototype/lexer/lol_lexer_types.py similarity index 74% rename from prototype/lol_lexer_types.py rename to prototype/lexer/lol_lexer_types.py index 0921aa2..d320a66 100644 --- a/prototype/lol_lexer_types.py +++ b/prototype/lexer/lol_lexer_types.py @@ -1,142 +1,157 @@ -from typing import Dict, Tuple, List -from enum import Enum, auto, unique - - -@unique -class TokenType(Enum): - LPAREN = auto() # ( - RPAREN = auto() # ) - LSQB = auto() # [ - RSQB = auto() # ] - LBRACE = auto() # { - RBRACE = auto() # } - - DOT = auto() # . - COMMA = auto() # , - EQUAL = auto() # = - COLON = auto() # : - SEMICOLON = auto() # ; - - ARROW = auto() # -> - - # Unimplemented in tokenizer - COLON_COLON = auto() # :: - EXCLAMATION = auto() # ! - AT = auto() # @ - PERCENT = auto() # % - CIRCUMFLEX = auto() # ^ - AMPERSAND = auto() # & - STAR = auto() # * - PLUS = auto() # + - MINUS = auto() # - - SLASH = auto() # / - - RSHIFT = auto() # >> - LSHIFT = auto() # << - QUESTION = auto() # ? 
- VBAR = auto() # | - BSLASH = auto() # \ - - EQUAL_EQUAL = auto() # == - NOT_EQUAL = auto() # != - GREATER = auto() # > - LESSER = auto() # < - GREATER_EQUAL = auto() # >= - LESSER_EQUAL = auto() # <= - - # Unimplemented in tokenizer (no plan to implement these yet) - STAR_STAR = auto() # ** - PLUS_PLUS = auto() # ++ - MINUS_MINUS = auto() # -- - SLASH_SLASH = auto() # // - - # DEPRECATED - NEWLINE = auto() # \n - - # Multicharacter conglomerates - IDENTIFIER = auto() # [A-Za-z_][A-Za-z_0-9] - STRING = auto() # "[^"\n]*" - DEC = auto() # [1-9][0-9]* - COMMENT = auto() # #.* - - # Keywords - IF = auto() - ELSE = auto() - WHILE = auto() - FUNCTION = auto() - RETURN = auto() - LET = auto() - NAMESPACE = auto() - - -class Token: - def __init__( - self, - idx: int, - line_number: int, - column_number: int, - token_type: TokenType, - lexeme: str, - ): - self.idx = idx - self.line_number = line_number - self.column_number = column_number - self.token_type = token_type - self.lexeme = lexeme - - def type(self): - return self.token_type - - def type_name(self): - return self.token_type.name - - def __str__(self): - """Serialize the token.""" - return f"{self.idx},{self.line_number},{self.column_number},{self.token_type.value},{self.lexeme}" - - def __repr__(self): - """Pretty print the token. This is NOT for serialization, because the - token type should be an integer id so that it's easier to parse.""" - return f"{self.idx},{self.line_number},{self.column_number},{self.token_type},{self.lexeme}" - - def to_yaml(self): - print(f"- Token: {self.token_type}") - print(f" Position: {self.idx}") - print(f" LineNumber: {self.line_number}") - print(f" ColumnNumber: {self.column_number}") - print(f" Length: {len(self.lexeme)}") - print(f" Lexeme: {self.lexeme}") - - -class CharacterStream: - def __init__(self, text: str): - self.text = text - self.idx = 0 - self.line_number = 1 - self.column_number = 1 - - def get_text(self) -> str: - return self.text - - def get_char(self) -> str: - """Get the current character or return None""" - if self.idx >= len(self.text): - return None - return self.text[self.idx] - - def next_char(self): - """Advance to the next character or return early if we are at the last character.""" - c = self.get_char() - if c is None: - return - self.idx += 1 - if c == "\n": - self.line_number += 1 - self.column_number = 1 - else: - self.column_number += 1 - - def get_pos(self) -> Tuple[int, int, int]: - """Get the current character position in a (absolute_index, line_number, - column_number) tuple""" - return (self.idx, self.line_number, self.column_number) +from typing import Dict, Tuple, Union, Optional +from enum import Enum, auto, unique + + +@unique +class TokenType(Enum): + LPAREN = auto() # ( + RPAREN = auto() # ) + LSQB = auto() # [ + RSQB = auto() # ] + LBRACE = auto() # { + RBRACE = auto() # } + + DOT = auto() # . + COMMA = auto() # , + EQUAL = auto() # = + COLON = auto() # : + SEMICOLON = auto() # ; + + ARROW = auto() # -> + + # Unimplemented in tokenizer + COLON_COLON = auto() # :: + EXCLAMATION = auto() # ! + AT = auto() # @ + PERCENT = auto() # % + CIRCUMFLEX = auto() # ^ + AMPERSAND = auto() # & + STAR = auto() # * + PLUS = auto() # + + MINUS = auto() # - + SLASH = auto() # / + + RSHIFT = auto() # >> + LSHIFT = auto() # << + QUESTION = auto() # ? 
+ VBAR = auto() # | + BSLASH = auto() # \ + + EQUAL_EQUAL = auto() # == + NOT_EQUAL = auto() # != + GREATER = auto() # > + LESSER = auto() # < + GREATER_EQUAL = auto() # >= + LESSER_EQUAL = auto() # <= + + # Unimplemented in tokenizer (no plan to implement these yet) + STAR_STAR = auto() # ** + PLUS_PLUS = auto() # ++ + MINUS_MINUS = auto() # -- + SLASH_SLASH = auto() # // + + # DEPRECATED + NEWLINE = auto() # \n + + # Multicharacter conglomerates + IDENTIFIER = auto() # [A-Za-z_][A-Za-z_0-9] + STRING = auto() # "[^"\n]*" + DEC = auto() # [1-9][0-9]* + COMMENT = auto() # #.* + + # Keywords + IF = auto() + ELSE = auto() + WHILE = auto() + FUNCTION = auto() + RETURN = auto() + LET = auto() + NAMESPACE = auto() + MODULE = auto() + IMPORT = auto() + PRINT = auto() + + +class Token: + def __init__( + self, + idx: int, + line_number: int, + column_number: int, + token_type: TokenType, + lexeme: str, + ): + self.idx = idx + self.line_number = line_number + self.column_number = column_number + self.token_type = token_type + self.lexeme = lexeme + + def is_type(self, token_type: TokenType) -> bool: + return self.token_type == token_type + + def as_str(self): + return self.lexeme + + def type(self): + return self.token_type + + def type_name(self): + return self.token_type.name + + def __repr__(self): + """Pretty print the token. This is NOT for serialization, because the + token type should be an integer id so that it's easier to parse.""" + return ( + f"`{self.lexeme}`: {self.token_type.name} at position {self.idx} " + f"(line {self.line_number}:col {self.column_number})" + ) + + def to_dict(self) -> Dict[str, Union[TokenType, int, str]]: + """ + Pretty print the serialized token. + + To make this purely functional, we would print the token type ID, + the start position, and the lexeme. 
Everything else is superfluous.""" + return dict( + TokenType=self.token_type.name, + StartPosition=self.idx, + LineNumber=self.line_number, + ColumnNumber=self.column_number, + LexemeLength=len(self.lexeme), + Lexeme=self.lexeme, + ) + + +class CharacterStream: + def __init__(self, text: str): + self.text = text + self.idx = 0 + self.line_number = 1 + self.column_number = 1 + + def get_text(self) -> str: + return self.text + + def get_char(self) -> Optional[str]: + """Get the current character or return None""" + if self.idx >= len(self.text): + return None + return self.text[self.idx] + + def next_char(self): + """Advance to the next character or return early if we are at the last character.""" + c = self.get_char() + if c is None: + return + self.idx += 1 + if c == "\n": + self.line_number += 1 + self.column_number = 1 + else: + self.column_number += 1 + + def get_pos(self) -> int: + """Get the current character position in a (absolute_index, line_number, + column_number) tuple""" + return (self.idx, self.line_number, self.column_number) diff --git a/prototype/lol.py b/prototype/lol.py new file mode 100644 index 0000000..21545f7 --- /dev/null +++ b/prototype/lol.py @@ -0,0 +1,133 @@ +import argparse +import json +import os +import time +from typing import Any, Dict, List + +from prototype.lexer.lol_lexer_types import Token +from prototype.parser.lol_parser_token_stream import TokenStream +from prototype.parser.lol_parser_types import ASTNode + +from prototype.lexer.lol_lexer import tokenize +from prototype.parser.lol_parser import parse +from prototype.analyzer.new_lol_analyzer import analyze +from prototype.emitter.lol_emitter import emit_c + + +class LolSymbol: + def __init__(self): + self.type: Any = None + self.definition: Any = None + + def to_dict(self) -> Dict[str, Any]: + return {"type": self.type, } + + +class LolModule: + def __init__(self): + # Metadata + self.init_timestamp = time.time() + + self.text: str = "" + self.tokens: List[Token] = [] + self.ast: List[ASTNode] = [] + self.symbol_table: Dict[str, LolSymbol] = {} + + def read_file(self, file_name: str): + assert isinstance(file_name, str) + with open(file_name) as f: + self.text = f.read() + + ############################################################################ + ### LEXER + ############################################################################ + + def run_lexer(self): + assert self.text != "", "LolModule" + assert self.tokens == [] + + self.tokens = tokenize(self.text) + + def save_lexer_output_only(self, output_dir: str): + file_name: str = f"{output_dir}/{self.init_timestamp}-lexer-output-only.json" + with open(file_name, "w") as f: + json.dump({"lexer-output": [x.to_dict() for x in self.tokens]}, f, indent=4) + + ############################################################################ + ### PARSER + ############################################################################ + + def run_parser(self): + assert self.tokens != [] + + stream = TokenStream(self.tokens, self.text) + self.ast = parse(stream) + + def save_parser_output_only(self, output_dir: str): + file_name: str = f"{output_dir}/{self.init_timestamp}-parser-output-only.json" + with open(file_name, "w") as f: + json.dump({"parser-output": [x.to_dict() for x in self.ast]}, f, indent=4) + + ############################################################################ + ### ANALYZER + ############################################################################ + + def run_analyzer(self): + self.symbol_table = analyze(self.ast, self.text) + + 
def save_analyzer_output_only(self, output_dir: str): + file_name: str = f"{output_dir}/{self.init_timestamp}-analyzer-output-only.json" + with open(file_name, "w") as f: + json.dump({"analyzer-output": {x: y.to_dict() for x, y in self.symbol_table.module_symbol_table.items()}}, f, indent=4) + + ############################################################################ + ### EMITTER + ############################################################################ + + def run_emitter(self): + # TODO: Make this in the __init__function + self.code = emit_c(self.symbol_table) + + def save_emitter_output_only(self, output_dir: str): + file_name: str = f"{output_dir}/{self.init_timestamp}-emitter-output-only.json" + with open(file_name, "w") as f: + json.dump({"emitter-output": self.code}, f, indent=4) + + + +def main() -> None: + parser = argparse.ArgumentParser() + # TODO(dchu): make this accept multiple file names or folders. Also accept + # a full configuration file. + parser.add_argument( + "-i", "--input", type=str, required=True, help="Input file name" + ) + parser.add_argument( + "-o", "--output", type=str, default=None, help="Output directory name" + ) + args = parser.parse_args() + + # I explicitly extract the names because otherwise one may be tempted to + # pass the 'args' namespace, which is always confusing. + input_file = args.input + output_dir = args.output + + module = LolModule() + # Assume input_file is not None because it is required + module.read_file(input_file) + # Make empty output dir if it doesn't exist + if not os.path.exists(output_dir): + os.mkdir(output_dir) + + module.run_lexer() + module.save_lexer_output_only(output_dir) + module.run_parser() + module.save_parser_output_only(output_dir) + module.run_analyzer() + module.save_analyzer_output_only(output_dir) + module.run_emitter() + module.save_emitter_output_only(output_dir) + + +if __name__ == "__main__": + main() diff --git a/prototype/lol_ast_types.py b/prototype/lol_ast_types.py deleted file mode 100644 index 2b9c79a..0000000 --- a/prototype/lol_ast_types.py +++ /dev/null @@ -1,160 +0,0 @@ -from abc import ABCMeta, abstractmethod -from typing import List - -from lol_lexer import Token - -# Might have to be an Abstract Base Class? 
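For orientation, the driver in `prototype/lol.py` above has to run its stages in this order: `run_lexer` and `run_parser` assert that the previous stage's output is present, and the later stages consume it (text, tokens, AST, analysis module). A minimal programmatic sketch of the same pipeline; the input path is only an example and it assumes the `prototype` package is importable:

```
from prototype.lol import LolModule

module = LolModule()
module.read_file("prototype/examples/helloworld.lol")  # example input path
module.run_lexer()     # text -> List[Token]
module.run_parser()    # List[Token] -> List[ASTNode]
module.run_analyzer()  # AST -> analysis module (symbol table + IR)
module.run_emitter()   # analysis module -> C source
print(module.code)     # run_emitter() stores the generated C in module.code
```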
-class ASTNode(metaclass=ABCMeta): - @abstractmethod - def to_dict(self): - raise NotImplementedError("abc") - - def __repr__(self) -> str: - return repr(self.to_dict()) - - -class ASTLeaf(ASTNode, metaclass=ABCMeta): - def __init__(self, token: Token): - self.token = token - - -class LiteralLeaf(ASTLeaf, metaclass=ABCMeta): - def __init__(self, token: Token): - super().__init__(token) - self.value = None - - -class DecimalLeaf(LiteralLeaf): - def __init__(self, token: Token): - super().__init__(token) - self.value = int(token.lexeme) - - def to_dict(self): - return dict(type=__class__.__name__, value=self.value) - - -class StringLeaf(LiteralLeaf): - def __init__(self, token: Token): - super().__init__(token) - self.value = token.lexeme[1:-1] # String surrounding quotations - - def to_dict(self): - return dict(type=self.__class__.__name__, value=self.value) - - -class IdentifierLeaf(ASTLeaf): - def __init__(self, token: Token): - super().__init__(token) - - def to_dict(self): - return dict(type=self.__class__.__name__, identifier=self.token.lexeme) - - -class OperatorLeaf(ASTLeaf): - """An operator, such as '+'.""" - - def __init__(self, token: Token): - super().__init__(token) - - def to_dict(self): - return dict(type=self.__class__.__name__, op=self.token.lexeme) - - -class BinOpNode(ASTNode): - """A binary op with a lhs and rhs.""" - - def __init__(self, op: OperatorLeaf, lhs: ASTNode, rhs: ASTNode): - self.op = op - self.lhs = lhs - self.rhs = rhs - - def to_dict(self): - return dict( - type=self.__class__.__name__, - op=self.op.to_dict(), - lhs=self.lhs.to_dict(), - rhs=self.rhs.to_dict(), - ) - - -class FunctionPrototypeNode(ASTNode): - def __init__( - self, - name: IdentifierLeaf, - parameters: List[ASTNode], - return_type: ASTNode, - ): - self.name = name - self.parameters = parameters - self.return_type = return_type - - def to_dict(self): - return dict( - type=self.__class__.__name__, - name=self.name.to_dict(), - parameters=[p.to_dict() for p in self.parameters], - return_type=self.return_type.to_dict(), - ) - - -class FunctionDefNode(ASTNode): - def __init__(self, prototype: FunctionPrototypeNode, body: List[ASTNode]): - self.prototype = prototype - self.body = body - - def to_dict(self): - return dict( - type=self.__class__.__name__, - prototype=self.prototype.to_dict(), - body=[b.to_dict() for b in self.body], - ) - - -class FunctionCallNode(ASTNode): - # Include generics in function call? 
- def __init__(self, name: IdentifierLeaf, arguments: List[ASTNode]): - self.name = name - self.arguments = arguments - - def to_dict(self): - return dict( - type=self.__class__.__name__, - name=self.name.to_dict(), - arguments=[a.to_dict() for a in self.arguments], - ) - - -class LetNode(ASTNode): - def __init__(self, expression: ASTNode) -> None: - super().__init__() - self.expression = expression - - def to_dict(self): - return dict( - type=self.__class__.__name__, - expression=self.expression.to_dict(), - ) - - -class ReturnNode(ASTNode): - def __init__(self, expression: ASTNode) -> None: - super().__init__() - self.expression = expression - - def to_dict(self): - return dict( - type=self.__class__.__name__, - expression=self.expression.to_dict(), - ) - - -class NamespaceNode(ASTNode): - def __init__(self, expression: ASTNode) -> None: - super().__init__() - self.expression = expression - - def to_dict(self): - return dict( - type=self.__class__.__name__, - expression=self.expression.to_dict(), - ) diff --git a/prototype/lol_parser.py b/prototype/lol_parser.py deleted file mode 100644 index 3eb5886..0000000 --- a/prototype/lol_parser.py +++ /dev/null @@ -1,314 +0,0 @@ -""" -Sources -------- - -I used the LLVM Kaleidoscope tutorial extensively. - -TODO ----- - -1. Differentiate between nodes and leaves - - Leaf: literal (decimal, string, variable, op) -2. Handle errors instead of `assert` -""" -from typing import List, Union - -from lol_lexer import Token, TokenType, tokenize -from lol_parser_helper import TokenStream -from lol_ast_types import ( - DecimalLeaf, - StringLeaf, - IdentifierLeaf, - OperatorLeaf, - BinOpNode, - FunctionPrototypeNode, - FunctionDefNode, - FunctionCallNode, - LetNode, - ReturnNode, - NamespaceNode, - # Abstract Types - LiteralLeaf, - ASTNode, -) -from lol_error import print_parser_error - - -def eat_token(stream: TokenStream, expected_type: TokenType) -> Token: - token = stream.get_token() - if token.type() != expected_type: - error_msg = f"expected {expected_type.name}, got {token.type_name()}" - print_parser_error(stream, error_msg) - raise ValueError(error_msg) - stream.next_token() - return token - - -def parse_literal(stream: TokenStream) -> LiteralLeaf: - token = stream.get_token() - if token.type() == TokenType.STRING: - stream.next_token() - return StringLeaf(token) - elif token.type() == TokenType.DEC: - stream.next_token() - return DecimalLeaf(token) - else: - raise ValueError(f"unexpected token type: {repr(token)}") - - -def parse_paren(stream: TokenStream) -> ASTNode: - eat_token(stream, TokenType.LPAREN) - ret = parse_expr(stream) - eat_token(stream, TokenType.RPAREN) - return ret - - -def parse_identifier( - stream: TokenStream, -) -> Union[IdentifierLeaf, FunctionCallNode]: - """Parse both variables and function calls. - - This is due to the semantics we do not know whether the identifier will be a - variable name or a function call. - - In the future, it may be an array thing too array[100].""" - token = eat_token(stream, TokenType.IDENTIFIER) - - if stream.get_token().type() != TokenType.LPAREN: - return IdentifierLeaf(token) - - # Call - eat_token(stream, TokenType.LPAREN) - args = [] - # TODO(dchu): fix the bug where this will exceed the array limits of the - # token stream if it doesn't find anything. 
- while stream.get_token().type() != TokenType.RPAREN: - args.append(parse_expr(stream)) - if stream.get_token().type() == TokenType.RPAREN: - break - elif stream.get_token().type() == TokenType.COMMA: - continue - else: - print_parser_error( - stream, - error_msg=f"Expected COMMA or RPAREN, got {stream.get_token().type()}", - ) - raise ValueError("Expected COMMA or RPAREN") - eat_token(stream, TokenType.RPAREN) - return FunctionCallNode(IdentifierLeaf(token), args) - - -# TODO(dchu): figure out why this is called "primary" -def parse_primary(stream: TokenStream) -> ASTNode: - """Helper functions for parsing identifiers, literals, and parenthetic expressions.""" - token = stream.get_token() - if token.type() == TokenType.IDENTIFIER: - return parse_identifier(stream) - elif token.type() in {TokenType.DEC, TokenType.STRING}: - return parse_literal(stream) - elif token.type() == TokenType.LPAREN: - return parse_paren(stream) - else: - error_msg = f"unrecognized primary {token}" - print_parser_error(stream, error_msg) - raise ValueError(error_msg) - - -# TODO(dchu): refactor this to make it smarter. Also move the hard-coded -# precedence somewhere smarter. -def get_binop_precedence(op: Token) -> int: - precedence = { - # The '::' operator should always be on the left of any '.' operators, - # so it has precedence due to left-associativity anyways. - TokenType.COLON_COLON: 1500, # Highest - TokenType.DOT: 1400, - TokenType.ARROW: 1400, - # Prefix operators have precedence of 1300 - TokenType.STAR: 1200, - TokenType.SLASH: 1200, # TODO(dchu): Is this real divide? - TokenType.SLASH_SLASH: 1200, # Not in C - TokenType.PERCENT: 1200, # TODO(dchu): Is this same semantics as in C? - TokenType.PLUS: 1100, - TokenType.MINUS: 1100, - TokenType.LSHIFT: 1000, - TokenType.RSHIFT: 1000, - TokenType.AMPERSAND: 900, # In C, this is lower than comparison ops - TokenType.CIRCUMFLEX: 800, # In C, this is lower than comparison ops - TokenType.VBAR: 700, # In C, this is lower than comparison ops - TokenType.COLON: 600, # Not in C - TokenType.LESSER: 500, - TokenType.LESSER_EQUAL: 500, - TokenType.GREATER: 500, - TokenType.GREATER_EQUAL: 500, - TokenType.EQUAL_EQUAL: 500, # In C, this is lower than other comparison ops - TokenType.NOT_EQUAL: 500, # In C, this is lower than other comparison ops - # The '&&'/'and' operator is 400 - # The '||'/'or' operator is 300 - TokenType.EQUAL: 200, - TokenType.COMMA: 100, # Weakest - } - return precedence.get(op.type(), -1) - - -def parse_expr(stream: TokenStream) -> ASTNode: - lhs = parse_primary(stream) - assert lhs is not None - return parse_binop_rhs(stream, 0, lhs) - - -def parse_binop_rhs( - stream: TokenStream, min_expression_precedence: int, lhs: ASTNode -) -> ASTNode: - """ - Inputs - ------ - - * min_expression_precedence: int - min operator precedence that function is - allowed to eat. - """ - while True: - binop_token = stream.get_token() - binop_token_precedence = get_binop_precedence(binop_token) - - # Exit if the token has a lower precedence than what we're allowed to - # consume. This could be for a variety of reasons: if we pass an invalid - # binop, if the token is None (representing the end of the stream), or - # if it is a binop with too low precedence. - if binop_token_precedence < min_expression_precedence: - return lhs - - stream.next_token() - rhs = parse_primary(stream) - assert rhs - - # TODO(dchu): I have no idea what this is actually doing. 
I just copied - # it from https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.html - token = stream.get_token() - next_prec = get_binop_precedence(token) - if binop_token_precedence < next_prec: - rhs = parse_binop_rhs(stream, binop_token_precedence + 1, rhs) - assert rhs - - binop = OperatorLeaf(binop_token) - lhs = BinOpNode(binop, lhs, rhs) - - -def parse_function_prototype(stream: TokenStream) -> FunctionPrototypeNode: - """Parse function definition.""" - eat_token(stream, TokenType.FUNCTION) - - # TODO(dchu): Optional for anonymous functions. - name = eat_token(stream, TokenType.IDENTIFIER) - name = IdentifierLeaf(name) - - # TODO(dchu): add generics parsing here! - pass - - # Parameters - eat_token(stream, TokenType.LPAREN) - args = [] - while True: - token = stream.get_token() - if token.type() == TokenType.IDENTIFIER: - # NOTE: we explicitly accept only an identifier. Refactor this out - # because we'll use this in the 'let' statements and 'namespace' - # statements too! - lhs = IdentifierLeaf(token) - stream.next_token() - comma_precedence = get_binop_precedence( - Token(-1, -1, -1, TokenType.COMMA, ",") - ) - expr = parse_binop_rhs(stream, comma_precedence + 1, lhs) - args.append(expr) - elif token.type() == TokenType.COMMA: - eat_token(stream, TokenType.COMMA) - continue - elif token.type() == TokenType.RPAREN: - eat_token(stream, TokenType.RPAREN) - break - else: - print_parser_error( - stream.get_text(), stream.get_token(), error_msg=f"error!" - ) - raise ValueError(f"unexpected token type: {repr(token)}") - - # Return type - eat_token(stream, TokenType.ARROW) - # TODO(dchu): enable compound types, e.g. Int[0:255] - ret = eat_token(stream, TokenType.IDENTIFIER) - ret = IdentifierLeaf(ret) - return FunctionPrototypeNode(name, args, ret) - - -def parse_function_def(stream: TokenStream) -> FunctionDefNode: - proto = parse_function_prototype(stream) - eat_token(stream, TokenType.LBRACE) - - # Parse body - # TODO(dchu): figure out what to do here. 
https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.html - body = [] - while True: - if stream.get_token().type() == TokenType.RBRACE: - break - body.append(parse_statement(stream)) - eat_token(stream, TokenType.RBRACE) - - return FunctionDefNode(proto, body) - - -def parse_let(stream: TokenStream) -> ASTNode: - """Parse 'let' statement outside of a function.""" - eat_token(stream, TokenType.LET) - return LetNode(parse_statement(stream)) - - -def parse_namespace(stream: TokenStream) -> ASTNode: - """Parse 'namespace' statement outside of a function.""" - eat_token(stream, TokenType.NAMESPACE) - return NamespaceNode(parse_statement(stream)) - - -def parse_statement(stream: TokenStream) -> ASTNode: - token = stream.get_token() - if token.type() == TokenType.LET: - eat_token(stream, TokenType.LET) - result = parse_expr(stream) - eat_token(stream, TokenType.SEMICOLON) - return LetNode(result) - elif token.type() == TokenType.RETURN: - eat_token(stream, TokenType.RETURN) - result = parse_expr(stream) - eat_token(stream, TokenType.SEMICOLON) - return ReturnNode(result) - # TODO(dchu): if, while, for loops - else: - result = parse_expr(stream) - eat_token(stream, TokenType.SEMICOLON) - return result - - -def parse(stream: TokenStream) -> List[ASTNode]: - result = [] - while stream.get_token() is not None: - token = stream.get_token() - if token.type() == TokenType.FUNCTION: - result.append(parse_function_def(stream)) - elif token.type() == TokenType.NAMESPACE: - result.append(parse_namespace(stream)) - elif token.type() == TokenType.LET: - result.append(parse_let(stream)) - else: - raise ValueError(f"Unexpected token: {token}") - return result - - -if __name__ == "__main__": - with open("prototype/examples/fibonacci.lol") as f: - text = f.read() - tokens = tokenize(text=text) - stream = TokenStream(tokens, text=text) - ast = parse(stream) - import json - - for a in ast: - print(json.dumps(a.to_dict(), indent=4)) diff --git a/prototype/parser/__init__.py b/prototype/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/prototype/parser/lol_parser.py b/prototype/parser/lol_parser.py new file mode 100644 index 0000000..5632aff --- /dev/null +++ b/prototype/parser/lol_parser.py @@ -0,0 +1,421 @@ +""" +Sources +------- + +I used the LLVM Kaleidoscope tutorial extensively. + +TODO +---- + +1. Differentiate between nodes and leaves + - Leaf: literal (decimal, string, variable, op) +2. 
Handle errors instead of `assert` +""" +from typing import List, Set, Union + +from prototype.lexer.lol_lexer import Token, TokenType +from prototype.parser.lol_parser_token_stream import TokenStream +from prototype.parser.lol_parser_types import ( + OperatorType, + # AST Nodes + ASTNode, + # Literals and Identifiers + Literal, # Abstract data type + DecimalLiteral, + StringLiteral, + OperatorLiteral, + Identifier, + # Value Expressions + Expression, # Abstract data type + ValueExpression, + OperatorValueExpression, + # Type Expressions + TypeExpression, + OperatorTypeExpression, + # Functions + FunctionNode, + FunctionDefinitionNode, + FunctionCallNode, + # Variables + VariableDefinitionNode, + VariableModificationNode, + VariableCallNode, + # Imports + ImportModuleNode, + # Inner Function Expressions + ReturnNode, +) +from prototype.error.lol_error import print_parser_error + + +LITERAL_TOKENS: Set[TokenType] = {TokenType.DEC, TokenType.STRING} +FUNCTION_STATEMENTS = Union[ + ValueExpression, + TypeExpression, # Only if we allow isolated type statements in + # functions + VariableDefinitionNode, + VariableModificationNode, + ReturnNode, + FunctionDefinitionNode, # Only if we allow function defintions within + # other functions + ImportModuleNode, # Only if we allow module imports within functions +] + + +################################################################################ +### HELPER FUNCTIONS +################################################################################ +def eat_token(stream: TokenStream, expected_type: TokenType) -> Token: + token = stream.get_token() + if token.type() != expected_type: + error_msg = f"expected {expected_type.name}, got {token.type_name()}" + print_parser_error(stream, error_msg) + raise ValueError(error_msg) + stream.next_token() + return token + + +# TODO(dchu): refactor this to make it smarter. Also move the hard-coded +# precedence somewhere smarter. +def get_binop_precedence(op: Token) -> int: + """Get the precedence of a binary operator.""" + precedence = { + # The '::' operator should always be on the left of any '.' operators, + # so it has precedence due to left-associativity anyways. + TokenType.COLON_COLON: 1500, # Highest + TokenType.DOT: 1400, + TokenType.ARROW: 1400, + # Prefix operators have precedence of 1300 + TokenType.STAR: 1200, + TokenType.SLASH: 1200, # TODO(dchu): Is this real divide? + TokenType.SLASH_SLASH: 1200, # Not in C + TokenType.PERCENT: 1200, # TODO(dchu): Is this same semantics as in C? + TokenType.PLUS: 1100, + TokenType.MINUS: 1100, + TokenType.LSHIFT: 1000, + TokenType.RSHIFT: 1000, + TokenType.AMPERSAND: 900, # In C, this is lower than comparison ops + TokenType.CIRCUMFLEX: 800, # In C, this is lower than comparison ops + TokenType.VBAR: 700, # In C, this is lower than comparison ops + TokenType.COLON: 600, # Not in C + TokenType.LESSER: 500, + TokenType.LESSER_EQUAL: 500, + TokenType.GREATER: 500, + TokenType.GREATER_EQUAL: 500, + TokenType.EQUAL_EQUAL: 500, # In C, this is lower than other comparison ops + TokenType.NOT_EQUAL: 500, # In C, this is lower than other comparison ops + # The '&&'/'and' operator is 400 + # The '||'/'or' operator is 300 + # NOTE(dchu): I remove the ability to parse the '=' and ',' as operators since this would be confusing! 
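        # Because '=' and ',' are omitted from this table,
        # get_binop_precedence() falls through to its default of -1 for them,
        # and parse_binop_rhs() stops consuming (it returns lhs whenever the
        # precedence is below min_expression_precedence), so both tokens act
        # as expression terminators rather than operators.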
+ # TokenType.EQUAL: 200, + # TokenType.COMMA: 100, # Weakest + } + return precedence.get(op.type(), -1) + + +################################################################################ +### PARSE VALUE EXPRESSIONS +################################################################################ +def parse_literal(stream: TokenStream) -> Literal: + token = stream.get_token() + if token.is_type(TokenType.STRING): + stream.next_token() + return StringLiteral(token) + elif token.is_type(TokenType.DEC): + stream.next_token() + return DecimalLiteral(token) + else: + raise ValueError(f"unexpected token type: {repr(token)}") + + +def parse_parenthetic_expression(stream: TokenStream) -> ValueExpression: + eat_token(stream, TokenType.LPAREN) # Eat '(' + ret = parse_value_expression(stream) + eat_token(stream, TokenType.RPAREN) # Eat ')' + return ret + + +def parse_func_call_args( + stream: TokenStream, identifier_leaf: Identifier +) -> FunctionCallNode: + eat_token(stream, TokenType.LPAREN) + args = [] + token = stream.get_token() + # Check if empty set of arguments + if token.is_type(TokenType.RPAREN): + eat_token(stream, TokenType.RPAREN) + return FunctionCallNode(identifier_leaf, []) + # At this point, we have at least one argument (or error) + while True: + expr = parse_value_expression(stream) + args.append(expr) + token = stream.get_token() + if token.is_type(TokenType.RPAREN): + eat_token(stream, TokenType.RPAREN) + break + elif token.is_type(TokenType.COMMA): + eat_token(stream, TokenType.COMMA) + continue + else: + print_parser_error( + stream, + error_msg=f"Expected COMMA or RPAREN, got {stream.get_token().type()}", + ) + raise ValueError("Expected COMMA or RPAREN") + return FunctionCallNode(identifier_leaf, args) + + +def parse_namespace(stream: TokenStream, identifier_leaf: Identifier) -> Identifier: + namespaces = [identifier_leaf] + while True: + next_separator_token = stream.get_token() + if next_separator_token.is_type(TokenType.COLON_COLON): + eat_token(stream, TokenType.COLON_COLON) + identifier_leaf = Identifier(eat_token(stream, TokenType.IDENTIFIER)) + namespaces.append(identifier_leaf) + else: + break + hacky_token = Token(0, 0, 0, TokenType.IDENTIFIER, "::".join(n.get_name_as_str() for n in namespaces)) + return Identifier(hacky_token) + + +def parse_identifier_or_call_or_access( + stream: TokenStream, +) -> Union[Identifier, FunctionCallNode, OperatorValueExpression, VariableCallNode]: + """ + Parse both variables and function calls. + + This is due to the semantics we do not know whether the identifier will be a + variable name or a function call. + + In fact, this will handle any postfix unary operations. Postfix operators + would have to be different than prefix operators, otherwise we would need to + add backtracking into the parser. E.g. `x+ + x` would require backtracking. + A unique operator, e.g. `x++ + x` would not. + + In the future, it may be an array thing too array[100]. + """ + id_token = eat_token(stream, TokenType.IDENTIFIER) + identifier_leaf = Identifier(id_token) + + token = stream.get_token() + if token.is_type(TokenType.COLON_COLON): + identifier_leaf = parse_namespace(stream, identifier_leaf) + + token = stream.get_token() + if token.is_type(TokenType.LPAREN): + return parse_func_call_args(stream, identifier_leaf) + elif token.is_type(TokenType.LSQB): + raise ValueError("accesses not supported yet... i.e. 
`x[100]`") + else: + return VariableCallNode(identifier_leaf) + + +# TODO(dchu): figure out why this is called "primary" +def parse_primary(stream: TokenStream) -> ValueExpression: + """Helper functions for parsing identifiers, literals, and parenthetic expressions.""" + token = stream.get_token() + if token.is_type(TokenType.IDENTIFIER): + return parse_identifier_or_call_or_access(stream) + elif token.type() in LITERAL_TOKENS: + return parse_literal(stream) + elif token.is_type(TokenType.LPAREN): + return parse_parenthetic_expression(stream) + else: + error_msg = f"unrecognized primary {token}" + print_parser_error(stream, error_msg) + raise ValueError(error_msg) + + +def parse_value_expression(stream: TokenStream) -> ValueExpression: + lhs = parse_primary(stream) + assert lhs is not None + return parse_binop_rhs(stream, 0, lhs) + + +def parse_binop_rhs( + stream: TokenStream, min_expression_precedence: int, lhs: ValueExpression +) -> ValueExpression: + """ + Inputs + ------ + + * min_expression_precedence: int - min operator precedence that function is + allowed to eat. + """ + while True: + binop_token = stream.get_token() + binop_token_precedence = get_binop_precedence(binop_token) + + # Exit if the token has a lower precedence than what we're allowed to + # consume. This could be for a variety of reasons: if we pass an invalid + # binop (which is OK), if the token is None (representing the end of the + # stream), or if it is a binop with too low precedence. + if binop_token_precedence < min_expression_precedence: + return lhs + + stream.next_token() + rhs = parse_primary(stream) + assert rhs + + # TODO(dchu): I have no idea what this is actually doing. I just copied + # it from https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.html + token = stream.get_token() + next_prec = get_binop_precedence(token) + if binop_token_precedence < next_prec: + rhs = parse_binop_rhs(stream, binop_token_precedence + 1, rhs) + assert rhs + + binop = OperatorLiteral(binop_token, OperatorType.BINARY_INFIX) + lhs = OperatorValueExpression(binop, lhs, rhs) + + +################################################################################ +### PARSE VALUE EXPRESSIONS +################################################################################ +def parse_type_expression(stream: TokenStream) -> TypeExpression: + # TODO(dchu): enable compound types, e.g. Int[0:255] + # Note: we only accept single-identifier expressions for types! + data_type_token = eat_token(stream, TokenType.IDENTIFIER) + data_type = Identifier(data_type_token) + return data_type + + +################################################################################ +### FUNCTION DEFINITION +################################################################################ +def parse_statement_in_function_body( + stream: TokenStream, +) -> FUNCTION_STATEMENTS: + token = stream.get_token() + if token.is_type(TokenType.LET): # Local variable + return parse_let_statement(stream) + elif token.is_type(TokenType.RETURN): + eat_token(stream, TokenType.RETURN) + result = parse_value_expression(stream) + eat_token(stream, TokenType.SEMICOLON) + return ReturnNode(result) + # TODO(dchu): if, while, for loops + else: + result = parse_value_expression(stream) + eat_token(stream, TokenType.SEMICOLON) + return result + + +def parse_function_definition(stream: TokenStream) -> FunctionDefinitionNode: + """ + Parse function definition. + + E.g. 
`function func_name(param: type) {...}` + """ + eat_token(stream, TokenType.FUNCTION) # Eat 'function' + # TODO(dchu): Optional for anonymous functions. + name = eat_token(stream, TokenType.IDENTIFIER) # Eat function name + name = Identifier(name) + # TODO(dchu): add generics parsing here! + pass # Eat generics + + # Parameters + params = [] + eat_token(stream, TokenType.LPAREN) + while True: + token = stream.get_token() + if token.is_type(TokenType.IDENTIFIER): + params.append(parse_variable_definition(stream)) + else: + # NOTE: this is somewhat redundant. It is only useful to check for a + # ')' immediately after the opening '('; on all other iterations, it + # servers no purpose. + eat_token(stream, TokenType.RPAREN) + break + token = stream.get_token() + if token.is_type(TokenType.COMMA): + eat_token(stream, TokenType.COMMA) + continue + elif token.is_type(TokenType.RPAREN): + eat_token(stream, TokenType.RPAREN) + break + else: + print_parser_error(stream, error_msg=f"error!") + raise ValueError(f"unexpected token type: {repr(token)}") + + # Return type. Supports single word type for now. + eat_token(stream, TokenType.ARROW) + ret_t = parse_type_expression(stream) + + # Eat '{' + eat_token(stream, TokenType.LBRACE) + + # Parse body + # TODO(dchu): figure out what to do here. https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.html + body = [] + while True: + if stream.get_token().is_type(TokenType.RBRACE): + break + statement = parse_statement_in_function_body(stream) + body.append(statement) + # Eat '}' + eat_token(stream, TokenType.RBRACE) + return FunctionDefinitionNode(name, params, ret_t, body) + + +def parse_variable_definition(stream: TokenStream) -> VariableDefinitionNode: + """Parse " [: ] [= ]" expression. + + E.g. `one_hundred: int = 100;`""" + name_token = eat_token(stream, TokenType.IDENTIFIER) + identifier = Identifier(name_token) + data_type: TypeExpression = None + value: ValueExpression = None + if stream.get_token().is_type(TokenType.COLON): + eat_token(stream, TokenType.COLON) + data_type = parse_type_expression(stream) + if stream.get_token().is_type(TokenType.EQUAL): + eat_token(stream, TokenType.EQUAL) + value = parse_value_expression(stream) + return VariableDefinitionNode(identifier, data_type, value) + + +def parse_let_statement(stream: TokenStream) -> VariableDefinitionNode: + """Parse 'let' statement either inside or outside of a function.""" + eat_token(stream, TokenType.LET) + result = parse_variable_definition(stream) + eat_token(stream, TokenType.SEMICOLON) + return result + + +################################################################################ +### IMPORT MODULE STATEMENT +################################################################################ +def parse_module_import_statement(stream: TokenStream) -> ImportModuleNode: + """ + Parse import statement outside of a function. + + E.g. `module io = import("stdio.h");` + """ + # TODO(dchu): this is deprecated because eventually we will have namespaces + # and let statements all be one thing. 
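    # Concretely, `module io = import("stdio.h");` is consumed as the fixed
    # token sequence MODULE, IDENTIFIER, EQUAL, IMPORT, LPAREN, STRING,
    # RPAREN, SEMICOLON, matching the eat_token() calls below, in that order.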
+    eat_token(stream, TokenType.MODULE)
+    name = eat_token(stream, TokenType.IDENTIFIER)
+    eat_token(stream, TokenType.EQUAL)
+    eat_token(stream, TokenType.IMPORT)
+    eat_token(stream, TokenType.LPAREN)
+    library = eat_token(stream, TokenType.STRING)
+    eat_token(stream, TokenType.RPAREN)
+    eat_token(stream, TokenType.SEMICOLON)
+    return ImportModuleNode(Identifier(name), StringLiteral(library))
+
+
+def parse(stream: TokenStream) -> List[ASTNode]:
+    result = []
+    while stream.get_token() is not None:
+        token = stream.get_token()
+        if token.is_type(TokenType.FUNCTION):
+            result.append(parse_function_definition(stream))
+        elif token.is_type(TokenType.MODULE):
+            result.append(parse_module_import_statement(stream))
+        elif token.is_type(TokenType.LET):  # Global variable
+            result.append(parse_let_statement(stream))
+        else:
+            raise ValueError(f"Unexpected token: {token}")
+    return result
diff --git a/prototype/lol_parser_helper.py b/prototype/parser/lol_parser_token_stream.py
similarity index 73%
rename from prototype/lol_parser_helper.py
rename to prototype/parser/lol_parser_token_stream.py
index da35fc2..a7dd0f7 100644
--- a/prototype/lol_parser_helper.py
+++ b/prototype/parser/lol_parser_token_stream.py
@@ -1,31 +1,35 @@
-from typing import List
-
-from lol_lexer import Token
-
-
-class TokenStream:
-    """Semantics taken from CharacterStream"""
-
-    def __init__(self, src: List[Token], text: str = None) -> None:
-        self.text = text
-        self.src = src
-        self.idx = 0
-
-    def get_text(self) -> str:
-        return self.text
-
-    def get_token(self) -> Token:
-        """Get the current token or return None"""
-        if self.idx >= len(self.src):
-            return None
-        return self.src[self.idx]
-
-    def next_token(self):
-        """Advance to the next token"""
-        t = self.get_token()
-        if t is None:
-            return
-        self.idx += 1
-
-    def get_pos(self):
-        return self.idx
+from typing import List
+
+from prototype.lexer.lol_lexer import Token
+
+
+class TokenStream:
+    """Semantics taken from CharacterStream"""
+
+    def __init__(self, src: List[Token], text: str = None) -> None:
+        self.text = text
+        self.src = src
+        self.idx = 0
+
+    def get_text(self) -> str:
+        return self.text
+
+    def get_token(self) -> Token:
+        """
+        Get the current token or return None if at the end.
+
+        N.B. Does NOT advance the token!
+        """
+        if self.idx >= len(self.src):
+            return None
+        return self.src[self.idx]
+
+    def next_token(self):
+        """Advance to the next token."""
+        t = self.get_token()
+        if t is None:
+            return
+        self.idx += 1
+
+    def get_pos(self):
+        return self.idx
diff --git a/prototype/parser/lol_parser_types.py b/prototype/parser/lol_parser_types.py
new file mode 100644
index 0000000..a092927
--- /dev/null
+++ b/prototype/parser/lol_parser_types.py
@@ -0,0 +1,360 @@
+from abc import ABCMeta, abstractmethod
+from enum import Enum, auto, unique
+from typing import List, Tuple, Union
+
+from prototype.lexer.lol_lexer import Token
+
+
+@unique
+class OperatorType(Enum):
+    UNARY_PREFIX = auto()
+    UNARY_POSTFIX = auto()
+    BINARY_INFIX = auto()
+
+
+################################################################################
+### AST ABSTRACT CLASSES
+################################################################################
+# Might have to be an Abstract Base Class?
+class ASTNode(metaclass=ABCMeta):
+    @abstractmethod
+    def to_dict(self):
+        raise NotImplementedError("abc")
+
+    def __repr__(self) -> str:
+        return repr(self.to_dict())
+
+
+class Expression(ASTNode, metaclass=ABCMeta):
+    pass
+
+
+class ValueExpression(Expression, metaclass=ABCMeta):
+    pass
+
+
+class TypeExpression(Expression, metaclass=ABCMeta):
+    pass
+
+
+class ASTLeaf(ASTNode, metaclass=ABCMeta):
+    def __init__(self, token: Token):
+        self.token = token
+
+
+class Literal(ASTLeaf, ValueExpression, metaclass=ABCMeta):
+    """The literals.
+
+    N.B. Type expressions cannot contain literals yet. Eventually, they should
+    be able to, e.g. `x: int[100]`.
+    """
+
+    def __init__(self, token: Token):
+        super().__init__(token)
+        self.value = None
+
+
+################################################################################
+### OPERATOR LITERAL
+################################################################################
+class OperatorLiteral(ASTLeaf):
+    """An operator, such as '+'."""
+
+    def __init__(self, token: Token, operator_type: OperatorType):
+        super().__init__(token)
+        self._operator_type = operator_type
+
+    def get_operator_type_as_str(self):
+        return self._operator_type.name
+
+    def to_dict(self):
+        return dict(
+            type=self.__class__.__name__,
+            operator=self.token.lexeme,
+            operator_type=self._operator_type.name,
+        )
+
+
+################################################################################
+### OPERATOR NODES
+################################################################################
+class OperatorValueExpression(ValueExpression):
+    """
+    Operator expressions containing a value (as opposed to a type).
+
+    E.g. `10 + 10` vs `int[100]`
+
+    The position of the operand is important if it has differing semantics
+    between positions. E.g. prefix and postfix operators: `++x` vs `x++`.
+    """
+
+    def __init__(
+        self,
+        operator: OperatorLiteral,
+        *operands: ValueExpression,  # i.e. each argument is a ValueExpression
+    ):
+        self._operator = operator
+        self._operands = operands
+
+    def get_operator_as_str(self) -> str:
+        return self._operator.token.lexeme
+
+    def get_operator_type_as_str(self) -> str:
+        return self._operator.get_operator_type_as_str()
+
+    def get_operands(self) -> Tuple[ValueExpression, ...]:
+        return self._operands
+
+    def to_dict(self):
+        return dict(
+            type=self.__class__.__name__,
+            operator=self._operator.to_dict(),
+            operands=[x.to_dict() for x in self._operands],
+        )
+
+
+################################################################################
+### TYPE OPERATOR NODES
+################################################################################
+class OperatorTypeExpression(TypeExpression):
+    """
+    Operator expressions containing a type (as opposed to a value).
+
+    E.g. `int[100]` vs `10 + 10`
+
+    The position of the operand is important if it has differing semantics
+    between positions. E.g. prefix and postfix operators: `++x` vs `x++`.
+    """
+
+    def __init__(
+        self,
+        operator: OperatorLiteral,
+        *operands: TypeExpression,
+        operator_type: OperatorType = None,  # May be unspecified for binary ops
+    ):
+        self._operator = operator
+        self._operands = operands
+
+    def to_dict(self):
+        return dict(
+            type=self.__class__.__name__,
+            operator=self._operator.to_dict(),
+            operands=[x.to_dict() for x in self._operands],
+        )
+
+
+################################################################################
+### LITERALS
+################################################################################
+class DecimalLiteral(Literal):
+    def __init__(self, token: Token):
+        super().__init__(token)
+        self.value = int(token.lexeme)
+
+    def to_dict(self):
+        return dict(type=self.__class__.__name__, value=self.value)
+
+
+class StringLiteral(Literal):
+    def __init__(self, token: Token):
+        super().__init__(token)
+        # TODO(dchu): Parse string such that "hello, world\n" has the characters
+        # TODO(dchu): suitably replaced.
+        self.value = token.lexeme[1:-1]  # Strip surrounding quotation marks
+
+    def to_dict(self):
+        return dict(type=self.__class__.__name__, value=self.value)
+
+
+class Identifier(ASTLeaf, TypeExpression):
+    def __init__(self, token: Token):
+        super().__init__(token)
+
+    def get_name_as_str(self):
+        return self.token.as_str()
+
+    def to_dict(self):
+        return dict(type=self.__class__.__name__, name=self.token.lexeme)
+
+
+################################################################################
+### VARIABLE NODES
+################################################################################
+class VariableNode(ASTNode, metaclass=ABCMeta):
+    pass
+
+
+class VariableDefinitionNode(VariableNode):
+    def __init__(
+        self,
+        name: Identifier,
+        data_type: TypeExpression,
+        value: Union[ValueExpression, None],
+    ):
+        self._name = name
+        self._data_type = data_type
+        self._value = value
+
+    def get_name_as_str(self) -> str:
+        return self._name.token.lexeme
+
+    def get_data_type(self):
+        return self._data_type
+
+    def get_value(self):
+        return self._value
+
+    def to_dict(self):
+        to_dict_or_none = lambda x: x.to_dict() if x is not None else None
+        return dict(
+            type=self.__class__.__name__,
+            name=self._name.to_dict(),
+            data_type=to_dict_or_none(self._data_type),
+            value=to_dict_or_none(self._value),
+        )
+
+
+class VariableModificationNode(VariableNode):
+    def __init__(
+        self,
+        name: Identifier,
+        value: Union[ValueExpression, None],
+    ):
+        self._name = name
+        self._value = value
+
+    def get_name_as_str(self) -> str:
+        return self._name.token.lexeme
+
+    def get_value(self):
+        return self._value
+
+    def to_dict(self):
+        to_dict_or_none = lambda x: x.to_dict() if x is not None else None
+        return dict(
+            type=self.__class__.__name__,
+            name=self._name.to_dict(),
+            value=to_dict_or_none(self._value),
+        )
+
+
+class VariableCallNode(VariableNode, ValueExpression):
+    """Use this when the value of the identifier is being used."""
+
+    def __init__(self, name: Identifier):
+        self._name = name
+
+    def get_name_as_str(self):
+        return self._name.token.lexeme
+
+    def to_dict(self):
+        return dict(
+            type=self.__class__.__name__,
+            name=self._name.to_dict(),
+        )
+
+
+################################################################################
+### FUNCTION NODES
+################################################################################
+class FunctionNode(ASTNode, metaclass=ABCMeta):
+    pass
+
+
+class FunctionDefinitionNode(FunctionNode):
+    def __init__(
+        self,
+        name: Identifier,
+        parameters: List[VariableDefinitionNode],
+        return_type: TypeExpression,
+        body: List[ASTNode],
+    ):
+        self._name = name
+        self._parameters = parameters
+        self._return_type = return_type
+        self._body = body
+
+    def get_name_as_str(self):
+        return self._name.token.lexeme
+
+    def get_parameters(self):
+        return self._parameters
+
+    def get_return_type(self):
+        return self._return_type
+
+    def get_body(self):
+        return self._body
+
+    def to_dict(self):
+        return dict(
+            type=self.__class__.__name__,
+            name=self._name.to_dict(),
+            parameters=[p.to_dict() for p in self._parameters],
+            return_type=self._return_type.to_dict(),
+            body=[b.to_dict() for b in self._body],
+        )
+
+
+class FunctionCallNode(FunctionNode, ValueExpression):
+    # Include generics in function call?
+    def __init__(self, name: Identifier, arguments: List[ValueExpression]):
+        self._name = name
+        self._arguments = arguments
+
+    def get_name_as_str(self):
+        return self._name.token.lexeme
+
+    def get_arguments(self):
+        return self._arguments
+
+    def to_dict(self):
+        return dict(
+            type=self.__class__.__name__,
+            name=self._name.to_dict(),
+            arguments=[a.to_dict() for a in self._arguments],
+        )
+
+
+################################################################################
+### IMPORT MODULE NODES
+################################################################################
+class ImportModuleNode(ASTNode):
+    def __init__(self, name: Identifier, library: StringLiteral) -> None:
+        super().__init__()
+        self._name = name
+        self._library = library
+
+    def get_name_as_str(self):
+        return self._name.token.lexeme
+
+    def get_library_as_str(self):
+        # N.B. this may be a raw string value. Injection attacks? Or just plain
+        # ugly messes.
+        return self._library.value
+
+    def to_dict(self):
+        return dict(
+            type=self.__class__.__name__,
+            name=self._name.to_dict(),
+            expression=self._library.to_dict(),
+        )
+
+
+################################################################################
+### FUNCTION BODY NODES
+################################################################################
+# TODO(dchu): if-else, while, for, etc.
+class ReturnNode(ASTNode):
+    def __init__(self, expression: ValueExpression) -> None:
+        super().__init__()
+        self._expression = expression
+
+    def get_expression(self):
+        return self._expression
+
+    def to_dict(self):
+        return dict(
+            type=self.__class__.__name__,
+            expression=self._expression.to_dict(),
+        )
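
Note on the precedence-climbing loop in parse_binop_rhs: it follows the LLVM Kaleidoscope tutorial linked in the TODO. The sketch below is a minimal, standalone illustration of that same control flow; it uses a hypothetical precedence table and plain tuples rather than the prototype's TokenStream and AST classes, so none of its names come from this patch.

# Standalone sketch of the precedence-climbing algorithm used by
# parse_binop_rhs(). The token stream and AST shapes are simplified
# stand-ins, not the prototype's real types.
from typing import List, Tuple, Union

PRECEDENCE = {"+": 10, "-": 10, "*": 20, "/": 20}  # hypothetical table

Expr = Union[int, Tuple[str, "Expr", "Expr"]]


def parse_expression(tokens: List[str]) -> Expr:
    pos = 0

    def peek() -> Union[str, None]:
        return tokens[pos] if pos < len(tokens) else None

    def next_token() -> str:
        nonlocal pos
        token = tokens[pos]
        pos += 1
        return token

    def parse_primary() -> Expr:
        return int(next_token())  # integer literals only in this sketch

    def parse_binop_rhs(min_prec: int, lhs: Expr) -> Expr:
        while True:
            op = peek()
            prec = PRECEDENCE.get(op, -1)
            if prec < min_prec:
                return lhs  # end of stream, non-operator, or binds too weakly
            next_token()  # eat the operator
            rhs = parse_primary()
            # If the upcoming operator binds tighter, let it claim `rhs` first.
            next_prec = PRECEDENCE.get(peek(), -1)
            if prec < next_prec:
                rhs = parse_binop_rhs(prec + 1, rhs)
            lhs = (op, lhs, rhs)  # fold into a left-leaning tree

    lhs = parse_primary()
    return parse_binop_rhs(0, lhs)


# "1 + 2 * 3 - 4" parses as ('-', ('+', 1, ('*', 2, 3)), 4)
print(parse_expression(["1", "+", "2", "*", "3", "-", "4"]))

The key step is the look-ahead comparison: when the next operator has higher precedence than the one just consumed, the right-hand side is re-parsed recursively with a raised minimum precedence, which is what makes 2 * 3 a subtree of the + node above.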