diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..7339bb0 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,24 @@ +name: Lint Code Base + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + run-linter: + runs-on: ubuntu-latest + steps: + - name: Check out Git repository + uses: actions/checkout@v2 + + - name: Run Black (Python) + uses: psf/black@stable + with: + options: | + --verbose + --line-length=80 + --exclude /(\.github|\.git|\.venv|\.vscode)/ + src: "." + version: "22.3.0" diff --git a/.github/workflows/linux_compiler_test.yml b/.github/workflows/linux_compiler_test.yml new file mode 100644 index 0000000..675a118 --- /dev/null +++ b/.github/workflows/linux_compiler_test.yml @@ -0,0 +1,27 @@ +name: Linux Compiler CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + run-linux-python3: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Compile + run: | + # For some reason, if we change to the compiler directory, Python complains. + export PYTHONPATH="${PYTHONPATH}:/home/runner/work/lolc/lolc/src/" + for x in fibonacci helloworld math_ops nested_if sum_three + do + python src/compiler/lol.py -i examples/$x.lol -o results + gcc results/$x-*.c + ./a.out + done diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b1f143c --- /dev/null +++ b/.gitignore @@ -0,0 +1,68 @@ +# Ignore PyCharm and VS Code directory +.idea +.vscode/ +# Ignore Python Venv +.venv/ +venv/ + +# Ignore Python Caches +/results/ +__pycache__/ + +# Ignore CMake's build directory +build/ +cmake/ +test/output/* + +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..97974d0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 David Chu + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..58aa762
--- /dev/null
+++ b/README.md
@@ -0,0 +1,69 @@
+# Light Object-Oriented Language (LOL)
+
+## Project Naming
+
+I am trying to come up with an appropriate name for this project.
+I like "Light Object Language" (because lol), but it's not really object-based.
+Maybe "Light Open-Source Language" is better.
+I was thinking of something short and close to the beginning of the alphabet (e.g.
+"Ah").
+I was musing over design objectives as well; my goals for this language are to be
+(1) secure, (2) usable, and (3) performant, in that order (or maybe not strictly).
+This would lead to the acronym "sup", like "wassup".
+As a mature individual (the "lol" notwithstanding), I'm not sure if this would be
+great; it would also put my language in the middle-toward-the-end of the alphabet,
+i.e. the most forgettable place ever.
+
+## Project Goals
+
+1. Expose the internals of the transpiler to the user (if they so choose).
+2. Provide a (1) secure, (2) usable, and (3) performant language, in that order.
+3. Allow the programmer to dump as much information into the compiler as they wish.
+How much is used by the compiler is another question. (Is this a good idea?)
+4. Don't make silly features.
+
+## Bootstrapping
+
+This project is to be bootstrapped in Python. Since it targets C, I can choose any
+language for the bootstrap. I chose Python because of its rich standard library,
+and I am faster at writing Python than C.
+
+## Language Features
+
+Make safety and usability/intuitiveness the priority. Then, performance can be
+added for a little extra work (since much of the optimization will be done on a
+small amount of the code).
+
+- Drop-in replacement for C
+  - Struct fields are laid out in declaration order
+  - Public functions, structs, etc. are linked with C
+  - Initially, I will emit C code (and then maybe target LLVM-IR or some other compiler framework's IR)
+- Ability to give compile-time information (in square brackets)
+  - This can include ranges on integers (which must be proven at compile time or upon conversion, and bounds-checked at runtime)
+- Inspired by Rust
+  - Memory safety, immutability, "reserved" (in C's context) by default
+  - Mark unowned data as "unowned"; otherwise, borrow check everything (but then lifetimes would be annoying to implement... how do they even work?)
+  - Enum type for errors -- but we could use unused portions of integer/other ranges (see above, compile-time information)
+  - Traits are cool
+  - I like the fact that there is no inheritance
+- Inspired by Zig
+  - No macros/preprocessor; compile-time running of any function
+  - I like the idea of having the memory allocator specified--efficient allocation is a huge problem, so it would be cool to specify something with the "allocator trait", to use Rust's terminology
+- Inspired by Python
+  - Types can store methods--but maybe use the "::" syntax for any compile-time namespace stuff
+    - E.g. `int16::max` -- I guess this is a language feature and not in keeping with putting things in the standard library
+  - Have namespaces like Python, where an import keeps the imported module's name as a namespace
+    - E.g. `import math; math.sqrt(10)` instead of C++'s `#include <cmath>` and `std::sqrt`
+    - I actually want to go further and use Nix's import syntax of `math = import("math");` or something... it's been a while since I wrote Nix
+  - Python's constructor syntax makes more intuitive sense to me than C++'s
+    - E.g. `x: ClassName = ClassName(arg0, arg1)` rather than `ClassName x(arg0, arg1);`.
+      I just feel like the `=` makes everything clearer.
+- Inspired by C
+  - Limited language features
+  - Optional standard library (I'll need to write wrappers for the C standard library)
+  - By default, struct layouts are like in C (which I believe is the order the user specified?)
+  - It is transparent what exactly is happening. No hidden function calls, no hidden operations (e.g. `<<` can be overloaded in C++)
+  - Syntax is inspired by the C family
+- Inspired by C++
+  - Generics with templates. I'm going to use Python/Go's syntax
+- Inspired by Java
+  - The name "interface" instead of Rust's "trait" makes more sense to me... I may just be missing the full idea of Rust traits.
diff --git a/docs/CODING_STYLE.md b/docs/CODING_STYLE.md
new file mode 100644
index 0000000..4dfe9a6
--- /dev/null
+++ b/docs/CODING_STYLE.md
@@ -0,0 +1,181 @@
+# Coding Style
+
+## Definitions
+
+* Integral Value: any `char`, `int`, `enum`, etc.
+
+## Compiler Version
+The code in this project should be as compatible with C89 as possible. This is
+because C89 remains the best-supported version of C (e.g. by MSVC). The compiler
+shall be called with `-std=c99 -pedantic -Wall -Werror` (ohhh, this pains me. I
+wish I could ask you to use C89, but it is so limiting).
+
+However, C99 has some nice-to-have features. I will admit that I really want to
+use the following:
+
+* IO Functionality: `snprintf()` (get the number of characters used) and
+`printf("%zu", (size_t)x);`
+* Designated Initializers: `struct point p = {.x = 0, .y = 0};`
+* New Libraries: `<stdbool.h>` and `<stdint.h>`
+  * Make your own boolean library if you are stuck on C89.
+* Keywords: `inline`, `restrict`
+  * In this project, `#define` them away to nothing if you are compiling
+  with C89.
+* Variable declarations anywhere: `for (int i = 0; i < N; ++i) { ... }`
+  * Try to avoid this in this project
+* Calls with Compound Literals: `f(((struct point){.x = 0, .y = 0}))`
+  * Try to avoid this in this project
+* C++ Style Comments: `// This is an inline comment`
+  * Don't use this in this project.
+
+Needless to say, some of these are excellent additions to the language.
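+
+A sketch of the C89 fallback for those keywords (this assumes
+`__STDC_VERSION__`, which C99 compilers must define and pre-C99 compilers
+generally do not):
+
+```c
+/* Compile the C99 keywords away when building as C89. */
+#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
+#define inline
+#define restrict
+#endif
+```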
+
+## Portability
+While passing or returning structures from functions is an established part of
+the C89 language, the binary implementation is compiler-dependent. While some
+compilers may use registers, others may use memory. For this reason, we will
+not pass structures between functions, but rather pass _pointers_ to structures.
+
+Strictly speaking, external identifiers in C89 are only guaranteed to be
+significant to 6 characters; internal identifiers are only guaranteed to be
+significant to 31 characters. We will ignore this. This is simply a matter of
+refactoring the code.
+
+## Safety Conventions
+
+### Initialization
+All invalid pointers shall be set to NULL. My `mem_malloc()` function will
+actually enforce this. A difficulty arises if we allocate a structure
+containing pointers, because the freshly allocated structure may hold non-null
+garbage pointers. It is the user's job to set all these pointers to NULL as
+soon as the memory is allocated. The function `mem_malloc()` does not do this.
+
+All numeric values shall be set to `0`.
+
+All structures shall be initialized using `{ 0 }`.
+
+### Bracing
+All groups of code shall be braced (including no code). I believe this is in
+line with MISRA's standard.
+
+```c
+while (x-- > 0) {
+    /* no op */
+}
+```
+
+Yes, I know that you can put a semicolon at the end of the while statement. But
+don't.
+
+
+### Switch Statements
+Switch statements should not have fall-through unless it is explicitly marked.
+Moreover, every switch should have a default, even if it is impossible to
+reach. If it is deemed impossible to hit the default, then an assertion should
+be placed there.
+
+### Increasing Integral Values
+For any of the operators that grow an integral value, the bounds must be checked
+before applying the operator. These include `x + y`, `x << y`, and `x * y`. For
+unsigned multiplication, an idiom to check validity is to apply the operator,
+then apply the inverse, and check for equality: `x == 0 || (x * y) / x == y`
+implies a valid multiplication. (This inverse trick does not work for unsigned
+addition, which always round-trips after wrapping; check `x + y >= x` instead.
+For signed types, overflow is undefined behaviour, so compare against the
+type's limits before applying the operator; a sketch appears at the end of
+this document.)
+
+It is for this reason that my memory functions take in two arguments, so that
+the user does not have to check the bounds on multiplication themselves.
+
+Admittedly, the `++i` or `i++` operators can overflow. However, we will ignore
+this fact for now. In a for-loop, they are checked upon every iteration.
+
+### Division
+A check for zero must be performed before doing any division operation. These
+include `x / y` and `x % y`.
+
+### Right Bit Shifting with Signed Integrals
+Don't right bit shift with signed integrals. The implementation is compiler-
+specific with regard to the sign extension.
+
+Or at the very least, never do this with a negative signed integral. If I
+remember correctly, positive signed integrals behave like unsigned integrals
+for right bit shifts.
+
+### Bitwise Operation Ordering
+The evaluation order of the operands of bitwise operations is unspecified.
+There is no short-circuit logic, unlike logical boolean operations.
+
+### Side Effects in Function Calls
+No function call may use arguments with side-effects.
+
+At the very least, do not rely on a particular order of side-effects when
+calling a function. All of the side-effects will take place before the called
+function is entered; however, the order is compiler-dependent.
+
+### Strings
+Unless the string appears directly as a `""` literal in the code, do not rely
+on it being null-terminated -- especially not if it was copied into a buffer,
+unless you explicitly copied in a null character at the end.
+
+Where possible, store the length of a string along with the string.
+
+Do not use unsafe standard library functions that rely on strings being null-
+terminated. Only use ones that operate upon a known number of bytes.
+
+
+## Conventions
+
+### Braces and If-Else Chains
+The structure of braces shall follow K&R. The braces for functions shall follow
+this pattern (contrary to K&R's usage).
+
+```c
+int function(int x, int y) {
+    if (x) {
+        /* ... */
+    } else if (y) {
+        /* ... */
+    } else {
+        /* ... */
+    }
+}
+```
+
+### Labels and Switches
+Labels shall be indented 1 less than the surrounding code.
+
+```c
+int function(int x) {
+    switch (x) {
+    case 0:
+        /* ... */
+    case 1:
+        /* ... */
+    case 2:
+        /* ... */
+    default:
+        /* ... */
+    }
+
+    goto cleanup;
+
+cleanup:
+    /* ... */
+}
+```
+
+### Use of Assertions
+Use assertions where it is impossible for the code to reach a given point.
+Otherwise, use a return statement for ease of testing. This means that
+assertions can be used liberally as executable comments.
+
+Assertions should not be used in deployment code for legitimate error cases
+(e.g. NULL error handling). This makes the code untestable (because a failed
+assertion exits the test program).
+
+### Use of Macros as Functions
+
+If a macro behaves entirely like a function (i.e. arguments evaluated exactly
+once each, no manipulation of variables), then it can be named following the
+function naming conventions.
+
+Otherwise, it shall be upper-case to warn the user of potentially funky
+behaviour.
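+
+### Appendix: Checked Signed Addition
+
+A minimal sketch of the bounds-check-before-operating rule from "Increasing
+Integral Values", for signed addition (where the apply-and-invert idiom is
+unavailable because signed overflow is undefined behaviour). The helper name
+`checked_add` is hypothetical; it is not part of this project's code base.
+
+```c
+#include <assert.h>
+#include <limits.h>
+
+/* Store x + y in *out and return 1, or return 0 if the sum would
+ * overflow. The bounds are checked *before* the addition is applied. */
+static int checked_add(int x, int y, int *out)
+{
+    assert(out != NULL);
+    if ((y > 0 && x > INT_MAX - y) || (y < 0 && x < INT_MIN - y)) {
+        return 0;
+    }
+    *out = x + y;
+    return 1;
+}
+```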
diff --git a/docs/PORTABILITY_IN_C.md b/docs/PORTABILITY_IN_C.md
new file mode 100644
index 0000000..11c65d7
--- /dev/null
+++ b/docs/PORTABILITY_IN_C.md
@@ -0,0 +1,46 @@
+Notes on Portability in C
+=========================
+
+1. Variable Names
+    * Internal variables are only guaranteed unique for the first 31 characters
+      (so you can apparently stick an arbitrary number of insignificant
+      characters after that?).
+    * External variables (and function names) are only guaranteed unique for the
+      first 6 characters. I think this refers specifically to external function
+      names.
+2. Using Libraries
+    * Use the standard libraries where possible to promote portability. For
+      example, the character encoding is not guaranteed to be ASCII. This also
+      means one should use `'a'` to represent characters rather than numeric
+      values.
+3. Global Variables
+    * Declare global variables as `extern <type> <name>;` at the top of the
+      function if you want the user to be able to quickly see that the function
+      uses global variables.
+    * I am not sure whether the variable's scope has to be the whole project or
+      just the translation unit (or either?).
+4. Explicitly Mark Unsigned and Long Numbers
+    * `123L` is a `long`, `123U` is an `unsigned`, and `123UL` is an
+      `unsigned long`.
+    * For floating-point numbers, the default is `double`. That is, `123.0` is a
+      `double`, `123.0F` is a `float`, and `123.0L` is a `long double`.
+5. Immutability of `const`
+    * Do not modify a `const` variable. Doing so is machine-dependent.
+6. Constraints on Numeric Values with Certain Operators
+    * Use `%` only on integral types.
+    * Use `%` and `/` only with positive numbers, because whether truncation is
+      up or down is machine-dependent, as is underflow/zero/overflow behaviour.
+    * Do not compare unsigned to negative signed values, because the negative
+      signed value will be promoted.
+        * E.g. `-1L < 1U`, because `1U` is promoted to `1L` (the promotion is
+          ok). BUT `-1L > 1UL`, because `-1L` is converted to `ULONG_MAX`.
+    * Do not use `>>` on negative signed values. Whether to pad with 1 or 0 is
+      machine-dependent.
+7. Automatic casts occur if the function's prototype is available.
+    * E.g. `double sqrt(double x)` means `sqrt(12L)` is equivalent to
+      `sqrt((double)12L)`.
+8. Wrap assignments whose value you take in parentheses.
+    * E.g. `if (!(x = get_value())) { /* ... */ }`.
+9. Do not convert `double` to `float`, because rounding/truncation is
+   machine-dependent.
+10. True is non-zero, not necessarily 1. This means: do not check
+    `if (x == true)`; check `if (x != false)` or `if (x)`.
diff --git a/docs/STANDARD_LIBRARY.md b/docs/STANDARD_LIBRARY.md
new file mode 100644
index 0000000..4c4c575
--- /dev/null
+++ b/docs/STANDARD_LIBRARY.md
@@ -0,0 +1,425 @@
+# Using the Standard Library
+
+This file follows the structure of **Appendix B** in Brian Kernighan and Dennis
+Ritchie's _The C Programming Language, Edition 2_ (i.e. "K&R"), which itself is
+based on the C89/90 Standard.
+
+Large sections were copied verbatim with the understanding that the sections
+were taken from the Standard.
+
+An idiom that we will use for standard library calls is as follows:
+
+```c
+if (errno) {
+    return errno;
+}
+/* $r is the status or error condition */
+if (/* error condition in standard library call */) {
+    assert(errno && "errno not set upon standard library error!"); /* if applicable */
+    /* DO CLEAN UP HERE */
+    return errno;
+}
+assert(errno == 0 && "errno set upon no error!");
+/* CONTINUE OTHER WORK HERE */
+```
+
+Take an example using the standard library function, `calloc`.
+
+```c
+int r = 0;
+int *ptr = NULL;
+
+if (errno) {
+    return errno;
+}
+if (N && (ptr = calloc(N, sizeof *ptr)) == NULL) {
+    assert(errno == ENOMEM && "errno not set upon calloc error!");
+    /* No cleanup required here */
+    return errno;
+}
+/* CONTINUE OTHER WORK HERE */
+```
+
+## Contents
+
+1. Input and Output (`<stdio.h>`)
+2. Character Class Tests (`<ctype.h>`)
+3. String Functions (`<string.h>`)
+4. Mathematical Functions (`<math.h>`)
+5. Utility Functions (`<stdlib.h>`)
+6. Diagnostics (`<assert.h>`)
+7. Variable Argument Lists (`<stdarg.h>`)
+8. Non-Local Jumps (`<setjmp.h>`)
+9. Signals (`<signal.h>`)
+10. Date and Time Functions (`<time.h>`)
+11. Implementation-Defined Integral Limits (`<limits.h>`)
+12. Implementation-Defined Floating-Point Limits (`<float.h>`)
+
+
+## Input and Output (`<stdio.h>`)
+
+### File Operations
+Function | Description
+:--------|:---------------------------------------------------------------------
+`FILE *fopen(const char *filename, const char *mode)` |
+`FILE *freopen(const char *filename, const char *mode, FILE *stream)` |
+`int fflush(FILE *stream)` |
+`int fclose(FILE *stream)` |
+`int remove(const char *filename)` |
+`int rename(const char *oldname, const char *newname)` |
+`FILE *tmpfile(void)` |
+`char *tmpnam(char s[L_tmpnam])` |
+`int setvbuf(FILE *stream, char *buf, int mode, size_t size)` |
+`void setbuf(FILE *stream, char *buf)` |
+
+### Formatted Output
+Function | Description
+:-----------------------------------------------------------|:------------------
+`int fprintf(FILE *stream, const char *format, ...)` |
+`int printf(const char *format, ...)` |
+`int sprintf(char *s, const char *format, ...)` |
+`int vfprintf(FILE *stream, const char *format, va_list arg)` |
+`int vprintf(const char *format, va_list arg)` |
+`int vsprintf(char *s, const char *format, va_list arg)` |
+
+### Formatted Input
+Function | Description
+:---------------------------------------------------|:--------------------------
+`int fscanf(FILE *stream, const char *format, ...)` |
+`int scanf(const char *format, ...)` |
+`int sscanf(char *s, const char *format, ...)` |
+
+### Character Input and Output Functions/Macros
+Function | Description
+:-------------------------------------------|:----------------------------------
+`int fgetc(FILE *stream)` | return the next character of a stream (as an `unsigned char`) or `EOF` if an error occurs
+`char *fgets(char *s, int n, FILE *stream)` | gets at most the next `n-1` characters, stopping upon a newline, which is included in the array. A `'\0'` is included at the end. Returns `s`, or `NULL` upon error.
+`int fputc(int c, FILE *stream)` | writes `c` as an `unsigned char` onto the `stream`. Returns `c`, or `EOF` upon error.
+`int fputs(const char *s, FILE *stream)` | writes the string `s` (which doesn't necessarily contain `'\n'`) on `stream`. Returns non-negative, or `EOF` upon error.
+`int getc(FILE *stream)` | equivalent to `fgetc`, except it may be implemented as a macro (thus evaluating `stream` twice).
+`int getchar(void)` | equivalent to `getc(stdin)`.
+`char *gets(char *s)` | reads the next input line into `s`, replacing the `'\n'` with `'\0'`. Returns `s`, or `NULL` upon end-of-file or error. It cannot limit how many bytes are written, so do not use it.
+`int putc(int c, FILE *stream)` | equivalent to `fputc`, except it may be implemented as a macro (thus evaluating `stream` twice).
+`int putchar(int c)` | equivalent to `putc(c, stdout)`.
+`int puts(const char *s)` |
+`int ungetc(int c, FILE *stream)` | pushes `c` (converted to `unsigned char`) back onto the stream whence it came. It will be returned on the next read. Only one character of pushback is guaranteed per stream. `EOF` may not be pushed back. Returns the pushed-back character, or `EOF` upon error.
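+
+As a sketch of safe line reading with `fgets` (the helper `read_line` is
+hypothetical, not part of this project):
+
+```c
+#include <stdio.h>
+#include <string.h>
+
+/* Read one line into buf, stripping the trailing newline. Returns the
+ * line length, or -1L upon end-of-file or error. */
+static long read_line(char *buf, size_t buf_size, FILE *stream)
+{
+    size_t len;
+    if (fgets(buf, (int)buf_size, stream) == NULL) {
+        return -1L;
+    }
+    len = strlen(buf);
+    if (len > 0 && buf[len - 1] == '\n') {
+        buf[--len] = '\0';
+    }
+    return (long)len;
+}
+```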
+
+### Direct Input and Output Functions
+These perform block (binary) input and output; I have not used them much.
+
+Function | Description
+:-----------------------------------------------------------------------|:------
+`size_t fread(void *ptr, size_t size, size_t nobj, FILE *stream)` | reads from `stream` into `ptr` at most `nobj` objects of the given `size`; returns the number of objects read.
+`size_t fwrite(const void *ptr, size_t size, size_t nobj, FILE *stream)`| writes `nobj` objects of the given `size` from `ptr` onto `stream`; returns the number of objects written (less than `nobj` upon error).
+
+### File Positioning
+Function | Description
+:---------------------------------------------------|:--------------------------
+`int fseek(FILE *stream, long offset, int origin)` |
+`long ftell(FILE *stream)` |
+`void rewind(FILE *stream)` |
+`int fgetpos(FILE *stream, fpos_t *ptr)` |
+`int fsetpos(FILE *stream, const fpos_t *ptr)` |
+
+### Error Functions
+Function | Description
+:-------------------------------|:----------------------------------------------
+`void clearerr(FILE *stream)` | clears end-of-file and error indicators for the stream.
+`int feof(FILE *stream)` | returns non-zero if the end-of-file indicator for stream is set.
+`int ferror(FILE *stream)` | returns non-zero if the error indicator for stream is set.
+`void perror(const char *s)` | `fprintf(stderr, "%s: %s\n", s, "error message")`, c.f. `strerror(errno)`.
+
+
+## Character Class Tests (`<ctype.h>`)
+
+### Character Testing
+The argument `c` is an `int` that is either `EOF` or representable as an
+`unsigned char`. The return value is an `int`; non-zero denotes _true_, while
+zero denotes _false_.
+
+Function | Description
+:---------------|:--------------------------------------------------------------
+`isalnum(c)` | letter or digit character. Implies that `isalpha(c) || isdigit(c)` is true.
+`isalpha(c)` | letter character. Implies that `isupper(c) || islower(c)` is true.
+`iscntrl(c)` | control character.
+`isdigit(c)` | digit character.
+`isgraph(c)` | printing character (excluding space).
+`islower(c)` | lower-case letter.
+`isprint(c)` | printing character (including space).
+`ispunct(c)` | printing character (excluding space, letter, digit).
+`isspace(c)` | space, formfeed, newline, carriage return, tab, vertical tab.
+`isupper(c)` | upper-case letter.
+`isxdigit(c)` | hexadecimal digit.
+
+### Conversions
+Function | Description
+:---------------|:--------------------------------------------------------------
+`int tolower(c)`| convert `c` to lower-case.
+`int toupper(c)`| convert `c` to upper-case.
+
+
+## String Functions (`<string.h>`)
+
+### Null-Terminated Character String Functions
+The arguments `s` and `t` are of type `char *`; the arguments `cs` and `ct` are
+of type `const char *`; the argument `n` is of type `size_t`; and the argument
+`c` is an `int` converted to a `char`.
+
+String Function | Description
+:---------------------------|:--------------------------------------------------
+`char *strcpy(s, ct)` | do not use.
+`char *strncpy(s, ct, n)` | confusing, do not use (may not null-terminate?).
+`char *strcat(s, ct)` |
+`char *strncat(s, ct, n)` | confusing, do not use (may not null-terminate?).
+`int strcmp(cs, ct)` |
+`int strncmp(cs, ct, n)` |
+`char *strchr(cs, c)` |
+`char *strrchr(cs, c)` |
+`size_t strspn(cs, ct)` |
+`size_t strcspn(cs, ct)` |
+`char *strpbrk(cs, ct)` |
+`char *strstr(cs, ct)` |
+`size_t strlen(cs)` | returns the length of a string, not including the terminating `'\0'`.
+`char *strerror(n)` | returns an implementation-dependent error message about error n.
+`char *strtok(s, ct)` | confusing, do not use.
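+
+Since `strcpy` and friends are marked "do not use", here is a sketch of a
+bounded alternative following this project's convention of tracking lengths
+explicitly (the helper `copy_string` is hypothetical, not project code):
+
+```c
+#include <assert.h>
+#include <string.h>
+
+/* Copy at most buf_size - 1 bytes of src into buf and always
+ * null-terminate. Returns the number of bytes copied. */
+static size_t copy_string(char *buf, size_t buf_size,
+                          const char *src, size_t src_len)
+{
+    size_t n;
+    assert(buf != NULL && src != NULL && buf_size > 0);
+    n = (src_len < buf_size - 1) ? src_len : buf_size - 1;
+    memcpy(buf, src, n);
+    buf[n] = '\0';
+    return n;
+}
+```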
+
+### Raw Memory Functions
+
+The arguments `s` and `t` are of type `void *`; the arguments `cs` and `ct` are
+of type `const void *`; the argument `n` is of type `size_t`; and the argument
+`c` is of type `int` representable as an `unsigned char`.
+
+Memory Function | Description
+:---------------------------|:--------------------------------------------------
+`void *memcpy(s, ct, n)` | copy `n` characters from `ct` to `s`. Return `s`.
+`void *memmove(s, ct, n)` | same as `memcpy`, but `s` and `ct` may overlap. Return `s`.
+`int memcmp(cs, ct, n)` | compare `cs` and `ct`. Return a negative value if `cs < ct`; 0 if `cs == ct`; and a positive value if `cs > ct`.
+`void *memchr(cs, c, n)` | return a pointer to the first occurrence of `c` in `cs`, or `NULL` if `c` is not in the first `n` characters of `cs`.
+`void *memset(s, c, n)` | place character `c` into the first `n` characters of `s`. Return `s`.
+
+## Mathematical Functions (`<math.h>`)
+
+### Important Macros
+
+* `HUGE_VAL`: a positive `double` value.
+* `EDOM`  : from `<errno.h>`. A _domain error_ has occurred (i.e. the argument
+            is outside the bounds of the allowable domain for the function).
+            Upon a _domain error_, `errno` is set to `EDOM`; the return value
+            of the function is implementation-dependent.
+* `ERANGE`: from `<errno.h>`. A _range error_ has occurred (i.e. the result
+            is outside the bounds of the representable range of a `double`).
+            Upon overflow, the function returns `HUGE_VAL` with the correct
+            sign and `errno` is set to `ERANGE`; upon underflow, the function
+            returns 0, and whether `errno` is set to `ERANGE` is
+            implementation-dependent.
+
+### Error Checking
+
+Error checking should be done as specified by the common idiom. For example,
+consider `exp(x)`:
+
+```c
+double r = 0.0;
+
+if (errno != 0) {
+    return errno;
+}
+r = exp(x);
+/* Check for an out-of-domain error before we check that the return is in the
+correct range, because the return is implementation-dependent upon a domain
+error, so we cannot guarantee that it will be in the valid range. */
+if (errno == EDOM) {
+    return EDOM;
+}
+
+assert(r >= 0.0 && "illegal value for exponent!");
+assert(r != -HUGE_VAL && "-HUGE_VAL is an illegal value!");
+/* Overflow. */
+if (r == HUGE_VAL && errno == ERANGE) {
+    return ERANGE;
+} else if (r == 0.0 && errno == ERANGE) {
+    /* Underflow. */
+    return ERANGE;
+} else {
+    /* ok */
+}
+```
+
+### Functions
+
+The arguments `x` and `y` are of type `double`; the argument `n` is of type
+`int`. All functions return a `double`.
+
+Function | Description
+:-----------------------|:------------------------------------------------------
+`sin(x)` | sine of `x`.
+`cos(x)` | cosine of `x`.
+`tan(x)` | tangent of `x`.
+`asin(x)` | arcsine of `x` in range [`-\pi/2`, `\pi/2`], `x \in [-1, 1]`.
+`acos(x)` | arccosine of `x` in range [`0`, `\pi`], `x \in [-1, 1]`.
+`atan(x)` | arctangent of `x` in range [`-\pi/2`, `\pi/2`].
+`atan2(y, x)` | arctangent of `y/x` in range [`-\pi`, `\pi`].
+`sinh(x)` | hyperbolic sine of `x`.
+`cosh(x)` | hyperbolic cosine of `x`.
+`tanh(x)` | hyperbolic tangent of `x`.
+`exp(x)` | natural exponential function `e^x`.
+`log(x)` | natural logarithm `ln(x)`, `x > 0`.
+`log10(x)` | base-10 logarithm `log_10(x)`, `x > 0`.
+`pow(x, y)` | `x^y`. A domain error occurs if `x = 0` and `y <= 0`, or if `x < 0` and `y` is not an integer.
+`sqrt(x)` | square root of `x`, `x >= 0`.
+`ceil(x)` | returns the smallest integer no less than `x`, as a `double`.
+`floor(x)` | returns the largest integer no greater than `x`, as a `double`.
+`fabs(x)` | absolute value of `x`.
+`ldexp(x, n)` | `x * 2^n`.
+`frexp(x, int *exp)` | splits `x` into a normalized fraction in [1/2, 1), which is returned, and a power of 2, which is stored in `*exp`.
+`modf(x, double *ip)` | splits `x` into the integer and fractional parts.
+`fmod(x, y)` | floating-point remainder of `x/y`, with the same sign as `x`.
+
+## Utility Functions (`<stdlib.h>`)
+
+Function | Description
+:---------------------------------------------------------------|:--------------
+`double atof(const char *s)` | equivalent to `strtod(s, (char **)NULL)`.
+`int atoi(const char *s)` | equivalent to `(int)strtol(s, (char **)NULL, 10)`.
+`long atol(const char *s)` | equivalent to `strtol(s, (char **)NULL, 10)`.
+`double strtod(const char *s, char **endp)` | return `double` with unprocessed characters starting at `endp`.
+`long strtol(const char *s, char **endp, int base)` | return `long` with unprocessed characters starting at `endp`. A `base` from 2 to 36 uses 0, 1, ..., 9, a/A, b/B, ..., z/Z. A base of 0 means the function will parse the value as a decimal, octal (starting with `0...`), or hexadecimal (starting with `0x...`).
+`unsigned long strtoul(const char *s, char **endp, int base)` | the `unsigned long` version of `strtol`.
+
+### Pseudo-Random Numbers
+Function | Description
+:-------------------------------|:----------------------------------------------
+`int rand(void)` | returns a pseudo-random integer in the range 0 to `RAND_MAX`, which is at least 32767.
+`void srand(unsigned int seed)` | sets the seed for the pseudo-random generator.
+
+### Memory Allocation
+Function | Description
+:---------------------------------------|:--------------------------------------
+`void *calloc(size_t nobj, size_t size)`| allocates zero-initialized space for an array of `nobj` objects of the given `size`; returns `NULL` upon failure.
+`void *malloc(size_t size)` | allocates uninitialized space of the given `size`; returns `NULL` upon failure.
+`void *realloc(void *p, size_t size)` | resizes the object at `p` to `size`, preserving contents up to the smaller of the old and new sizes; returns `NULL` upon failure.
+`void free(void *p)` | deallocates space previously obtained from `calloc`, `malloc`, or `realloc`.
+
+### Exiting
+Function | Description
+:-------------------------------|:----------------------------------------------
+`void abort(void)` |
+`void exit(int status)` |
+`int atexit(void (*fcn)(void))` | run `fcn` upon exit (in reverse order of registration).
+
+### System Interactions
+Function | Description
+:-------------------------------|:----------------------------------------------
+`int system(const char *s)` |
+`char *getenv(const char *name)`|
+
+### Array Sorting and Searching
+Function | Description
+:---------------------------------------------------------------|:--------------
+`void *bsearch(const void *key, const void *base, size_t n, size_t size, int (*cmp)(const void *keyval, const void *datum))` |
+`void qsort(void *base, size_t n, size_t size, int (*cmp)(const void *, const void *))` |
+
+### Miscellaneous Math Functions
+Function | Description
+:-----------------------------------|:------------------------------------------
+`int abs(int n)` |
+`long labs(long n)` |
+`div_t div(int num, int denom)` | compute the quotient and remainder of `num/denom`. Returns a `div_t = {.quot = $quotient, .rem = $remainder}`.
+`ldiv_t ldiv(long num, long denom)` |
+
+
+## Diagnostics (`<assert.h>`)
+
+Function | Description
+:-----------------------|:------------------------------------------------------
+`assert(expression)` | tests the expression. Upon failure, prints `Assertion failed: $expression, file $__FILE__, line $__LINE__` to `stderr` and aborts. Assertions are ignored if the macro `NDEBUG` is defined when `<assert.h>` is included.
+
+## Variable Argument Lists (`<stdarg.h>`)
+
+```c
+/* Declare a `va_list` that will point to each successive argument. */
+va_list ap;
+/* Initialize `ap` to point to the first unnamed argument. */
+va_start(va_list ap, $last_argument);
+/* Get the next argument in the list. (Note that `va_arg` itself cannot tell
+when the arguments run out; the caller must convey the count or a sentinel.) */
+$type va_arg(va_list ap, $type);
+/* Clean up the `va_list` when we are done with the arguments and before we
+return from our current function. Returns void. */
+va_end(va_list ap);
+```
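+
+As a usage sketch (`sum_ints` is a hypothetical helper, not project code),
+with the caller passing the argument count explicitly:
+
+```c
+#include <stdarg.h>
+
+/* Sum `count` ints passed as variable arguments.
+ * E.g. sum_ints(3, 10, 20, 30) == 60. */
+static int sum_ints(int count, ...)
+{
+    va_list ap;
+    int i;
+    int total = 0;
+
+    va_start(ap, count);
+    for (i = 0; i < count; ++i) {
+        total += va_arg(ap, int);
+    }
+    va_end(ap);
+    return total;
+}
+```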
+
+## Non-Local Jumps (`<setjmp.h>`)
+
+Non-local jumps are useful for escaping deeply nested functions. That said, I
+will not use them in this project.
+
+Function | Description
+:-----------------------------------|:------------------------------------------
+`int setjmp(jmp_buf env)` | save the state in `env` and return zero the first time we call it. Upon a jump back, this will return a non-zero value. This can only legally occur in an `if`, `switch`, or loop-condition.
+`void longjmp(jmp_buf env, int val)`| restores the state saved by the most recent `setjmp`; `setjmp` then returns `val` (which must be non-zero). For example:
+
+```c
+if (setjmp(env) == 0) {
+    /* execute on direct call */
+} else {
+    /* execute after calling longjmp */
+}
+```
+
+## Signals (`<signal.h>`)
+
+This library deals with handling runtime exceptions. It is confusing, so I will
+not use it for now. I do, however, admit that this may be interesting for a more
+advanced C programmer.
+
+## Date and Time Functions (`<time.h>`)
+
+This library helps with manipulating dates and times. I will not enumerate the
+details because I believe it is a little bit silly. Well, not silly. But a bit
+arbitrary.
+
+## Implementation-Defined Integral Limits (`<limits.h>`)
+
+This library defines a set of macros detailing various limits. Those I provide,
+as in the reference manual, are the minimum allowable magnitudes (note that
+`CHAR_BIT` is at least 8); thus the actual values may be greater.
+
+Macro | Value (2's Complement) | Description
+:-----------|------------------------------:|:----------------------------------
+`CHAR_BIT`  | 8                             | bits in a `char`
+`CHAR_MAX`  | `UCHAR_MAX` or `SCHAR_MAX`    | maximum value of `char`
+`CHAR_MIN`  | `0` or `SCHAR_MIN`            | minimum value of `char`
+`INT_MAX`   | +32767                        | maximum value of `int`
+`INT_MIN`   | -32767 (-32768)               | minimum value of `int`
+`LONG_MAX`  | +2147483647                   | maximum value of `long`
+`LONG_MIN`  | -2147483647 (-2147483648)     | minimum value of `long`
+`SCHAR_MAX` | +127                          | maximum value of `signed char`
+`SCHAR_MIN` | -127 (-128)                   | minimum value of `signed char`
+`SHRT_MAX`  | +32767                        | maximum value of `short`
+`SHRT_MIN`  | -32767 (-32768)               | minimum value of `short`
+`UCHAR_MAX` | 255                           | maximum value of `unsigned char`
+`UINT_MAX`  | 65535                         | maximum value of `unsigned int`
+`ULONG_MAX` | 4294967295                    | maximum value of `unsigned long`
+`USHRT_MAX` | 65535                         | maximum value of `unsigned short`
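+
+Because only these minimum magnitudes are guaranteed, code that needs a wider
+type should verify its assumption. A sketch of a C89-compatible compile-time
+check (the typedef name is arbitrary; a negative array size forces a compile
+error):
+
+```c
+#include <limits.h>
+
+/* Fails to compile unless int is at least 32 bits. */
+typedef char assert_int_at_least_32_bits[(INT_MAX >= 2147483647L) ? 1 : -1];
+```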
+
+## Implementation-Defined Floating-Point Limits (`<float.h>`)
+
+This library provides macros dealing with floating-point numbers. The following
+are a subset of those available, as in the reference text.
+
+Macro | Value | Description
+:---------------|----------:|:--------------------------------------------------
+FLT_RADIX       | 2         | radix of exponent representation (e.g. 2, 16).
+FLT_ROUNDS      |           | floating-point rounding mode for addition.
+FLT_DIG         | 6         | decimal digits of precision.
+FLT_EPSILON     | 1E-5      | smallest number `x` such that `1.0 + x != 1.0`.
+FLT_MANT_DIG    |           | number of base `FLT_RADIX` digits in mantissa.
+FLT_MAX         | 1E+37     | maximum floating-point number.
+FLT_MAX_EXP     |           | maximum `n` such that `FLT_RADIX^n - 1` is representable.
+FLT_MIN         | 1E-37     | minimum normalized floating-point number.
+FLT_MIN_EXP     |           | minimum `n` such that `10^n` is a normalized number.
+DBL_DIG         | 10        | decimal digits of precision.
+DBL_EPSILON     | 1E-9      | smallest number `x` such that `1.0 + x != 1.0`.
+DBL_MANT_DIG    |           | number of base `FLT_RADIX` digits in mantissa.
+DBL_MAX         | 1E+37     | maximum `double` floating-point number.
+DBL_MAX_EXP     |           | maximum `n` such that `FLT_RADIX^n - 1` is representable.
+DBL_MIN         | 1E-37     | minimum normalized `double` floating-point number.
+DBL_MIN_EXP     |           | minimum `n` such that `10^n` is a normalized number.
diff --git a/examples/fibonacci.lol b/examples/fibonacci.lol
new file mode 100644
index 0000000..53b1983
--- /dev/null
+++ b/examples/fibonacci.lol
@@ -0,0 +1,26 @@
+/* Recursive Fibonacci Sequence */
+module io = import("stdio.h");
+
+function fibonacci(n: i32) -> i32 {
+    if n == 0 or n == 1 {
+        return 1;
+    }
+    return fibonacci(n - 1) + fibonacci(n - 2);
+}
+
+function main() -> i32 {
+    let r0: i32 = fibonacci(0);
+    let r1: i32 = fibonacci(1);
+    let r2: i32 = fibonacci(2);
+    let r3: i32 = fibonacci(3);
+    let r4: i32 = fibonacci(4);
+    let r5: i32 = fibonacci(5);
+    let r6: i32 = fibonacci(6);
+    let r7: i32 = fibonacci(7);
+    let r8: i32 = fibonacci(8);
+    let r9: i32 = fibonacci(9);
+    let r10: i32 = fibonacci(10);
+    /* I do not like copying the printf semantics from C... */
+    io::printf("fibonacci(0..=10) = {%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d}\n", r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10);
+    return 0;
+}
diff --git a/examples/helloworld.lol b/examples/helloworld.lol
new file mode 100644
index 0000000..44e069d
--- /dev/null
+++ b/examples/helloworld.lol
@@ -0,0 +1,7 @@
+/* Basic hello world */
+module io = import("stdio.h");
+
+function main() -> i32 {
+    io::printf("Hello, World!\n");
+    return 0;
+}
diff --git a/examples/invalid/duplicate_function_names.lol b/examples/invalid/duplicate_function_names.lol
new file mode 100644
index 0000000..7d08a23
--- /dev/null
+++ b/examples/invalid/duplicate_function_names.lol
@@ -0,0 +1,13 @@
+function duplicate() -> int32 {
+    return 0;
+}
+
+/* Duplicate function already defined. */
+function duplicate() -> int32 {
+    return 1;
+}
+
+function main() -> int32 {
+    let r: int64 = duplicate();
+    return 0;
+}
\ No newline at end of file
diff --git a/examples/math_ops.lol b/examples/math_ops.lol
new file mode 100644
index 0000000..c0519bb
--- /dev/null
+++ b/examples/math_ops.lol
@@ -0,0 +1,14 @@
+/* Exercise compound arithmetic and operator precedence */
+module io = import("stdio.h");
+
+function math_operation(a: i32, b: i32, c: i32, d: i32) -> i32 {
+    return
+        a + b * c + d +
+        a * (b + c) * d;
+}
+
+function main() -> i32 {
+    let sum: i32 = math_operation(1, 2, 3, 4);
+    io::printf("Sum should be 31: %d\n", sum);
+    return 0;
+}
\ No newline at end of file
diff --git a/examples/nested_if.lol b/examples/nested_if.lol
new file mode 100644
index 0000000..f14a2bc
--- /dev/null
+++ b/examples/nested_if.lol
@@ -0,0 +1,29 @@
+/* A simple program to demonstrate nested if-statements */
+module io = import("stdio.h");
+
+function nested_if(x: i32, y: i32) -> i32 {
+    if x == 0 {
+        if y == 0 {
+            io::printf("Both x and y are zero!\n");
+        } else {
+            io::printf("x is zero but y is %d\n", y);
+        }
+    } else {
+        if y == 0 {
+            io::printf("x is %d but y is zero\n", x);
+        } else {
+            io::printf("x is %d and y is %d\n", x, y);
+        }
+    }
+
+    /* NOTE I cannot return void because my compiler treats it as a regular type */
+    return 0;
+}
+
+function main() -> i32 {
+    nested_if(0, 0);
+    nested_if(0, 1);
+    nested_if(1, 0);
+    nested_if(1, 1);
+    return 0;
+}
diff --git a/examples/sum_three.lol b/examples/sum_three.lol
new file mode 100644
index 0000000..d374c88
--- /dev/null
+++ b/examples/sum_three.lol
@@ -0,0 +1,12 @@
+/* Sum three numbers */
+module io = import("stdio.h");
+
+function sum_three(a: i32, b: i32, c: i32) -> i32 {
+    return a + b + c;
+}
+
+function main() -> i32 {
+    let sum: i32 = sum_three(1, 2, 3);
+    io::printf("Sum should be 6: %d\n", sum);
+    return 0;
+}
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..25b93fc
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,35 @@
+[tool.poetry]
+name = "lolc"
+version = "0.0.0"
+description = "Prototype for a transpiler to C."
+authors = [
+    "David Chu"
+]
+license = "MIT"
+readme = "README.md"
+homepage = "https://github.com/thedavidchu/lolc/"
+repository = "https://github.com/thedavidchu/lolc/"
+documentation = "https://github.com/thedavidchu/lolc/"
+
+# Requirements
+[tool.poetry.dependencies]
+python = "^3.6"
+
+[tool.poetry.dev-dependencies]
+black = { version = "^22.3.0", python = "^3.10" }
+
+[tool.black]
+line-length = 80
+target-version = ['py310']
+exclude = '''
+(
+    /(
+        \.github
+        | \.git
+        | \.venv
+        | \.vscode
+        | _actions
+        | third-party
+    )/
+)
+'''
diff --git a/src/compiler/README.md b/src/compiler/README.md
new file mode 100644
index 0000000..98c8467
--- /dev/null
+++ b/src/compiler/README.md
@@ -0,0 +1,28 @@
+# Light Operational Language
+
+Formerly `Light Object Language`, but then I decided that I wasn't going to use
+OOP patterns.
+
+## Intent
+
+Create a transpiler that can rewrite a modern language to C.
+
+There are many limitations in C89 and C99 that can be worked around by a more
+intelligent preprocessing step. This is it.
+
+## Eventual Features
+
+In no particular order, here are some fun features that I may add.
+
+1. Generics.
+2. Traits/Interfaces.
+3. Lambdas/Closures. (???)
+4. Borrow checker. (???)
+
+## Future Ecosystem
+
+1. VS Code extension with syntax highlighting.
+2. Bootstrap this into its own language. What's nice is that once we write it,
+it will generate C code so we can bootstrap it somewhat continuously (if you
+know what I mean... 
I wasn't very clear). \ No newline at end of file diff --git a/src/compiler/__init__.py b/src/compiler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/compiler/analyzer/__init__.py b/src/compiler/analyzer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/compiler/analyzer/lol_analyzer.py b/src/compiler/analyzer/lol_analyzer.py new file mode 100644 index 0000000..6a26c1f --- /dev/null +++ b/src/compiler/analyzer/lol_analyzer.py @@ -0,0 +1,614 @@ +from typing import Any, Dict, List, Optional, Union + +import compiler.parser.lol_parser as parser_types +from compiler.parser.lol_parser import ( + # Generic + LolParserLiteralType, + + # Generic Expressions + LolParserTypeExpression, + LolParserExpression, + + LolParserModuleLevelStatement, + LolParserFunctionLevelStatement, + + # Specific Expressions + LolParserIdentifier, + LolParserLiteral, + LolParserFunctionCall, + LolParserOperatorExpression, + + LolParserImportStatement, + LolParserVariableDefinition, + LolParserParameterDefinition, + LolParserVariableModification, + LolParserFunctionDefinition, + LolParserReturnStatement, + LolParserIfStatement, +) + +################################################################################ +### LOL ANALYSIS INTERMEDIATE REPRESENTATION +################################################################################ +LolIRExpression = Union["LolIRFunctionCallExpression", "LolIROperatorExpression", "LolIRLiteralExpression", "LolAnalysisVariable"] +LolIRStatement = Union["LolIRDefinitionStatement", "LolIRSetStatement", "LolIRFunctionCallStatement", "LolIRIfStatement", "LolIRReturnStatement"] + + +### Expressions +class LolIRFunctionCallExpression: + def __init__(self, function: "LolAnalysisFunction", arguments: List["LolAnalysisVariable"]): + assert isinstance(function, LolAnalysisFunction) + assert isinstance(arguments, list) + assert all(isinstance(arg, LolAnalysisVariable) for arg in arguments) + self.function = function + self.arguments = arguments + + def __str__(self): + return f"{self.function.name}{tuple(arg.name for arg in self.arguments)}" + + +class LolIROperatorExpression: + def __init__(self, op: str, operands: List["LolAnalysisVariable"]): + assert isinstance(op, str) + assert isinstance(operands, list) + assert all(isinstance(operand, LolAnalysisVariable) for operand in operands) + self.op = op + self.operands: List["LolAnalysisVariable"] = operands + + def __str__(self): + return f"{self.operands[0].name} {self.op} {self.operands[1].name}" + + +class LolIRLiteralExpression: + def __init__(self, literal: Any): + self.literal = literal + + def __str__(self): + return f"{self.literal}" + + +### Statements +class LolIRDefinitionStatement: + def __init__(self, name: str, type: "LolAnalysisDataType", value: LolIRExpression): + assert isinstance(name, str) + # TODO(dchu): This is true for now, but will have to be generalized in + # future to allow different types. 
+        assert isinstance(type, LolAnalysisBuiltinType)
+        self.name: str = name
+        self.type: "LolAnalysisDataType" = type
+        self.value = value
+
+    def __str__(self):
+        return f"let {self.name}: {str(self.type)} = {str(self.value)};"
+
+
+class LolIRSetStatement:
+    def __init__(self, name: str, value: LolIRExpression):
+        self.name = name
+        self.value = value
+
+    def __str__(self):
+        # A set statement mutates an existing variable, so (unlike a
+        # definition) it is not printed with "let".
+        return f"{self.name} = {str(self.value)};"
+
+
+class LolIRFunctionCallStatement:
+    def __init__(self, func_call: LolIRFunctionCallExpression):
+        self.func_call = func_call
+
+    def __str__(self):
+        return f"{str(self.func_call)};"
+
+
+class LolIRIfStatement:
+    def __init__(self, if_cond: "LolAnalysisVariable", if_body: List[LolIRStatement], else_body: List[LolIRStatement]):
+        self.if_cond = if_cond
+        self.if_body = if_body
+        self.else_body = else_body
+
+    def __str__(self):
+        return f"if ({str(self.if_cond)}) {{...}} else {{...}}"
+
+
+class LolIRReturnStatement:
+    def __init__(self, ret_var: "LolAnalysisVariable"):
+        self.ret_var = ret_var
+
+    def __str__(self):
+        # __str__ (not __repr__) for consistency with the other statements.
+        return f"return {str(self.ret_var)};"
+
+
+################################################################################
+### LOL ANALYSIS TYPES
+################################################################################
+LolAnalysisDataType = Union["LolAnalysisBuiltinType"]
+LolAnalysisSymbol = Union[LolAnalysisDataType, "LolAnalysisFunction", "LolAnalysisVariable"]
+
+
+def optional_to_dict(obj: Any):
+    """Return obj.to_dict() if it has that attribute."""
+    if obj is None:
+        return None
+    else:
+        return obj.to_dict()
+
+
+def recursive_to_dict(obj: Optional[Dict[str, LolAnalysisSymbol]]):
+    """Like optional_to_dict(), but calls to_dict() on each value of a dict."""
+    if obj is None:
+        return None
+    else:
+        return {key: val.to_dict() for key, val in obj.items()}
+
+
+def optional_names(obj: Optional[Dict[str, LolAnalysisSymbol]]):
+    """Get the names of a dict of objects with that attribute."""
+    if obj is None:
+        return None
+    else:
+        # assert isinstance(obj, Dict[str, LolAnalysisSymbol])
+        return {key: val.name for key, val in obj.items()}
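+
+
+# A sketch (not part of the compiler proper) of how the IR nodes above
+# compose, using the analysis types defined just below. Printing a
+# statement shows the IR in a readable form:
+#
+#     i32 = LolAnalysisBuiltinType("i32", {})
+#     x = LolAnalysisVariable("x", None, type=i32)
+#     y = LolAnalysisVariable("y", None, type=i32)
+#     stmt = LolIRDefinitionStatement(
+#         "%0", i32, LolIROperatorExpression("+", [x, y])
+#     )
+#     print(stmt)  # let %0: i32 = x + y;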
+
+
+class LolAnalysisBuiltinType:
+    # TODO(dchu): Make the object of the ops into a function so that we can
+    # specify the parameter types and the pointer types.
+    def __init__(self, name: str, ops: Dict[str, "LolAnalysisBuiltinType"]):
+        self.name = name
+        self.ops = ops
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(name={self.name})"
+
+    def to_dict(self):
+        return dict(
+            metatype=self.__class__.__name__,
+            name=self.name,
+            ops={op: dt.name for op, dt in self.ops.items()},
+            id=id(self),
+        )
+
+
+def get_type(
+    type_ast: LolParserTypeExpression,
+    module_symbol_table: Dict[str, LolAnalysisSymbol]
+) -> LolAnalysisDataType:
+    """Get the data type of an AST node."""
+    # TODO: Change this when we support multi-token TypeExpressions
+    assert isinstance(type_ast, LolParserIdentifier)
+    type_name = type_ast.name
+    if type_name not in module_symbol_table:
+        raise ValueError(f"module symbol table should contain name {type_name}")
+    type_symbol: LolAnalysisDataType = module_symbol_table[type_name]
+    # Python 3.10 should support this due to PEP 604
+    # assert isinstance(type_symbol, LolAnalysisDataType)
+    return type_symbol
+
+
+class LolAnalysisVariable:
+    def __init__(
+        self,
+        name: str,
+        ast_definition_node: Optional[Union[LolParserVariableDefinition, LolParserParameterDefinition]],
+        *,
+        type: Optional[LolAnalysisDataType] = None,
+    ):
+        assert isinstance(name, str)
+        assert isinstance(ast_definition_node, (LolParserVariableDefinition, LolParserParameterDefinition)) or ast_definition_node is None
+        self.name = name
+        self.ast_definition_node = ast_definition_node
+
+        self.type: Optional[LolAnalysisDataType] = type
+
+    def __str__(self):
+        return f"{self.name}: {str(self.type)}"
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(name={self.name}, type={str(self.type)})"
+
+    @staticmethod
+    def init_local_variable(
+        name: str,
+        ast_definition_node: Optional[Union[LolParserVariableDefinition, LolParserParameterDefinition]],
+        module_symbol_table: Dict[str, LolAnalysisSymbol]
+    ) -> "LolAnalysisVariable":
+        """This method is to allow initializing a variable without needing to
+        wait to complete the prototype. 
This is just for convenience.""" + r = LolAnalysisVariable(name, ast_definition_node) + r.complete_prototype(module_symbol_table) + return r + + def complete_prototype(self, module_symbol_table: Dict[str, LolAnalysisSymbol]): + assert self.type is None + self.type = get_type(self.ast_definition_node.type, module_symbol_table) + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name, + type=optional_to_dict(self.type), + ) + + +class LolAnalysisFunction: + def __init__( + self, + name: str, + ast_definition_node: Optional[parser_types.LolParserFunctionDefinition], + *, + # Function Prototype + return_types: Optional[LolAnalysisDataType] = None, + parameter_types: Optional[List[LolAnalysisDataType]] = None, + parameter_names: Optional[List[str]] = None, + # Function Body + symbol_table: Optional[Dict[str, LolAnalysisSymbol]] = None, + body: Optional[List[LolIRStatement]] = None, + ): + self.name = name + self.ast_definition_node = ast_definition_node + + self.return_types: Optional[LolAnalysisDataType] = return_types + self.parameter_types: Optional[List[LolAnalysisDataType]] = parameter_types + self.parameter_names: Optional[List[str]] = parameter_names + + self.symbol_table: Optional[Dict[str, LolAnalysisSymbol]] = symbol_table + self.body: Optional[List[LolIRStatement]] = body + + def __str__(self): + parameters = ", ".join( + f'{name}: {str(type_)}' + for name, type_ in + zip(self.parameter_names, self.parameter_types) + ) + return f"function {self.name}({parameters}) -> {str(self.return_types)}" + + def __repr__(self): + parameters = ", ".join( + f'{name}: {str(type_)}' + for name, type_ in zip(self.parameter_names, self.parameter_types) + ) + return f"{self.__class__.__name__}(name={self.name}, parameters=({parameters}), return_type={str(self.return_types)})" + + def complete_prototype(self, module_symbol_table: Dict[str, LolAnalysisSymbol]): + assert self.return_types is None + assert self.parameter_types is None + assert self.parameter_names is None + self.return_types = get_type(self.ast_definition_node.return_type, module_symbol_table) + self.parameter_types = [ + get_type(t.type, module_symbol_table) for t in self.ast_definition_node.parameters + ] + self.parameter_names = [ + t.get_name_as_str() for t in self.ast_definition_node.parameters + ] + + def _get_temporary_variable_name(self) -> str: + # NOTE: this is a complete hack! 
+ if not hasattr(self, "tmp_cnt"): + self.tmp_cnt = 0 + tmp = self.tmp_cnt + self.tmp_cnt += 1 + return f"%{tmp}" + + def _get_symbol(self, module_symbol_table: Dict[str, LolAnalysisSymbol], name: str) -> LolAnalysisSymbol: + split_names = name.split("::") + first_name = split_names[0] + + if first_name in self.symbol_table: + module = self.symbol_table + for name in split_names[:-1]: + module = module[name].module_symbol_table + last_name = split_names[-1] + return module[last_name] + elif first_name in module_symbol_table: + module = module_symbol_table + for name in split_names[:-1]: + module = module[name].module_symbol_table + last_name = split_names[-1] + return module[last_name] + else: + raise ValueError(f"symbol {first_name} not found in either module") + + def _get_operator_return_type( + self, + module_symbol_table: Dict[str, LolAnalysisSymbol], + op_name: str, + operands: List["LolAnalysisVariable"] + ) -> Optional[LolAnalysisDataType]: + first_operand, *_ = operands + hacky_ret_type = self._get_symbol(module_symbol_table, first_operand.name).type + return hacky_ret_type + + def _parse_expression_recursively( + self, + x: LolParserExpression, + module_symbol_table: Dict[str, LolAnalysisSymbol], + *, + body_block: List[LolIRStatement], + ) -> str: + if isinstance(x, LolParserOperatorExpression): + op_name: str = x.operator + operands: List["LolAnalysisVariable"] = [ + self._get_symbol( + module_symbol_table, + self._parse_expression_recursively(y, module_symbol_table, body_block=body_block) + ) + for y in x.operands + ] + ret = self._get_temporary_variable_name() + ret_type = self._get_operator_return_type(module_symbol_table, op_name, operands) + ret_value = LolIROperatorExpression(op_name, operands) + stmt = LolIRDefinitionStatement( + ret, ret_type, ret_value + ) + body_block.append(stmt) + self.symbol_table[ret] = LolAnalysisVariable(ret, None, type=ret_type) + return ret + elif isinstance(x, LolParserLiteral): + if x.type == LolParserLiteralType.INTEGER: + ret = self._get_temporary_variable_name() + ret_type = module_symbol_table["i32"] + stmt = LolIRDefinitionStatement( + ret, ret_type, LolIRLiteralExpression(x.value) + ) + body_block.append(stmt) + self.symbol_table[ret] = LolAnalysisVariable(ret, None, type=ret_type) + return ret + elif x.type == LolParserLiteralType.STRING: + ret = self._get_temporary_variable_name() + ret_type = module_symbol_table["cstr"] + stmt = LolIRDefinitionStatement( + ret, ret_type, LolIRLiteralExpression(x.value) + ) + body_block.append(stmt) + self.symbol_table[ret] = LolAnalysisVariable(ret, None, type=ret_type) + return ret + else: + raise NotImplementedError + elif isinstance(x, LolParserFunctionCall): + func_name: str = x.get_name_as_str() + func: LolAnalysisFunction = self._get_symbol(module_symbol_table, func_name) + assert isinstance(func, LolAnalysisFunction) + args: List["LolAnalysisVariable"] = [ + self._get_symbol( + module_symbol_table, + self._parse_expression_recursively(y, module_symbol_table, body_block=body_block) + ) + for y in x.arguments + ] + ret: str = self._get_temporary_variable_name() + ret_type = func.return_types + stmt = LolIRDefinitionStatement( + ret, ret_type, LolIRFunctionCallExpression(func, args) + ) + body_block.append(stmt) + self.symbol_table[ret] = LolAnalysisVariable(ret, None, type=ret_type) + return ret + elif isinstance(x, LolParserReturnStatement): + ret = self._parse_expression_recursively(x.value, module_symbol_table, body_block=body_block) + stmt = 
LolIRReturnStatement(self._get_symbol(module_symbol_table, ret)) + body_block.append(stmt) + elif isinstance(x, LolParserIfStatement): + if_cond_name = self._parse_expression_recursively(x.if_condition, module_symbol_table, body_block=body_block) + if_cond = self._get_symbol(module_symbol_table, if_cond_name) + if_block = [] + for y in x.if_block: + self._parse_statement(module_symbol_table, y, body_block=if_block) + else_block = [] + for y in x.else_block: + self._parse_statement(module_symbol_table, y, body_block=else_block) + stmt = LolIRIfStatement(if_cond, if_block, else_block) + body_block.append(stmt) + elif isinstance(x, LolParserIdentifier): + return x.name + else: + raise NotImplementedError + + def _parse_statement( + self, + module_symbol_table: Dict[str, LolAnalysisSymbol], + x: LolParserFunctionLevelStatement, + *, + body_block: List[LolIRStatement], + ): + if isinstance(x, LolParserVariableDefinition): + name = x.get_name_as_str() + ast_data_type = x.type + assert isinstance(ast_data_type, LolParserIdentifier) + data_type = self._get_symbol(module_symbol_table, ast_data_type.name) + value = self._parse_expression_recursively(x.value, module_symbol_table, body_block=body_block) + self.symbol_table[name] = LolAnalysisVariable.init_local_variable(name, x, module_symbol_table) + stmt = LolIRDefinitionStatement( + name, data_type, self._get_symbol(module_symbol_table, value) + ) + body_block.append(stmt) + elif isinstance(x, LolParserVariableModification): + # I'm not even sure that the parser supports modification nodes + raise NotImplementedError + else: + _unused_return_variable = self._parse_expression_recursively(x, module_symbol_table, body_block=body_block) + + def complete_body(self, module_symbol_table: Dict[str, LolAnalysisSymbol]): + assert self.symbol_table is None + assert self.body is None + # Add parameters to the symbol table + self.symbol_table = { + t.get_name_as_str(): LolAnalysisVariable.init_local_variable( + t.get_name_as_str(), t, module_symbol_table + ) + for t in self.ast_definition_node.parameters + } + self.body = [] + for statement in self.ast_definition_node.body: + self._parse_statement(module_symbol_table, statement, body_block=self.body) + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name, + return_types=None, + parameter_types=None, + parameter_names=self.parameter_names, + symbol_table=optional_names(self.symbol_table), + body="TODO", + ) + + +class LolAnalysisModule: + def __init__(self, name: str, caller_module: Optional["LolAnalysisModule"] = None): + self.name = name + self.intermediate_repr: List[Any] = [] + self.module_symbol_table: Dict[str, LolAnalysisSymbol] = {} + + self.add_builtin_types(caller_module) + + def add_to_module_symbol_table(self, name, symbol): + if name in self.module_symbol_table: + raise ValueError(f"name {name} already in module symbol table") + self.module_symbol_table[name] = symbol + + def add_builtin_types(self, caller_module: Optional["LolAnalysisModule"]): + if caller_module is None: + i32 = LolAnalysisBuiltinType("i32", {}) + i32.ops["+"] = i32 + i32.ops["-"] = i32 + i32.ops["*"] = i32 + i32.ops["/"] = i32 + cstr = LolAnalysisBuiltinType("cstr", {}) + void = LolAnalysisBuiltinType("void", {}) + else: + # We want all of the built-in objects to be identical objects with + # even the pointers matching (so module_a's i32 is module_b's i32) + i32 = caller_module.module_symbol_table["i32"] + cstr = caller_module.module_symbol_table["cstr"] + void = 
caller_module.module_symbol_table["void"]
+        self.add_to_module_symbol_table("i32", i32)
+        self.add_to_module_symbol_table("cstr", cstr)
+        self.add_to_module_symbol_table("void", void)
+
+    def to_dict(self):
+        # NOTE: This could end up in an infinite loop of recursion if we
+        # have circular imports; however, it is useful to see the verbose
+        # printing of modules, especially leaf modules.
+        return dict(
+            metatype=self.__class__.__name__,
+            name=self.name,
+            module_symbol_table=recursive_to_dict(self.module_symbol_table),
+        )
+
+    ### NAME
+    def _add_function_name(self, ast_definition: LolParserFunctionDefinition):
+        name = ast_definition.get_name_as_str()
+        symbol = LolAnalysisFunction(name, ast_definition)
+        self.add_to_module_symbol_table(name, symbol)
+
+    def _add_variable_name(self, ast_definition: LolParserVariableDefinition):
+        name = ast_definition.get_name_as_str()
+        symbol = LolAnalysisVariable.init_local_variable(name, ast_definition)
+        self.add_to_module_symbol_table(name, symbol)
+
+    # TODO: merge this into the variable!
+    def _add_import_name(self, ast_definition: LolParserImportStatement):
+        alias = ast_definition.get_alias_as_str()
+        library = ast_definition.get_library_name_as_str()
+        if library == "\"stdio.h\"":
+            module = LolAnalysisModule(library, caller_module=self)
+            i32: LolAnalysisBuiltinType = self.module_symbol_table["i32"]
+            cstr: LolAnalysisBuiltinType = self.module_symbol_table["cstr"]
+            printf_func = LolAnalysisFunction(
+                "printf",
+                None,
+                return_types=i32,
+                parameter_types=[cstr],
+                parameter_names=["format"],
+            )
+            module.add_to_module_symbol_table("printf", printf_func)
+        else:
+            raise NotImplementedError("only the stdio.h library is supported!")
+        self.add_to_module_symbol_table(alias, module)
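+
+    # For example, `module io = import("stdio.h");` reaches _add_import_name
+    # with alias `io` and library `"stdio.h"` (note: the quotes are part of
+    # the lexeme). The child LolAnalysisModule shares this module's built-in
+    # types and exposes a single hand-written prototype, roughly:
+    #
+    #     printf: (format: cstr) -> i32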
+
+    def get_module_names(self, ast_nodes: List[LolParserModuleLevelStatement]):
+        """
+        Extract names (only) of function definitions, module definitions, and
+        imports.
+
+        TODO
+        ----
+        1. Add struct/enum/monad
+        """
+        for i, node in enumerate(ast_nodes):
+            if isinstance(node, LolParserFunctionDefinition):
+                self._add_function_name(node)
+            elif isinstance(node, LolParserVariableDefinition):
+                self._add_variable_name(node)
+            elif isinstance(node, LolParserImportStatement):
+                # TODO(dchu) - recursively add members to this submodule!
+                self._add_import_name(node)
+            # TODO(dchu): accept data structures
+            else:
+                # Anything else at module level is unsupported, so it is an
+                # error rather than something we silently ignore.
+                raise ValueError(f"{node} cannot be outside of functions!")
+
+    ### PROTOTYPES
+    def add_function_prototype(self, ast_definition: LolParserFunctionDefinition):
+        name = ast_definition.get_name_as_str()
+        func: LolAnalysisFunction = self.module_symbol_table[name]
+        func.complete_prototype(self.module_symbol_table)
+
+    def add_variable_prototype(self, ast_definition: LolParserVariableDefinition):
+        name = ast_definition.get_name_as_str()
+        var: LolAnalysisVariable = self.module_symbol_table[name]
+        var.complete_prototype(self.module_symbol_table)
+
+    def add_import_prototype(self, ast_definition: LolParserImportStatement):
+        # Intentionally do nothing
+        pass
+
+    def get_module_prototypes(self, ast_nodes: List[LolParserModuleLevelStatement]):
+        """Get function and variable prototypes."""
+        for i, node in enumerate(ast_nodes):
+            if isinstance(node, LolParserFunctionDefinition):
+                self.add_function_prototype(node)
+            elif isinstance(node, LolParserVariableDefinition):
+                self.add_variable_prototype(node)
+            elif isinstance(node, LolParserImportStatement):
+                self.add_import_prototype(node)
+            else:
+                # Anything else at module level is unsupported, so it is an
+                # error rather than something we silently ignore.
+                raise ValueError(f"{node} cannot be outside of functions!")
+
+    ### BODIES
+    def add_function_body(self, ast_definition: LolParserFunctionDefinition):
+        name = ast_definition.get_name_as_str()
+        func: LolAnalysisFunction = self.module_symbol_table[name]
+        func.complete_body(self.module_symbol_table)
+
+    def add_variable_body(self, ast_definition: LolParserVariableDefinition):
+        # Intentionally do nothing
+        pass
+
+    def add_import_body(self, ast_definition: LolParserImportStatement):
+        # Intentionally do nothing
+        pass
+
+    def get_module_bodies(self, ast_nodes: List[LolParserModuleLevelStatement]):
+        for i, node in enumerate(ast_nodes):
+            if isinstance(node, LolParserFunctionDefinition):
+                self.add_function_body(node)
+            elif isinstance(node, LolParserVariableDefinition):
+                self.add_variable_body(node)
+            elif isinstance(node, LolParserImportStatement):
+                self.add_import_body(node)
+            else:
+                # Anything else at module level is unsupported, so it is an
+                # error rather than something we silently ignore.
+                raise ValueError(f"{node} cannot be outside of functions!")
+
+
+def analyze(asts: List[LolParserModuleLevelStatement], raw_text: str) -> LolAnalysisModule:
+    # NOTE: raw_text is currently unused; it is threaded through so that
+    # future error messages can quote the original source.
+    module = LolAnalysisModule("main")
+    module.get_module_names(asts)
+    module.get_module_prototypes(asts)
+    module.get_module_bodies(asts)
+
+    return module
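+
+# The three passes above are separated so that forward references resolve:
+# pass 1 registers every top-level name, pass 2 fills in prototypes against
+# those names, and pass 3 lowers function bodies to IR. A minimal sketch of
+# driving this by hand (assuming `asts` came from the parser):
+#
+#     module = LolAnalysisModule("main")
+#     module.get_module_names(asts)       # pass 1: names only
+#     module.get_module_prototypes(asts)  # pass 2: signatures
+#     module.get_module_bodies(asts)      # pass 3: IR for bodies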
diff --git a/src/compiler/emitter/__init__.py b/src/compiler/emitter/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/compiler/emitter/lol_emitter.py b/src/compiler/emitter/lol_emitter.py
new file mode 100644
index 0000000..7493e92
--- /dev/null
+++ b/src/compiler/emitter/lol_emitter.py
@@ -0,0 +1,118 @@
+"""
+Take the AST and emit C code.
+
+TODO
+----
+
+1. Minimal Viable Product
+2. Correct indentation
+"""
+from typing import List
+
+from compiler.analyzer.lol_analyzer import (
+    LolAnalysisModule, LolAnalysisFunction, LolAnalysisBuiltinType,
+    LolIRReturnStatement, LolIRFunctionCallStatement, LolIRDefinitionStatement,
+    LolIRSetStatement, LolIRIfStatement,
+    LolIRExpression, LolIRStatement,
+    LolIRFunctionCallExpression, LolIROperatorExpression,
+    LolIRLiteralExpression, LolAnalysisVariable
+)
+
+
+lol_to_c_types = {"cstr": "char *", "i32": "int", "void": "void"}
+
+
+def mangle_var_name(var_name: str) -> str:
+    return var_name.replace("%", "LOLvar_")
+
+
+def emit_expr(expr: LolIRExpression) -> str:
+    if isinstance(expr, LolIRFunctionCallExpression):
+        func_name = expr.function.name
+        func_args = [mangle_var_name(arg.name) for arg in expr.arguments]
+        return f"{func_name}({', '.join(func_args)})"
+    elif isinstance(expr, LolIROperatorExpression):
+        if len(expr.operands) == 1:
+            return f"{expr.op}{mangle_var_name(expr.operands[0].name)}"
+        elif len(expr.operands) == 2:
+            if expr.op in {"or", "and"}:
+                expr_op = {"or": "||", "and": "&&"}.get(expr.op)
+            else:
+                expr_op = expr.op
+            return f"{mangle_var_name(expr.operands[0].name)} {expr_op} {mangle_var_name(expr.operands[1].name)}"
+        else:
+            raise ValueError("only 1 or 2 operands accepted!")
+    elif isinstance(expr, LolIRLiteralExpression):
+        literal = expr.literal
+        if isinstance(literal, str):
+            return f"\"{literal}\""
+        elif isinstance(literal, int):
+            return f"{expr.literal}"
+        else:
+            raise NotImplementedError("only string and integer literals are supported")
+    elif isinstance(expr, LolAnalysisVariable):
+        return f"{mangle_var_name(expr.name)}"
+    else:
+        raise ValueError(f"unrecognized expression: {expr}")
+
+
+def emit_statements(
+    ir_statements: List[LolIRStatement],
+    *,
+    indentation: str = "    "
+) -> List[str]:
+    statements: List[str] = []
+    for stmt in ir_statements:
+        if isinstance(stmt, LolIRDefinitionStatement):
+            var_name = mangle_var_name(stmt.name)
+            var_type = lol_to_c_types[stmt.type.name]
+            var_value = emit_expr(stmt.value)
+            statements.append(indentation + f"{var_type} {var_name} = {var_value};")
+        elif isinstance(stmt, LolIRSetStatement):
+            var_name = mangle_var_name(stmt.name)
+            var_value = emit_expr(stmt.value)
+            statements.append(indentation + f"{var_name} = {var_value};")
+        elif isinstance(stmt, LolIRFunctionCallStatement):
+            code = emit_expr(stmt.func_call)
+            statements.append(indentation + f"{code};")
+        elif isinstance(stmt, LolIRReturnStatement):
+            name = mangle_var_name(stmt.ret_var.name)
+            statements.append(indentation + f"return {name};")
+        elif isinstance(stmt, LolIRIfStatement):
+            statements.append(indentation + f"if ({mangle_var_name(stmt.if_cond.name)}) {{")
+            statements.extend(emit_statements(stmt.if_body, indentation=indentation + "    "))
+            statements.append(indentation + "} else {")
+            statements.extend(emit_statements(stmt.else_body, indentation=indentation + "    "))
+            statements.append(indentation + "}")
+        else:
+            raise ValueError("unrecognized statement type")
+    return statements
+
+
+def emit_function(func: LolAnalysisFunction):
+    prototype = (
+        f"{lol_to_c_types[func.return_types.name]}\n"
+        f"{func.name}({', '.join((f'{lol_to_c_types[arg_type.name]} {arg_name}' for arg_type, arg_name in zip(func.parameter_types, func.parameter_names)))})\n"
+    )
+    statements = emit_statements(func.body)
+
+    return prototype + "{\n" + "\n".join(statements) + "\n}\n"
+
+
+def emit_import(include: LolAnalysisModule):
+    return f"#include <{include.name[1:-1]}>"
+
+
+def emit_c(analysis_module: LolAnalysisModule):
+    import_statements = []
+    func_statements = []
+    # Emit modules
+    for name, s in analysis_module.module_symbol_table.items():
+        if isinstance(s, LolAnalysisModule):
+            import_statements.append(emit_import(s))
+        elif isinstance(s, LolAnalysisFunction):
+            func_statements.append(emit_function(s))
+        elif isinstance(s, LolAnalysisBuiltinType):
+            # Obviously, we don't need to define built-in types
+            continue
+        else:
+            raise ValueError("unrecognized symbol type")
+
+    statements = import_statements + func_statements
+    code = "\n".join(statements)
+    return code
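+
+
+# As a rough illustration (not a golden test), a LOL program along the lines
+# of
+#
+#     module io = import("stdio.h");
+#     function main() -> i32 { io::printf("hello"); return 0; }
+#
+# would come out of emit_c() as C resembling:
+#
+#     #include <stdio.h>
+#
+#     int
+#     main()
+#     {
+#         char * LOLvar_0 = "hello";
+#         int LOLvar_1 = printf(LOLvar_0);
+#         int LOLvar_2 = 0;
+#         return LOLvar_2;
+#     }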
diff --git a/src/compiler/lexer/__init__.py b/src/compiler/lexer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/compiler/lexer/lol_lexer.py b/src/compiler/lexer/lol_lexer.py
new file mode 100644
index 0000000..67965ed
--- /dev/null
+++ b/src/compiler/lexer/lol_lexer.py
@@ -0,0 +1,260 @@
+"""
+# Lexer
+
+## Language
+
+This lexer will parse white-space separated tokens. It will output the results
+into a text file, one token per line, in the form:
+
+```
+<lexeme>,<token type>,<start position>,<line number>,<column number>
+```
+
+We separate the fields with commas because CSVs are well formed.
+
+********************************************************************************
+
+The accepted tokens are (ASCII):
+
+1. identifiers : [A-Za-z_][A-Za-z0-9_]*
+    - Keywords: if, else, while, function, return, let, namespace
+2. decimal integers : [1-9][0-9]*
+3. strings : ["](\"|[^"])*["]
+4. parentheses : "(" or ")"
+5. braces : "{" or "}"
+6. brackets : "[" or "]"
+7. dot : "."
+8. comma : ","
+9. equals : "="
+10. colon : ":"
+11. comments : C-style block comments, "/*" ... "*/"
+12. semicolon : ";"
+13. arrow : "->" # N.B. NOT a binary op. This is only used in the context of functions
+
+********************************************************************************
+
+Tokens to accept in the future:
+
+1. More types of numbers (binary, octal, hexadecimal, floats, scientific notation)
+2. Differentiate keywords from identifiers (not really necessary, we can do this later)
+3. Add support for other operators (!, %, ^, &, *, +, -, ==, <, >, <=, >=, ->, |, ~, /, //)
+4. Add escaping for strings ("\n\t\\\v\a\b")
+5. Add support for single quote strings?
+6. Add multiline strings ('''multiline string''')
+7. Add multiline comments
+"""
+from typing import Dict, List
+
+from compiler.lexer.lol_lexer_types import (
+    TokenType, Token, CharacterStream, SYMBOL_CONTROL
+)
+
+
+class Lexer:
+    def __init__(self, src: str):
+        self.stream = CharacterStream(src)
+        self.tokens = []
+
+    @staticmethod
+    def _get_identifier_token_type(identifier: str):
+        if identifier in {
+            "while", "for", "namespace", "break", "continue", "not"
+        }:
+            raise NotImplementedError(
+                f"lexer supports keyword '{identifier}'; no further stage does"
+            )
+        key_words: Dict[str, TokenType] = {
+            "if": TokenType.IF,
+            "else": TokenType.ELSE,
+            "let": TokenType.LET,
+            "while": TokenType.WHILE,
+            "for": TokenType.FOR,
+            "function": TokenType.FUNCTION,
+            "return": TokenType.RETURN,
+            "namespace": TokenType.NAMESPACE,
+            "module": TokenType.MODULE,
+            "import": TokenType.IMPORT,
+            "break": TokenType.BREAK,
+            "continue": TokenType.CONTINUE,
+            "and": TokenType.AND,
+            "or": TokenType.OR,
+            "not": TokenType.NOT,
+        }
+        token_type = key_words.get(identifier, TokenType.IDENTIFIER)
+        return token_type
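+
+    # For example, given the table above:
+    #
+    #     Lexer._get_identifier_token_type("if")     # -> TokenType.IF
+    #     Lexer._get_identifier_token_type("foo")    # -> TokenType.IDENTIFIER
+    #     Lexer._get_identifier_token_type("while")  # raises NotImplementedError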
+
+    @staticmethod
+    def lex_identifier(stream: CharacterStream):
+        # Concatenation to a list is more efficient than to a string, since
+        # strings are immutable.
+        c, pos = stream.get_char(), stream.get_pos()
+        token = []
+        while c is not None and (c.isalnum() or c == "_"):
+            token.append(c)
+            stream.next_char()
+            c = stream.get_char()
+
+        identifier = "".join(token)
+        token_type = Lexer._get_identifier_token_type(identifier)
+        return Token(
+            identifier,
+            token_type,
+            start_position=pos,
+            full_text=stream.get_text()
+        )
+
+    @staticmethod
+    def lex_number(stream: CharacterStream):
+        # NOTE(dchu): for now, we assume that the number is a base-10 integer.
+        c, pos = stream.get_char(), stream.get_pos()
+        current_token_type = TokenType.INTEGER
+        # Concatenation to a list is more efficient than to a string, since
+        # strings are immutable.
+        token = []
+        while c is not None and (c.isdecimal() or c == "."):
+            if c.isdecimal():
+                token.append(c)
+                stream.next_char()
+                c = stream.get_char()
+            elif c == "." and current_token_type == TokenType.INTEGER:
+                # TODO: switch current_token_type to TokenType.FLOAT and keep
+                # consuming digits once floats are supported.
+                raise NotImplementedError("floats not supported yet!")
+            else:
+                raise NotImplementedError
+        return Token("".join(token), current_token_type, start_position=pos, full_text=stream.get_text())
+
+    @staticmethod
+    def lex_string(stream: CharacterStream):
+        c, pos = stream.get_char(), stream.get_pos()
+        # Concatenation to a list is more efficient than to a string, since
+        # strings are immutable.
+        stream.next_char()
+        token = ['"']
+        while True:
+            # TODO(dchu): support escaped quotations
+            c = stream.get_char()
+            if c is None:
+                raise ValueError("unterminated string literal")
+            if c == '"':
+                stream.next_char()
+                break
+            token.append(c)
+            stream.next_char()
+        # Add trailing quote
+        token.append(c)
+        return Token("".join(token), TokenType.STRING, start_position=pos, full_text=stream.get_text())
+
+    @staticmethod
+    def lex_comment(stream: CharacterStream):
+        """Get a comment that is like a C-style comment: /* Comment */. We
+        assume that there is already a '/*' at the front."""
+        pos = stream.get_pos()
+        assert stream.get_char() == "/" and stream.get_char(offset=1) == "*"
+        stream.next_char()
+        stream.next_char()
+        # Concatenation to a list is more efficient than to a string, since
+        # strings are immutable.
+        token = ["/*"]
+        while True:
+            c, n = stream.get_char(), stream.get_char(offset=1)
+            if c is None:
+                raise ValueError("expected terminal '*/' in the comment")
+            if c == "*" and n == "/":
+                stream.next_char()
+                stream.next_char()
+                token.append("*/")
+                break
+            token.append(c)
+            stream.next_char()
+        return Token("".join(token), TokenType.COMMENT, start_position=pos, full_text=stream.get_text())
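+
+    # For example, lexing the stream `/* a comment */ let` from the start
+    # consumes everything through the closing `*/` and yields one COMMENT
+    # token whose lexeme is `/* a comment */`, leaving ` let` in the stream.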
+
+    @staticmethod
+    def _is_punctuation_implemented(token_type: TokenType) -> bool:
+        # TODO(dchu): This is a hack! I should just maintain a list of
+        # unimplemented punctuation token types. The reason I do this is
+        # because it is very clear when inspecting the TokenType definition to
+        # see what is and isn't implemented.
+        if (
+            isinstance(token_type.value, tuple)
+            and len(token_type.value) >= 2
+            and token_type.value[1] in {
+                TokenType.NOT_YET_IMPLEMENTED, TokenType.WONT_BE_IMPLEMENTED
+            }
+        ):
+            return False
+        return True
+
+    @staticmethod
+    def lex_punctuation(stream: CharacterStream):
+        start_pos = stream.get_pos()
+
+        control = SYMBOL_CONTROL
+        lexeme = []
+        while True:
+            c = stream.get_char()
+            if isinstance(control, TokenType):
+                token_type = control
+                break
+            if c is not None and c in control:
+                lexeme.append(c)
+                stream.next_char()
+                control = control[c]
+            elif None in control:
+                token_type = control[None]
+                break
+            else:
+                raise ValueError(
+                    f"cannot append {c} to {''.join(lexeme)} -- potential bug, just separate the symbols"
+                )
+
+        if not Lexer._is_punctuation_implemented(token_type):
+            raise NotImplementedError(f"token_type {token_type.name} not implemented")
+
+        return Token(
+            "".join(lexeme), token_type,
+            start_position=start_pos, full_text=stream.get_text()
+        )
+
+    def tokenize(self):
+        while True:
+            c = self.stream.get_char()
+            if c is None:
+                break
+
+            if c.isspace():
+                self.stream.next_char()
+            elif c.isalpha() or c == "_":
+                token = self.lex_identifier(self.stream)
+                self.tokens.append(token)
+            elif c.isdecimal():
+                token = self.lex_number(self.stream)
+                self.tokens.append(token)
+            elif c == '"':
+                token = self.lex_string(self.stream)
+                self.tokens.append(token)
+            elif c == "/" and self.stream.get_char(offset=1) == "*":
+                _unused_token = self.lex_comment(self.stream)
+                # TODO(dchu): re-enable this once the AST supports comments.
+                # Right now, we skip comments.
+                # self.tokens.append(_unused_token)
+            elif c in SYMBOL_CONTROL:
+                # TODO(dchu): '-' does not necessarily imply a punctuation mark.
+                # It can also be the start of a negative number, e.g. -10.3
+                token = self.lex_punctuation(self.stream)
+                self.tokens.append(token)
+            else:
+                raise ValueError(f"character '{c}' not supported!")
+
+
+def tokenize(text: str) -> List[Token]:
+    t = Lexer(text)
+    t.tokenize()
+    return t.tokens
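+
+
+# A small end-to-end sketch of the public entry point. Tokenizing
+# `let x: i32 = 10;` should yield LET, IDENTIFIER, COLON, IDENTIFIER, EQUAL,
+# INTEGER, and SEMICOLON tokens, in that order:
+#
+#     tokens = tokenize("let x: i32 = 10;")
+#     assert [t.as_str() for t in tokens] == ["let", "x", ":", "i32", "=", "10", ";"]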
diff --git a/src/compiler/lexer/lol_lexer_types.py b/src/compiler/lexer/lol_lexer_types.py
new file mode 100644
index 0000000..2141cbd
--- /dev/null
+++ b/src/compiler/lexer/lol_lexer_types.py
@@ -0,0 +1,249 @@
+from enum import Enum, auto, unique
+from typing import Dict, Tuple, Union, Optional
+
+
+@unique
+class TokenType(Enum):
+    # Just for error checking!
+    IMPLEMENTED = auto()
+    NOT_YET_IMPLEMENTED = auto()
+    WONT_BE_IMPLEMENTED = auto()
+
+    # Parentheses, square brackets, and braces
+    LPAREN = auto()  # (
+    RPAREN = auto()  # )
+    LSQB = auto()  # [
+    RSQB = auto()  # ]
+    LBRACE = auto()  # {
+    RBRACE = auto()  # }
+
+    # Separators
+    DOT = auto()  # .
+    COMMA = auto()  # ,
+    EQUAL = auto()  # =
+    COLON = auto()  # :
+    SEMICOLON = auto()  # ;
+
+    ARROW = auto()  # ->
+
+    # Unimplemented in tokenizer
+    EXCLAMATION = auto(), NOT_YET_IMPLEMENTED  # !
+    AT = auto(), NOT_YET_IMPLEMENTED  # @
+    PERCENT = auto(), NOT_YET_IMPLEMENTED  # %
+    CIRCUMFLEX = auto(), NOT_YET_IMPLEMENTED  # ^
+    AMPERSAND = auto(), NOT_YET_IMPLEMENTED  # &
+    STAR = auto()  # *
+    PLUS = auto()  # +
+    MINUS = auto()  # -
+    SLASH = auto()  # /
+
+    QUESTION = auto(), NOT_YET_IMPLEMENTED  # ?
+    VBAR = auto(), NOT_YET_IMPLEMENTED  # |
+
+    GREATER = auto()  # >
+    LESSER = auto()  # <
+
+    # Doubled characters
+    COLON_COLON = auto()  # ::
+    RSHIFT = auto(), NOT_YET_IMPLEMENTED  # >>
+    LSHIFT = auto(), NOT_YET_IMPLEMENTED  # <<
+    GREATER_EQUAL = auto(), NOT_YET_IMPLEMENTED  # >=
+    LESSER_EQUAL = auto(), NOT_YET_IMPLEMENTED  # <=
+    EQUAL_EQUAL = auto(), NOT_YET_IMPLEMENTED  # ==
+    NOT_EQUAL = auto(), NOT_YET_IMPLEMENTED  # !=
+
+    # Unimplemented in tokenizer (no plan to implement these yet)
+    STAR_STAR = auto(), WONT_BE_IMPLEMENTED  # **
+    PLUS_PLUS = auto(), WONT_BE_IMPLEMENTED  # ++
+    MINUS_MINUS = auto(), WONT_BE_IMPLEMENTED  # --
+    SLASH_SLASH = auto(), WONT_BE_IMPLEMENTED  # //
+
+    # COLON_EQUAL = auto()  # :=
+    # STAR_EQUAL = WONT_BE_IMPLEMENTED  # *=
+    # PLUS_EQUAL = WONT_BE_IMPLEMENTED  # +=
+    # MINUS_EQUAL = WONT_BE_IMPLEMENTED  # -=
+    # SLASH_EQUAL = WONT_BE_IMPLEMENTED  # /=
+    # RSHIFT_EQUAL = WONT_BE_IMPLEMENTED  # >>=
+    # LSHIFT_EQUAL = WONT_BE_IMPLEMENTED  # <<=
+    # PERCENT_EQUAL = WONT_BE_IMPLEMENTED  # %=
+    # CIRCUMFLEX_EQUAL = WONT_BE_IMPLEMENTED  # ^=
+    # AMPERSAND_EQUAL = WONT_BE_IMPLEMENTED  # &=
+    # QUESTION_EQUAL = WONT_BE_IMPLEMENTED  # ?=
+    # VBAR_EQUAL = WONT_BE_IMPLEMENTED  # |=
+    # AT_EQUAL = WONT_BE_IMPLEMENTED  # @=
+    # BSLASH = auto(), WONT_BE_IMPLEMENTED  # \
+
+    # Multicharacter conglomerates
+    IDENTIFIER = auto()  # [A-Za-z_][A-Za-z_0-9]
+    STRING = auto()  # "[^"\n]*"
+    INTEGER = auto()  # [1-9][0-9]*
+    FLOAT = auto()
+    COMMENT = auto()  # [/][*].*[*][/]
+
+    # Keywords
+    IF = auto()
+    ELSE = auto()
+    WHILE = auto()
+    FOR = auto()
+    FUNCTION = auto()
+    RETURN = auto()
+    LET = auto()
+    NAMESPACE = auto()
+    MODULE = auto()
+    IMPORT = auto()
+    BREAK = auto()
+    CONTINUE = auto()
+    AND = auto()
+    OR = auto()
+    NOT = auto()
+
+
+SYMBOL_CONTROL: Dict[Optional[str], Union[Dict, TokenType]] = {
+    "(": {None: TokenType.LPAREN},
+    ")": {None: TokenType.RPAREN},
+    "[": {None: TokenType.LSQB},
+    "]": {None: TokenType.RSQB},
+    "{": {None: TokenType.LBRACE},
+    "}": {None: TokenType.RBRACE},
+    ",": {None: TokenType.COMMA},
+    ".": {None: TokenType.DOT},
+    ";": {None: TokenType.SEMICOLON},
+    "?": {None: TokenType.QUESTION},
+    "|": {None: TokenType.VBAR},
+    "&": {None: TokenType.AMPERSAND},
+    "^": {None: TokenType.CIRCUMFLEX},
+    "@": {None: TokenType.AT},
+    ":": {
+        ":": TokenType.COLON_COLON,
+        None: TokenType.COLON,
+    },
+    "=": {
+        "=": TokenType.EQUAL_EQUAL,
+        None: TokenType.EQUAL,
+    },
+    ">": {
+        ">": TokenType.RSHIFT,
+        "=": TokenType.GREATER_EQUAL,
+        None: TokenType.GREATER,
+    },
+    "<": {
+        "<": TokenType.LSHIFT,
+        "=": TokenType.LESSER_EQUAL,
+        None: TokenType.LESSER,
+    },
+    "!": {
+        "=": TokenType.NOT_EQUAL,
+        None: TokenType.NOT,
+    },
+    "+": {
+        "+": TokenType.PLUS_PLUS,
+        None: TokenType.PLUS,
+    },
+    "*": {
+        "*": TokenType.STAR_STAR,
+        None: TokenType.STAR,
+    },
+    "-": {
+        "-": TokenType.MINUS_MINUS,
+        ">": TokenType.ARROW,
+        None: TokenType.MINUS,
+    },
+    "/": {
+        "/": TokenType.SLASH_SLASH,
+        None: TokenType.SLASH,
+    },
+}
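+
+# SYMBOL_CONTROL is a small character trie: lex_punctuation() walks it one
+# character at a time, and a `None` key marks a valid stopping point. For
+# example, lexing `->` walks SYMBOL_CONTROL["-"], finds `>` in that
+# sub-table, and emits TokenType.ARROW; lexing `-1` finds no `1` key, falls
+# back to the `None` entry, and emits TokenType.MINUS, leaving `1` for the
+# number lexer.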
"""Pretty print the token. This is NOT for serialization, because the + token type should be an integer id so that it's easier to parse.""" + return ( + f"Token(lexeme={repr(self.lexeme)}, token_type={self.get_token_type_as_str()}, start_idx={self.start_position}, full_text?={isinstance(self.full_text, str)})" + ) + + def get_line_and_column_numbers(self) -> Optional[Tuple[int, int]]: + if self.start_position is None or self.full_text is None: + return None + line_no = self.full_text[:self.start_position].count("\n") + 1 + lines = self.full_text.split("\n") + col_no = self.start_position - sum(len(line) for line in lines[:line_no]) + return line_no, col_no + + def to_dict(self) -> Dict[str, Union[TokenType, int, str]]: + """ + Pretty print the serialized token. + + To make this purely functional, we would print the token type ID, + the start position, and the lexeme. Everything else is superfluous.""" + return dict( + metatype=self.__class__.__name__, + lexeme=repr(self.lexeme), + token_type=self.get_token_type_as_str(), + start_position=self.start_position, + ) + + +class CharacterStream: + def __init__(self, text: str): + self.text = text + self.idx = 0 + self.line_number = 1 + self.column_number = 1 + + def get_text_after(self): + return self.text[self.idx:] + + def get_text(self) -> str: + return self.text + + def get_char(self, *, offset: Optional[int] = 0) -> Optional[str]: + """Get the current character or return None""" + if self.idx + offset >= len(self.text): + return None + return self.text[self.idx + offset] + + def next_char(self): + """Advance to the next character or return early if we are at the last character.""" + c = self.get_char() + if c is None: + return + self.idx += 1 + if c == "\n": + self.line_number += 1 + self.column_number = 1 + else: + self.column_number += 1 + + def get_pos(self) -> int: + """Get the current character position in a (absolute_index, line_number, + column_number) tuple""" + return self.idx diff --git a/src/compiler/lol.py b/src/compiler/lol.py new file mode 100644 index 0000000..e1ee4ec --- /dev/null +++ b/src/compiler/lol.py @@ -0,0 +1,145 @@ +import argparse +import json +import os +import time +from typing import Any, Dict, List, Optional + +from compiler.analyzer.lol_analyzer import analyze, LolAnalysisModule +from compiler.emitter.lol_emitter import emit_c +from compiler.lexer.lol_lexer import tokenize +from compiler.lexer.lol_lexer_types import Token +from compiler.parser.lol_parser import parse, LolParserModuleLevelStatement +from compiler.parser.lol_parser_token_stream import TokenStream + + +class LolSymbol: + def __init__(self): + self.type: Any = None + self.definition: Any = None + + def to_dict(self) -> Dict[str, Any]: + return {"type": self.type, } + + +class LolModule: + def __init__( + self, + *, + input_file: str, + output_dir: str, + ): + # Metadata + self.input_file = input_file + self.output_dir = output_dir + prefix, ext = os.path.splitext(os.path.basename(input_file)) + self.output_prefix = prefix + + self.text: str = "" + self.tokens: List[Token] = [] + self.ast: List[LolParserModuleLevelStatement] = [] + self.module: Optional[LolAnalysisModule] = None + self.code: Optional[str] = None + self.output_language: Optional[str] = None + + def read_input_file(self): + with open(self.input_file) as f: + self.text = f.read() + + def setup_output_dir(self): + # Make empty output dir if it doesn't exist + if not os.path.exists(self.output_dir): + os.mkdir(self.output_dir) + + 
############################################################################ + ### LEXER + ############################################################################ + + def run_lexer(self): + assert self.text != "", "LolModule" + assert self.tokens == [] + + self.tokens = tokenize(self.text) + + def save_lexer_output_only(self): + file_name: str = f"{self.output_dir}/{self.output_prefix}-{time.time()}-lexer-output-only.json" + with open(file_name, "w") as f: + json.dump({"lexer-output": [x.to_dict() for x in self.tokens]}, f, indent=4) + + ############################################################################ + ### PARSER + ############################################################################ + + def run_parser(self): + assert self.tokens != [] + + stream = TokenStream(self.tokens, self.text) + self.ast = parse(stream) + + def save_parser_output_only(self): + file_name: str = f"{self.output_dir}/{self.output_prefix}-{time.time()}-parser-output-only.json" + with open(file_name, "w") as f: + json.dump({"parser-output": [x.to_dict() for x in self.ast]}, f, indent=4) + + ############################################################################ + ### ANALYZER + ############################################################################ + + def run_analyzer(self): + self.module = analyze(self.ast, self.text) + + def save_analyzer_output_only(self): + assert isinstance(self.module, LolAnalysisModule) + file_name: str = f"{self.output_dir}/{self.output_prefix}-{time.time()}-analyzer-output-only.json" + with open(file_name, "w") as f: + json.dump({"analyzer-output": {x: y.to_dict() for x, y in self.module.module_symbol_table.items()}}, f, indent=4) + + ############################################################################ + ### EMITTER + ############################################################################ + + def run_emitter(self): + # TODO: Make this in the __init__function + assert self.code is None and self.output_language is None + self.code = emit_c(self.module) + self.output_language = "c" + + def save_emitter_output_only(self): + assert isinstance(self.code, str) and self.output_language == "c" + file_name: str = f"{self.output_dir}/{self.output_prefix}-{time.time()}-emitter-output-only.c" + with open(file_name, "w") as f: + f.write(self.code) + + +def main() -> None: + parser = argparse.ArgumentParser() + # TODO(dchu): make this accept multiple file names or folders. Also accept + # a full configuration file. + parser.add_argument( + "-i", "--input", type=str, required=True, help="Input file name" + ) + parser.add_argument( + "-o", "--output", type=str, default=".", help="Output directory name" + ) + args = parser.parse_args() + + # I explicitly extract the names because otherwise one may be tempted to + # pass the 'args' namespace, which is always confusing. 
+ input_file = args.input + output_dir = args.output + + module = LolModule(input_file=input_file, output_dir=output_dir) + module.read_input_file() + module.setup_output_dir() + + module.run_lexer() + module.save_lexer_output_only() + module.run_parser() + module.save_parser_output_only() + module.run_analyzer() + module.save_analyzer_output_only() + module.run_emitter() + module.save_emitter_output_only() + + +if __name__ == "__main__": + main() diff --git a/src/compiler/parser/__init__.py b/src/compiler/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/compiler/parser/lol_parser.py b/src/compiler/parser/lol_parser.py new file mode 100644 index 0000000..eff1138 --- /dev/null +++ b/src/compiler/parser/lol_parser.py @@ -0,0 +1,656 @@ +""" +# Parser + +## Issues +- [ ] Parenthetic expressions are not necessarily evaluated before others (so if +they have side-effects, we have undefined behaviour) +""" + + +from abc import ABCMeta, abstractmethod +from dataclasses import dataclass +from enum import Enum, auto, unique +from typing import Any, List, Set, Tuple, Union + +from compiler.lexer.lol_lexer_types import Token, TokenType +from compiler.parser.lol_parser_token_stream import TokenStream + +frozen_dataclass = dataclass(frozen=True) + + +################################################################################ +### GENERIC +################################################################################ +class LolParserGeneric(metaclass=ABCMeta): + @abstractmethod + def to_dict(self): + raise NotImplementedError + + +@unique +class LolParserOperatorType(Enum): + UNARY_PREFIX = auto() + UNARY_POSTFIX = auto() + BINARY_INFIX = auto() + + +@unique +class LolParserLiteralType(Enum): + INTEGER = auto() + BOOLEAN = auto() + STRING = auto() + FLOAT = auto() + + +################################################################################ +### EXPRESSIONS, LEAVES, AND AMBIGUOUS (e.g. 
function calls) +################################################################################ +LolParserValueExpression = Union[Any] +LolParserTypeExpression = Union[Any] +LolParserExpression = Union[LolParserValueExpression, LolParserTypeExpression] + + +@frozen_dataclass +class LolParserLiteral(LolParserGeneric): + type: LolParserLiteralType + value: Union[int, bool, float, str] + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + type=self.type.name, + value=self.value, + ) + + +@frozen_dataclass +class LolParserIdentifier(LolParserGeneric): + name: str + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name + ) + + +@frozen_dataclass +class LolParserOperatorExpression(LolParserGeneric): + operator: str + type: LolParserOperatorType + operands: List[LolParserExpression] + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + operator=repr(self.operator), + type=self.type.name, + operands=[o.to_dict() for o in self.operands], + ) + + +@frozen_dataclass +class LolParserParameterDefinition(LolParserGeneric): + name: LolParserIdentifier + type: LolParserTypeExpression + + def get_name_as_str(self) -> str: + return self.name.name + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name.to_dict(), + type=self.type.to_dict(), + ) + + +################################################################################ +### AMBIGUOUS (can be both expression or statement) +################################################################################ +@frozen_dataclass +class LolParserFunctionCall(LolParserGeneric): + name: LolParserIdentifier + arguments: List[LolParserExpression] + + def get_name_as_str(self): + return self.name.name + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name.to_dict(), + arguments=[a.to_dict() for a in self.arguments], + ) + + +@frozen_dataclass +class LolParserVariableDefinition(LolParserGeneric): + name: LolParserIdentifier + type: LolParserTypeExpression + value: LolParserValueExpression + + def get_name_as_str(self) -> str: + return self.name.name + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name.to_dict(), + type=self.type.to_dict(), + value=self.value.to_dict(), + ) + + +################################################################################ +### STATEMENTS +################################################################################ +LolParserModuleLevelStatement = Union[Any] +LolParserFunctionLevelStatement = Union[Any] +LolParserStatement = Union[LolParserModuleLevelStatement, LolParserFunctionLevelStatement] + + +@frozen_dataclass +class LolParserImportStatement(LolParserGeneric): + metatype = "LolParserImportStatement" + alias: LolParserIdentifier + # This must be a string literal + library_name: LolParserLiteral + + def __post_init__(self): + assert self.library_name.type == LolParserLiteralType.STRING + + def get_alias_as_str(self) -> str: + return self.alias.name + + def get_library_name_as_str(self) -> str: + assert self.library_name.type == LolParserLiteralType.STRING + return self.library_name.value + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + alias=self.alias.to_dict(), + library_name=self.library_name.to_dict(), + ) + + +@frozen_dataclass +class LolParserFunctionDefinition(LolParserGeneric): + name: LolParserIdentifier + parameters: List[LolParserParameterDefinition] + return_type: LolParserTypeExpression + body: 
List[LolParserFunctionLevelStatement] + + def get_name_as_str(self) -> str: + return self.name.name + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name.to_dict(), + parameters=[p.to_dict() for p in self.parameters], + return_type=self.return_type.to_dict(), + body=[s.to_dict() for s in self.body], + ) + + +@frozen_dataclass +class LolParserVariableModification(LolParserGeneric): + name: LolParserIdentifier + value: LolParserValueExpression + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + name=self.name.to_dict(), + value=self.value.to_dict(), + ) + + +@frozen_dataclass +class LolParserIfStatement(LolParserGeneric): + if_condition: LolParserValueExpression + # NOTE These may only be inside a function. + if_block: List[LolParserFunctionLevelStatement] + else_block: List[LolParserFunctionLevelStatement] + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + if_condition=self.if_condition.to_dict(), + if_block=[s.to_dict() for s in self.if_block], + else_block=[s.to_dict() for s in self.else_block], + ) + + +@frozen_dataclass +class LolParserLoopStatement(LolParserGeneric): + block: List[LolParserFunctionLevelStatement] + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + block=[s.to_dict() for s in self.block], + ) + + +@frozen_dataclass +class LolParserBreakStatement(LolParserGeneric): + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + ) + + +@frozen_dataclass +class LolParserReturnStatement(LolParserGeneric): + value: LolParserValueExpression + + def to_dict(self): + return dict( + metatype=self.__class__.__name__, + value=self.value.to_dict(), + ) + + +################################################################################ +### PARSER +################################################################################ +LITERAL_TOKENS: Set[TokenType] = {TokenType.INTEGER, TokenType.STRING} + + +def eat_token(stream: TokenStream, expected_type: TokenType) -> Token: + token = stream.get_token() + if token.get_token_type() != expected_type: + error_msg = f"expected {expected_type.name}, got {token.get_token_type_as_str()}" + raise ValueError(error_msg) + stream.next_token() + return token + + +class Parser: + def __init__(self): + self.module_level_statements: List[LolParserModuleLevelStatement] = [] + + @staticmethod + def parse_literal(stream: TokenStream) -> LolParserLiteral: + start_pos = stream.get_pos() + token = stream.get_token() + if token.is_type(TokenType.STRING): + lit_type = LolParserLiteralType.STRING + # Remove the surrounding quotations + lit_value = token.as_str()[1:-1] + elif token.is_type(TokenType.INTEGER): + lit_type = LolParserLiteralType.INTEGER + lit_value = int(token.as_str()) + else: + raise ValueError(f"unexpected token type: {repr(token)}") + stream.next_token() + end_pos = stream.get_pos() + return LolParserLiteral(lit_type, lit_value) + + @staticmethod + def parse_parenthetic_expression(stream: TokenStream) -> LolParserExpression: + eat_token(stream, TokenType.LPAREN) # Eat '(' + ret = Parser.parse_expression(stream) + eat_token(stream, TokenType.RPAREN) # Eat ')' + return ret + + @staticmethod + def parse_func_call_args( + stream: TokenStream, func_identifier: LolParserIdentifier + ) -> LolParserFunctionCall: + eat_token(stream, TokenType.LPAREN) + args: List[LolParserValueExpression] = [] + token = stream.get_token() + # Check if empty set of arguments + if token.is_type(TokenType.RPAREN): + eat_token(stream, 
TokenType.RPAREN)
+            return LolParserFunctionCall(func_identifier, args)
+        # At this point, we have at least one argument (or error)
+        while True:
+            expr = Parser.parse_value_expression(stream)
+            args.append(expr)
+            token = stream.get_token()
+            if token.is_type(TokenType.RPAREN):
+                eat_token(stream, TokenType.RPAREN)
+                break
+            elif token.is_type(TokenType.COMMA):
+                eat_token(stream, TokenType.COMMA)
+                continue
+            else:
+                raise ValueError("Expected COMMA or RPAREN")
+        return LolParserFunctionCall(func_identifier, args)
+
+    @staticmethod
+    def parse_identifier_with_namespace_separator(
+        stream: TokenStream, identifier_leaf: LolParserIdentifier
+    ) -> LolParserIdentifier:
+        namespaces: List[str] = [identifier_leaf.name]
+        while True:
+            next_separator_token = stream.get_token()
+            if next_separator_token.is_type(TokenType.COLON_COLON):
+                eat_token(stream, TokenType.COLON_COLON)
+                identifier_name = eat_token(stream, TokenType.IDENTIFIER).as_str()
+                namespaces.append(identifier_name)
+            else:
+                break
+        hacky_identifier_str = "::".join(namespaces)
+        return LolParserIdentifier(hacky_identifier_str)
+
+    @staticmethod
+    def parse_leading_identifier(
+        stream: TokenStream,
+    ) -> Union[
+        LolParserIdentifier,
+        LolParserFunctionCall,
+        LolParserOperatorExpression,
+    ]:
+        """
+        Parse both variables and function calls.
+
+        This is because, at parse time, we do not know whether the identifier
+        will be a variable name or a function call.
+
+        In fact, this will handle any postfix unary operations. Postfix operators
+        would have to be different than prefix operators, otherwise we would need to
+        add backtracking into the parser. E.g. `x+ + x` would require backtracking.
+        A unique operator, e.g. `x++ + x`, would not.
+
+        In the future, it may handle array accesses too, e.g. `array[100]`.
+        """
+        id_token = eat_token(stream, TokenType.IDENTIFIER)
+        identifier_leaf = LolParserIdentifier(id_token.as_str())
+
+        token = stream.get_token()
+        if token.is_type(TokenType.COLON_COLON):
+            identifier_leaf = Parser.parse_identifier_with_namespace_separator(
+                stream, identifier_leaf
+            )
+        token = stream.get_token()
+        if token.is_type(TokenType.LPAREN):
+            return Parser.parse_func_call_args(stream, identifier_leaf)
+        elif token.is_type(TokenType.LSQB):
+            raise ValueError("accesses not supported yet... i.e. `x[100]`")
+        else:
+            return LolParserIdentifier(identifier_leaf.name)
+
+    @staticmethod
+    def parse_if(stream: TokenStream) -> LolParserIfStatement:
+        eat_token(stream, TokenType.IF)
+        if_cond = Parser.parse_value_expression(stream)
+        if_block = Parser.parse_block_body(stream)
+        token = stream.get_token()
+        else_block = []
+        if token.is_type(TokenType.ELSE):
+            eat_token(stream, TokenType.ELSE)
+            else_block = Parser.parse_block_body(stream)
+        return LolParserIfStatement(if_cond, if_block, else_block)
+
+    @staticmethod
+    def parse_primary(stream: TokenStream) -> LolParserExpression:
+        token = stream.get_token()
+        if token.is_type(TokenType.IDENTIFIER):
+            return Parser.parse_leading_identifier(stream)
+        elif token.get_token_type() in LITERAL_TOKENS:
+            return Parser.parse_literal(stream)
+        elif token.is_type(TokenType.LPAREN):
+            return Parser.parse_parenthetic_expression(stream)
+        else:
+            error_msg = f"unrecognized primary {token}"
+            raise ValueError(error_msg)
+
+    @staticmethod
+    # TODO(dchu): refactor this to make it smarter. Also move the hard-coded
+    # precedence somewhere smarter. 
+ def get_binop_precedence(op: Token) -> int: + """Get the precedence of a binary operator.""" + precedence = { + # The '::' operator should always be on the left of any '.' operators, + # so it has precedence due to left-associativity anyways. + TokenType.COLON_COLON: 1500, # Highest + TokenType.DOT: 1400, + TokenType.ARROW: 1400, + # Prefix operators have precedence of 1300 + TokenType.STAR: 1200, + TokenType.SLASH: 1200, # TODO(dchu): Is this real divide? + TokenType.SLASH_SLASH: 1200, # Not in C + TokenType.PERCENT: 1200, + # TODO(dchu): Is this same semantics as in C? + TokenType.PLUS: 1100, + TokenType.MINUS: 1100, + TokenType.LSHIFT: 1000, + TokenType.RSHIFT: 1000, + TokenType.AMPERSAND: 900, # In C, this is lower than comparison ops + TokenType.CIRCUMFLEX: 800, + # In C, this is lower than comparison ops + TokenType.VBAR: 700, # In C, this is lower than comparison ops + # TokenType.COLON: 600, # Not in C + TokenType.LESSER: 500, + TokenType.LESSER_EQUAL: 500, + TokenType.GREATER: 500, + TokenType.GREATER_EQUAL: 500, + TokenType.EQUAL_EQUAL: 500, + # In C, this is lower than other comparison ops + TokenType.NOT_EQUAL: 500, + # In C, this is lower than other comparison ops + TokenType.AND: 400, + TokenType.OR: 300, + # The '&&'/'and' operator is 400 + # The '||'/'or' operator is 300 + # NOTE(dchu): I remove the ability to parse the '=' and ',' as operators since this would be confusing! + # TokenType.EQUAL: 200, + # TokenType.COMMA: 100, # Weakest + } + return precedence.get(op.get_token_type(), -1) + + @staticmethod + def parse_binop_rhs( + stream: TokenStream, + min_expression_precedence: int, + lhs: LolParserExpression + ) -> LolParserExpression: + """ + Inputs + ------ + + * min_expression_precedence: int - min operator precedence that function is + allowed to eat. + """ + while True: + binop_token = stream.get_token() + binop_token_precedence = Parser.get_binop_precedence(binop_token) + + # Exit if the token has a lower precedence than what we're allowed to + # consume. This could be for a variety of reasons: if we pass an invalid + # binop (which is OK), if the token is None (representing the end of the + # stream), or if it is a binop with too low precedence. + if binop_token_precedence < min_expression_precedence: + return lhs + + stream.next_token() + rhs = Parser.parse_primary(stream) + assert rhs + + # TODO(dchu): I have no idea what this is actually doing. 
I just copied
+            # it from https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.html
+            token = stream.get_token()
+            next_prec = Parser.get_binop_precedence(token)
+            if binop_token_precedence < next_prec:
+                rhs = Parser.parse_binop_rhs(stream, binop_token_precedence + 1, rhs)
+                assert rhs
+
+            lhs = LolParserOperatorExpression(
+                binop_token.lexeme, LolParserOperatorType.BINARY_INFIX, [lhs, rhs]
+            )
+
+    @staticmethod
+    def parse_expression(stream: TokenStream) -> LolParserExpression:
+        """Helper function for parsing identifiers, literals, and parenthetic expressions."""
+        lhs = Parser.parse_primary(stream)
+        assert lhs is not None
+        return Parser.parse_binop_rhs(stream, 0, lhs)
+
+    @staticmethod
+    def parse_type_expression(stream: TokenStream) -> LolParserTypeExpression:
+        # We only support single-token type expressions for now
+        return LolParserIdentifier(
+            eat_token(stream, TokenType.IDENTIFIER).as_str()
+        )
+
+    @staticmethod
+    def parse_value_expression(stream: TokenStream) -> LolParserValueExpression:
+        return Parser.parse_expression(stream)
+
+    ############################################################################
+    ### Functions
+    ############################################################################
+    @staticmethod
+    def parse_parameter_definition(stream: TokenStream) -> LolParserParameterDefinition:
+        start_pos = stream.get_pos()
+        identifier = LolParserIdentifier(eat_token(stream, TokenType.IDENTIFIER).as_str())
+        eat_token(stream, TokenType.COLON)
+        param_type = Parser.parse_type_expression(stream)
+        end_pos = stream.get_pos()
+        return LolParserParameterDefinition(identifier, param_type)
+
+    @staticmethod
+    def parse_function_prototype(stream: TokenStream) -> Tuple[
+        LolParserIdentifier,
+        List[LolParserParameterDefinition],
+        LolParserTypeExpression
+    ]:
+        _function = eat_token(stream, TokenType.FUNCTION)
+        func_identifier = LolParserIdentifier(
+            eat_token(stream, TokenType.IDENTIFIER).as_str()
+        )
+        eat_token(stream, TokenType.LPAREN)
+        params: List[LolParserParameterDefinition] = []
+
+        token = stream.get_token()
+        if token.is_type(TokenType.RPAREN):
+            eat_token(stream, TokenType.RPAREN)
+        else:
+            while True:
+                params.append(Parser.parse_parameter_definition(stream))
+                token = stream.get_token()
+                if token.is_type(TokenType.COMMA):
+                    eat_token(stream, TokenType.COMMA)
+                elif token.is_type(TokenType.RPAREN):
+                    eat_token(stream, TokenType.RPAREN)
+                    break
+                else:
+                    raise ValueError(
+                        f"expected comma or right parenthesis, got {token.as_str()}"
+                    )
+        eat_token(stream, TokenType.ARROW)
+        ret_type = Parser.parse_type_expression(stream)
+        return func_identifier, params, ret_type
+
+    @staticmethod
+    def parse_function_level_statement(stream: TokenStream) -> LolParserFunctionLevelStatement:
+        token = stream.get_token()
+        if token.is_type(TokenType.LET):  # Local variable
+            return Parser.parse_variable_definition(stream)
+        elif token.is_type(TokenType.RETURN):
+            eat_token(stream, TokenType.RETURN)
+            ret_val = Parser.parse_value_expression(stream)
+            eat_token(stream, TokenType.SEMICOLON)
+            return LolParserReturnStatement(ret_val)
+        # TODO(dchu): while and for loops
+        elif token.is_type(TokenType.IF):
+            return Parser.parse_if(stream)
+        else:
+            result = Parser.parse_value_expression(stream)
+            eat_token(stream, TokenType.SEMICOLON)
+            return result
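+
+    # Worked example of the precedence climbing above: parsing `1 + 2 * 3`,
+    # parse_expression() reads `1`, then parse_binop_rhs() sees `+` (1100).
+    # After reading `2`, it peeks at `*` (1200 > 1100) and recurses with a
+    # minimum precedence of 1101, folding `2 * 3` into a single operand
+    # first. The result nests as:
+    #
+    #     (1 + (2 * 3))   # rather than ((1 + 2) * 3)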
+
+    @staticmethod
+    def parse_block_body(stream: TokenStream) -> List[LolParserFunctionLevelStatement]:
+        func_body: List[LolParserFunctionLevelStatement] = []
+        eat_token(stream, TokenType.LBRACE)
+        # Check for the closing brace *before* parsing a statement so that
+        # empty blocks, i.e. `{}`, are accepted.
+        while True:
+            token = stream.get_token()
+            if token.is_type(TokenType.RBRACE):
+                break
+            func_body.append(Parser.parse_function_level_statement(stream))
+        eat_token(stream, TokenType.RBRACE)
+        return func_body
+
+    @staticmethod
+    def parse_function_definition(stream: TokenStream):
+        start_pos = stream.get_pos()
+        func_identifier, params, ret_type = Parser.parse_function_prototype(stream)
+        func_body = Parser.parse_block_body(stream)
+        end_pos = stream.get_pos()
+        return LolParserFunctionDefinition(func_identifier, params, ret_type, func_body)
+
+    ############################################################################
+    ### VARIABLE DEFINITION
+    ############################################################################
+    @staticmethod
+    def parse_variable_definition(
+        stream: TokenStream,
+    ):
+        start_pos = stream.get_pos()
+        _let = eat_token(stream, TokenType.LET)
+        identifier = LolParserIdentifier(eat_token(stream, TokenType.IDENTIFIER).as_str())
+        eat_token(stream, TokenType.COLON)
+        data_type = Parser.parse_type_expression(stream)
+        eat_token(stream, TokenType.EQUAL)
+        value = Parser.parse_value_expression(stream)
+        eat_token(stream, TokenType.SEMICOLON)
+        end_pos = stream.get_pos()
+        return LolParserVariableDefinition(identifier, data_type, value)
+
+    ############################################################################
+    ### IMPORT
+    ############################################################################
+    @staticmethod
+    def parse_import_module(stream: TokenStream) -> LolParserModuleLevelStatement:
+        """
+        Parse import statement outside of a function.
+
+        E.g. `module io = import("stdio.h");`
+        """
+        # TODO(dchu): this is deprecated because eventually we will have
+        # namespaces and let statements all be one thing.
+        start_pos = stream.get_pos()
+        eat_token(stream, TokenType.MODULE)
+        # NOTE: This only allows a single identifier for the module alias
+        alias_name = eat_token(stream, TokenType.IDENTIFIER)
+        eat_token(stream, TokenType.EQUAL)
+        eat_token(stream, TokenType.IMPORT)
+        eat_token(stream, TokenType.LPAREN)
+        library_name = eat_token(stream, TokenType.STRING)
+        eat_token(stream, TokenType.RPAREN)
+        eat_token(stream, TokenType.SEMICOLON)
+        end_pos = stream.get_pos()
+        r = LolParserImportStatement(
+            LolParserIdentifier(alias_name.as_str()),
+            LolParserLiteral(LolParserLiteralType.STRING, library_name.as_str())
+        )
+        return r
+
+    def parse_module_statements(self, stream: TokenStream) -> List[LolParserStatement]:
+        result = []
+        token = stream.get_token()
+        while token is not None:
+            if token.is_type(TokenType.FUNCTION):
+                result.append(Parser.parse_function_definition(stream))
+            elif token.is_type(TokenType.MODULE):
+                result.append(Parser.parse_import_module(stream))
+            elif token.is_type(TokenType.LET):  # Global variable
+                result.append(Parser.parse_variable_definition(stream))
+            else:
+                raise ValueError(f"Unexpected token: {token}")
+            token = stream.get_token()
+        self.module_level_statements = result
+        return result
+
+
+def parse(stream: TokenStream):
+    parser = Parser()
+    return parser.parse_module_statements(stream)
diff --git a/src/compiler/parser/lol_parser_token_stream.py b/src/compiler/parser/lol_parser_token_stream.py
new file mode 100644
index 0000000..1dbd82f
--- /dev/null
+++ b/src/compiler/parser/lol_parser_token_stream.py
@@ -0,0 +1,35 @@
+from typing import List, Optional
+
+from compiler.lexer.lol_lexer import Token
+
+
+class TokenStream:
+    """Semantics taken from CharacterStream."""
+
+    def __init__(self, src: List[Token], text: Optional[str] = None) -> None:
self.text = text + self.src = src + self.idx = 0 + + def get_text(self) -> str: + return self.text + + def get_token(self, *, offset: int = 0) -> Optional[Token]: + """ + Get the current token or return None if at the end. + + N.B. Does NOT advance the token! + """ + if self.idx + offset >= len(self.src): + return None + return self.src[self.idx + offset] + + def next_token(self): + """Advance to the next token.""" + t = self.get_token() + if t is None: + return + self.idx += 1 + + def get_pos(self): + return self.idx diff --git a/test/compiler/test_lolc.py b/test/compiler/test_lolc.py new file mode 100644 index 0000000..042d4ff --- /dev/null +++ b/test/compiler/test_lolc.py @@ -0,0 +1,30 @@ +import os + +from compiler.lol import LolModule + + +def lol_compile(input_file: str, output_dir: str = "results"): + print(f"> Compiling '{input_file}'") + module = LolModule(input_file=input_file, output_dir=output_dir) + module.read_input_file() + module.setup_output_dir() + + module.run_lexer() + module.save_lexer_output_only() + module.run_parser() + module.save_parser_output_only() + module.run_analyzer() + module.save_analyzer_output_only() + module.run_emitter() + module.save_emitter_output_only() + + +def main(): + for x in os.listdir('examples'): + file_name = os.path.join("examples", x) + if os.path.isfile(file_name): + lol_compile(file_name) + + +if __name__ == "__main__": + main()
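+
+
+# A possible pytest adaptation (hypothetical; pytest is not currently a
+# dependency of this project). Each example file would become its own test:
+#
+#     import pytest
+#
+#     @pytest.mark.parametrize(
+#         "file_name",
+#         [os.path.join("examples", x) for x in os.listdir("examples")],
+#     )
+#     def test_compile_example(file_name):
+#         if os.path.isfile(file_name):
+#             lol_compile(file_name)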