diff --git a/.travis.yml b/.travis.yml index 45b78e45..bf8b2cc6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,4 +6,5 @@ before_script: sudo: false cache: cargo script: + - cargo +nightly fmt --all -- --check - cargo test --all diff --git a/README.rst b/README.rst index 5f5e17db..7871b3ad 100644 --- a/README.rst +++ b/README.rst @@ -65,6 +65,12 @@ These instructions were tested on Ubuntu 18.4 (using Bash). It should also work cargo test --all cargo run --bin mango-cli + or to build a fast, release-mode native binary: + +.. code:: bash + + RUSTFLAGS="-C target-cpu=native" cargo build --release + * To deploy the web version in release mode, run the script `dev/build_web.sh` (or view it for the steps needed). It uses Python's SimpleHTTPServer, if you don't have that, you can still find the deployable code in `target/deploy`. * You're now ready to make changes! If you want to help, you're very welcome! Have a glance at CONTRIBUTING.rst_ if you have a minute. diff --git a/dev/hooks/pre-commit b/dev/hooks/pre-commit index a4b1873c..e99e8975 100755 --- a/dev/hooks/pre-commit +++ b/dev/hooks/pre-commit @@ -8,4 +8,4 @@ set -o pipefail util_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils" # Check that the formatting is correct -PYTHONPATH="$util_dir":$PYTHONPATH python3 "$util_dir/run_on_staged.py" 'cargo +nightly fmt --verbose --all -- --write-mode=diff' 'cargo test --all' +PYTHONPATH="$util_dir":$PYTHONPATH python3 "$util_dir/run_on_staged.py" 'cargo +nightly fmt --all -- --check' 'cargo test --all' diff --git a/dev/hooks/utils/run_on_staged.py b/dev/hooks/utils/run_on_staged.py index 036114de..a813b50f 100644 --- a/dev/hooks/utils/run_on_staged.py +++ b/dev/hooks/utils/run_on_staged.py @@ -21,7 +21,7 @@ def do_cmds(cmds): run(cmd, allow_stderr=True, log=True) except Exception as err: stderr.write(str(err)) - stderr.write('FAILED, cancelling commit\n') + stderr.write('\nFAILED, cancelling commit\n') return 1 return 0 diff --git a/dev/playground/src/enumhash.rs b/dev/playground/src/enumhash.rs index e6e46811..c64f240d 100644 --- a/dev/playground/src/enumhash.rs +++ b/dev/playground/src/enumhash.rs @@ -63,15 +63,9 @@ fn get_test_hash(x: &MyEnum) -> u64 { } fn main() { -<<<<<<< Updated upstream let a1: MyEnum = MyEnum::A(Alpha { val: "Hello World".to_owned() }); let a2: MyEnum = MyEnum::A(Alpha { val: "Bye World".to_owned() }); let a3: MyEnum = MyEnum::A(Alpha { val: "Bye World".to_owned() }); -======= - let a1: MyEnum = MyEnum::A(Alpha { val: "Hello World".to_string() }); - let a2: MyEnum = MyEnum::A(Alpha { val: "Bye World".to_string() }); - let a3: MyEnum = MyEnum::A(Alpha { val: "Bye World".to_string() }); ->>>>>>> Stashed changes let b: MyEnum = MyEnum::B(Beta { nr: 8, f: 2 }); let mut m = HashMap::new(); println!("{:?} {:?}", a1.to_text(), b.to_text()); diff --git a/dev/playground/src/hashin.rs b/dev/playground/src/hashing.rs similarity index 53% rename from dev/playground/src/hashin.rs rename to dev/playground/src/hashing.rs index 4acbdbc9..566e70ef 100644 --- a/dev/playground/src/hashin.rs +++ b/dev/playground/src/hashing.rs @@ -29,7 +29,6 @@ impl AnyHasher for H { } } -// TODO: but now I want this not for everything impl MyTrait for T { fn as_any(&self) -> &Any { self as &Any @@ -41,9 +40,6 @@ impl MyTrait for T { } } -//impl MyTrait for A {} -//impl MyTrait for B {} - impl Hash for MyTrait { fn hash(&self, hasher: &mut H) { self.my_hash(hasher) @@ -57,48 +53,3 @@ fn main() { let x: &MyTrait = &A(1); x.hash(&mut hasher); } - - -//trait PreS: Debug {} -// -//trait HasherAsAny { -// 
fn as_any(&self) -> &Any; -//} -// -//trait PostS { -// fn as_any(&self) -> &Any; -// -// fn _hash(&self, hasher: H); -//} -// -//impl HasherAsAny for T { -// fn as_any(&self) -> &Any { -// self as &Any -// } -//} -// -//impl PostS for T { -// fn as_any(&self) -> &Any { -// self as &Any -// } -// -// fn _hash(&self, hasher: H) { -// self.as_any().downcast_ref::().hash(hasher) -// } -//} -// -//impl PreS for A {} -// -//impl PreS for B {} -// -//impl Hash for PostS { -// fn hash(&self, hasher: &mut HasherAsAny) { -// self._hash(hasher.as_any().downcast_ref::()) -// } -//} -// -//fn main() { -// let x: &PostS = &A(1); -// let m = HashMap::new(); -// m.insert(x, 0); -//} diff --git a/rustfmt.toml b/rustfmt.toml index ce4866d5..eaf1d122 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,5 +1,2 @@ -reorder_extern_crates = true -reorder_extern_crates_in_group = true reorder_imports = true -reorder_imports_in_group = true -reorder_imported_names = true +max_width = 140 diff --git a/src/lib.rs b/src/lib.rs index 6d88dc87..1ff780b3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +#![feature(nll)] +//#![feature(generators, generator_trait)] #![feature(proc_macro, wasm_custom_section, wasm_import_module)] extern crate core; extern crate wasm_bindgen; @@ -10,6 +12,7 @@ extern crate derive_new; pub mod mango { // Utilities + pub mod io; pub mod jit; pub mod ui; pub mod util; diff --git a/src/mango/ast_full/node/assignment.rs b/src/mango/ast_full/node/assignment.rs index 558d6d3a..4b060303 100644 --- a/src/mango/ast_full/node/assignment.rs +++ b/src/mango/ast_full/node/assignment.rs @@ -21,11 +21,7 @@ impl AssignmentAST { impl ToText for AssignmentAST { fn to_text(&self) -> String { - return format!( - "{0:} = ({1:})", - self.assignee.to_text(), - self.value.to_text() - ); + return format!("{0:} = ({1:})", self.assignee.to_text(), self.value.to_text()); } } diff --git a/src/mango/ast_full/node/unary_operation.rs b/src/mango/ast_full/node/unary_operation.rs index 751105a3..36fc77b1 100644 --- a/src/mango/ast_full/node/unary_operation.rs +++ b/src/mango/ast_full/node/unary_operation.rs @@ -22,11 +22,7 @@ impl UnaryOperationAST { impl ToText for UnaryOperationAST { fn to_text(&self) -> String { - return format!( - "({0:} {1:})", - self.operator.to_text(), - self.subject.to_text() - ); + return format!("({0:} {1:})", self.operator.to_text(), self.subject.to_text()); } } diff --git a/src/mango/ast_full/terminal/literal.rs b/src/mango/ast_full/terminal/literal.rs index b585461d..74d907e8 100644 --- a/src/mango/ast_full/terminal/literal.rs +++ b/src/mango/ast_full/terminal/literal.rs @@ -31,9 +31,7 @@ pub struct StringLiteralAST { impl FloatLiteralAST { pub fn new(value: f64) -> Self { - FloatLiteralAST { - value: f64eq::new(value), - } + FloatLiteralAST { value: f64eq::new(value) } } } diff --git a/src/mango/io/fortest/mod.rs b/src/mango/io/fortest/mod.rs new file mode 100644 index 00000000..100916ac --- /dev/null +++ b/src/mango/io/fortest/mod.rs @@ -0,0 +1,2 @@ +pub mod stringreader; +pub use self::stringreader::*; diff --git a/src/mango/io/fortest/stringreader.rs b/src/mango/io/fortest/stringreader.rs new file mode 100644 index 00000000..9419fb5a --- /dev/null +++ b/src/mango/io/fortest/stringreader.rs @@ -0,0 +1,61 @@ +use mango::io::typ::Reader; +use mango::io::typ::ReaderResult; +use mango::io::util::REXCACHE; + +/// Implementation of [Reader] that reads from a pre-provided string. +/// Mostly for testing purposes. 
+#[derive(Debug)] +pub struct StringReader { + code: String, + index: usize, +} + +impl StringReader { + pub fn new(code: String) -> Self { + StringReader { code, index: 0 } + } +} + +impl Reader for StringReader { + fn matches(&mut self, subpattern: &str) -> ReaderResult { + // Check for subpattern + REXCACHE.with(|rl| { + let mut rexlib = rl.borrow_mut(); + // Check for end of file + // TODO: is there a better/faster way for this? maybe try this after a match and set a flag? + let regex = rexlib.make_or_get(r"\s*$"); + match regex.find(&self.code[self.index..]) { + Some(mtch) => { + if self.index + mtch.as_str().len() == self.code.len() { + self.index += mtch.as_str().len(); + return ReaderResult::EOF(); + } + } + None => (), + } + // Check for subpattern + let regex = rexlib.make_or_get(subpattern); + return match regex.find(&self.code[self.index..]) { + Some(mtch) => { + self.index += mtch.as_str().len(); + // Remove leading spaces + let mut k = 0; + for (i, byt) in mtch.as_str().chars().enumerate() { + if byt != ' ' { + break; + } + k = i + 1; + } + ReaderResult::Match((&mtch.as_str()[k..]).to_owned()) + } + None => ReaderResult::NoMatch(), + }; + }) + } + + fn get_progress(&self) -> usize { + self.index + } +} + +// TODO: tests (spaces, end) diff --git a/src/mango/io/mod.rs b/src/mango/io/mod.rs new file mode 100644 index 00000000..d9a219cf --- /dev/null +++ b/src/mango/io/mod.rs @@ -0,0 +1,5 @@ +pub mod typ; + +pub mod fortest; + +pub mod util; diff --git a/src/mango/io/typ.rs b/src/mango/io/typ.rs new file mode 100644 index 00000000..6aedf0d5 --- /dev/null +++ b/src/mango/io/typ.rs @@ -0,0 +1,28 @@ +// TODO: I should perhaps separate the splitting that happens here from the actual reading + +use std::fmt::Debug; + +pub enum ReaderResult { + Match(String), + NoMatch(), + EOF(), +} + +/// A reader represents a source 'file', which may be a file, webpage, string, ... +pub trait Reader: Debug { + /// Checks whether the `text` is found starting from the current position. + // fn equals(&mut self, texts: Vec<&str>) -> ReaderResult; + + /// Checks whether the code from the current position matches a regex pattern. + /// + /// This has to eventually return EOF, and keep returning EOF forever after that. + fn matches(&mut self, subpattern: &str) -> ReaderResult; + + /// Return a number that can be used to check whether the state has changed. + /// This need not correspond to a specific position, but should be unique for the progress. + fn get_progress(&self) -> usize; +} + +pub trait Writer { + // TODO +} diff --git a/src/mango/io/util.rs b/src/mango/io/util.rs new file mode 100644 index 00000000..e70c250e --- /dev/null +++ b/src/mango/io/util.rs @@ -0,0 +1,33 @@ +use regex::Regex; +use std::cell::RefCell; +use std::collections::HashMap; + +pub struct RegexCache { + cache: HashMap, +} + +impl RegexCache { + // Not public to prevent having more than one instance. + fn new() -> Self { + RegexCache { cache: HashMap::new() } + } + + pub fn make_or_get(&mut self, subpattern: &str) -> &Regex { + if !self.cache.contains_key(subpattern) { + match Regex::new(&format!(r"^ *{}", subpattern)) { + Err(err) => panic!(format!( + "Invalid regular expression '{}' while adding to library; this is a bug:\n{:?}", + subpattern, err + )), + Ok(regex) => { + self.cache.insert(subpattern.to_owned(), regex); + } + } + } + self.cache.get(subpattern).unwrap() + } +} + +thread_local! 
{
+    pub static REXCACHE: RefCell<RegexCache> = RefCell::new(RegexCache::new())
+}
diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs
new file mode 100644
index 00000000..8134247f
--- /dev/null
+++ b/src/mango/lexing/code_lexer.rs
@@ -0,0 +1,190 @@
+use mango::io::typ::Reader;
+use mango::io::typ::ReaderResult::*;
+use mango::lexing::string_lexer::StringLexer;
+use mango::lexing::typ::SubLexer;
+use mango::lexing::typ::SubLexerResult;
+use mango::token::special::UnlexableToken;
+use mango::token::tokens::literal::LiteralToken;
+use mango::token::tokens::AssociationToken;
+use mango::token::tokens::EndBlockToken;
+use mango::token::tokens::EndStatementToken;
+use mango::token::tokens::IdentifierToken;
+use mango::token::tokens::KeywordToken;
+use mango::token::tokens::OperatorToken;
+use mango::token::tokens::ParenthesisCloseToken;
+use mango::token::tokens::ParenthesisOpenToken;
+use mango::token::tokens::StartBlockToken;
+use mango::token::Tokens;
+use mango::util::strslice::char_ops::CharOps;
+use mango::util::strslice::charsliceto;
+
+pub struct CodeLexer {
+    indent: i32, // -1: finished
+}
+
+// TODO: keep the regexes in thread local global scope storage
+
+impl CodeLexer {
+    pub fn new() -> Self {
+        CodeLexer { indent: 0 }
+    }
+
+    fn lex_indents(&mut self, reader: &mut Box<Reader>) -> Vec<Tokens> {
+        let mut line_indent = 0;
+        while let Match(_) = reader.matches("\\t") {
+            line_indent += 1;
+        }
+        let mut tokens: Vec<Tokens> = Vec::with_capacity(8);
+        if line_indent < self.indent {
+            if let Match(_) = reader.matches(r"end") {
+                // If this is followed by an 'end' keyword, then that 'end' is redundant.
+                tokens.push(Tokens::EndBlock(EndBlockToken::new(true, true)));
+            } else {
+                tokens.push(Tokens::EndBlock(EndBlockToken::new(true, false)));
+            }
+            for _ in line_indent..(self.indent - 1) {
+                // This line is dedented, make end tokens.
+                tokens.push(Tokens::EndBlock(EndBlockToken::new(true, false)));
+            }
+        } else {
+            for _ in self.indent..line_indent {
+                // This line is indented, make start tokens.
+                // TODO: increasing indent by more than one should be a warning
+                tokens.push(Tokens::StartBlock(StartBlockToken::new()));
+            }
+        }
+        self.indent = line_indent;
+        tokens
+    }
+
+    fn token_and_indents(&mut self, reader: &mut Box<Reader>, token: Tokens) -> SubLexerResult {
+        let mut tokens: Vec<Tokens> = vec![token];
+        // This is a new line, so there may be indents.
+        tokens.append(&mut self.lex_indents(reader));
+        return SubLexerResult::Result(tokens);
+    }
+}
+
+impl SubLexer for CodeLexer {
+    fn lex_pass(&mut self, reader: &mut Box<Reader>) -> SubLexerResult {
+        use self::SubLexerResult::*;
+
+        // End of line continuation
+        if let Match(_) = reader.matches(r"\.\.\.") {
+            // Line continuation has no token, it just continues on the next line, ignoring indents (for now).
+            if let Match(_) = reader.matches(r"\n\r?\t*") {
+                // There should always be a newline after continuations, so that they can be ignored together.
+            } else {
+                // The rest of this line is unparsable.
+                if let Match(word) = reader.matches("[^\\n]*\\n\\r?") {
+                    // This is a new line, so there may be indents.
+                    return self.token_and_indents(reader, Tokens::Unlexable(UnlexableToken::new(word)));
+                } else {
+                    // TODO: I don't know yet how to deal with '...' followed by end-of-file
+                    panic!()
+                }
+            }
+        }
+        // Newlines
+        if let Match(_) = reader.matches("\\n\\r?") {
+            // Newline WITHOUT line continuation.
+            // This is a new line, so there may be indents.
+ return self.token_and_indents(reader, Tokens::EndStatement(EndStatementToken::new_end_line())); + } + // End of statement + if let Match(_) = reader.matches(";") { + // Semicolon, which ends a statement. + if let Match(_) = reader.matches("\\n\\r?") { + // If semicolon is followed by a newline, it is redundant. Deal with indents (but ignore the newline itself). + return self.token_and_indents(reader, Tokens::EndStatement(EndStatementToken::new_semicolon())); + } else { + return SubLexerResult::single(Tokens::EndStatement(EndStatementToken::new_semicolon())); + } + } + // + // Indentation done; do the rest of lexing. + // + // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. + if let Match(word) = reader.matches(IdentifierToken::subpattern()) { + // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + return SubLexerResult::single(Tokens::Keyword(keyword)); + } + return SubLexerResult::single(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); + } + // Literal + if let Match(_) = reader.matches("[a-z]?\"") { + return Delegate(Box::new(StringLexer::new_double_quoted())); + } + if let Match(nr) = reader.matches(LiteralToken::subpattern_int()) { + let value = LiteralToken::parse_int(nr); + return SubLexerResult::single(Tokens::Literal(LiteralToken::Int(value))); + } + if let Match(nr) = reader.matches(LiteralToken::subpattern_real()) { + let value = LiteralToken::parse_real(nr); + return SubLexerResult::single(Tokens::Literal(LiteralToken::Real(value))); + } + + // Association (before operator) + if let Match(token) = reader.matches(&AssociationToken::subpattern()) { + debug_assert!(token.chars().last().unwrap() == '='); + if token.char_len() > 1 { + match AssociationToken::from_str(charsliceto(token, -1)) { + Ok(association) => return SubLexerResult::single(Tokens::Association(association)), + Err(msg) => panic!(format!("Invalid association prefix: {}", msg)), + } + } else { + return SubLexerResult::single(Tokens::Association(AssociationToken::from_unprefixed())); + } + } + // Operator (after association) + if let Match(token) = reader.matches(OperatorToken::subpattern()) { + return SubLexerResult::single(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); + } + // Grouping symbols + if let Match(_) = reader.matches(r"\(") { + return SubLexerResult::single(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + } + if let Match(_) = reader.matches(r"\)") { + return SubLexerResult::single(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); + } + + // If the code gets here, it did not recognize the text as any token + return match reader.matches(r"[^\s]+") { + Match(word) => SubLexerResult::single(Tokens::Unlexable(UnlexableToken::new(word))), + NoMatch() => panic!("Do not know how to proceed with parsing"), + EOF() => { + if self.indent < 0 { + return SubLexerResult::End; + } + let mut tokens = vec![Tokens::EndStatement(EndStatementToken::new_end_line())]; + for _ in 0..self.indent { + // This line is dedented, make end tokens. 
+ tokens.push(Tokens::EndBlock(EndBlockToken::new(true, false))); + } + self.indent = -1; + SubLexerResult::Result(tokens) + } + }; + } +} + +#[cfg(test)] +mod tests { + use mango::lexing::util::test_util::assert_text_to_tokens; + use mango::token::tokens::EndStatementToken; + use mango::token::tokens::KeywordToken; + use mango::token::Tokens; + + #[test] + fn test_lexing_individual() { + assert_text_to_tokens( + "if", + vec![ + Tokens::Keyword(KeywordToken::from_str("if".to_owned()).unwrap()), + Tokens::EndStatement(EndStatementToken::new_end_line()), + ], + ); + // todo: more + } +} diff --git a/src/mango/lexing/code_lexer_prev.rs b/src/mango/lexing/code_lexer_prev.rs new file mode 100644 index 00000000..7877a7e6 --- /dev/null +++ b/src/mango/lexing/code_lexer_prev.rs @@ -0,0 +1,304 @@ + +// TODO: dead code, no longer used + +use mango::io::typ::Reader; +use mango::io::typ::ReaderResult::*; +use mango::lexing::string_lexer::StringLexer; +use mango::lexing::typ::Lexer; +use mango::lexing::typ::MaybeToken; +use mango::token::special::UnlexableToken; +use mango::token::tokens::AssociationToken; +use mango::token::tokens::EndBlockToken; +use mango::token::tokens::EndStatementToken; +use mango::token::tokens::IdentifierToken; +use mango::token::tokens::KeywordToken; +use mango::token::tokens::OperatorToken; +use mango::token::tokens::ParenthesisCloseToken; +use mango::token::tokens::ParenthesisOpenToken; +use mango::token::tokens::StartBlockToken; +use mango::token::Tokens; +use mango::util::collection::Queue; +use std::cell::RefCell; +use std::rc::Rc; + +// TODO: Preferably there'd be only one Lexer at a time which has a Reader, but I did not get that to work, +// TODO: see this SO question: https://stackoverflow.com/questions/50535022/borrow-checker-problems-for-parser-that-can-delegate + +enum ReaderOrDelegate { + Reader(), + Delegate(Box), +} + +pub struct CodeLexer { + // reader: Rc>, + indent: i32, + + reader: Rc>, + // This delegate deals with nested structures, like string literals and comments. + reader_or_delegate: ReaderOrDelegate, + // This is unfortunate, would not be needed with 'yield' but is now for indents. + buffer: Queue, +} + +impl CodeLexer { + pub fn new(reader: Rc>) -> Self { + CodeLexer { + reader: reader, + reader_or_delegate: ReaderOrDelegate::Reader(), + indent: 0, + buffer: Queue::new(), + } + } + + fn lex_indents(&mut self) -> MaybeToken { + let mut line_indent = 0; + while let Match(_) = self.reader.borrow_mut().matches("\\t") { + line_indent += 1; + } + for _ in line_indent..self.indent { + // This line is dedented, make end tokens. + // TODO: turn this "new" into a constant + if let Match(_) = self.reader.borrow_mut().matches("end") { + // If this is followed by an 'end' keyword, then that 'end' is redundant. + self.buffer + .push(Tokens::EndBlock(EndBlockToken::new(true, true))); + } else { + self.buffer + .push(Tokens::EndBlock(EndBlockToken::new(true, false))); + } + } + for _ in self.indent..line_indent { + // This line is indented, make start tokens. + self.buffer.push(Tokens::StartBlock(StartBlockToken::new())); + } + self.indent = line_indent; + self.lex() + } +} + +impl Lexer for CodeLexer { + // TODO: TURN THIS AROUND: MAKE A FUNCTION THAT RETURNS FROM A QUEUE, AND CALLS ANOTHER TO FILL THE QUEUE IF NO RETURN + + fn lex(&mut self) -> MaybeToken { + use self::MaybeToken::*; + + // If currently delegating to a sub-lexer, return from that. 
+ match self.reader_or_delegate { + ReaderOrDelegate::Delegate(ref mut delegate) => { + let delegated_token = delegate.lex(); + match delegated_token { + End => { + // Swap back from delegation to direct mode. + // let reader = delegate.get_reader().clone(); + self.reader_or_delegate = ReaderOrDelegate::Reader(); + self.lex() + } + Token(token) => Token(token), + } + // Code to stop delegation cannot be here, because `self` is still mutably borrowed through `delegate` + } + ReaderOrDelegate::Reader() => { + // todo: maybe this branch could be a separate function? + + // If there is a buffer due to indentation or continuations, return from that. + if let Some(token) = self.buffer.pop() { + return Token(token); + } + // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. + let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); + if let Match(_) = continue_match_res { + // Line continuation has no token, it just continues on the next line. + let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = newline_match_res { + // There should always be a newline after continuations, so that they can be ignored together. + } else { + let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); + if let Match(word) = newline_match_res { + self.buffer + .push(Tokens::Unlexable(UnlexableToken::new(word))); + // This is a new line, so there may be indents. + self.lex_indents(); + return self.lex(); + } else { + // TODO: I don't know yet how to deal with '...' followed by end-of-file + panic!() + } + } + } + let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = newline_match_res { + // Newline WITHOUT line continuation. + // This is a new line, so there may be indents. + self.buffer + .push(Tokens::EndStatement(EndStatementToken::new_end_line())); + self.lex_indents(); + return self.lex(); + } + let end_statement_match_res = self.reader.borrow_mut().matches(";"); + if let Match(_) = end_statement_match_res { + // Semicolon, which ends a statement. + // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. + self.buffer + .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); + let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = end_line_match_res { + // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). + // This will return the queue of tokens, including the semicolon. + return self.lex_indents(); + } + // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). + return Token(self.buffer.pop().unwrap()); + } + // + // Indentation done; do the rest of lexing. + // + // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. + if let Match(word) = self + .reader + .borrow_mut() + .matches(IdentifierToken::subpattern()) + { + // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... 
+ if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + return Token(Tokens::Keyword(keyword)); + } + return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); + } + // Literal + let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); + if let Match(_) = string_match_res { + let sublexer: Box = + Box::new(StringLexer::new_double_quoted(self.reader.clone())); + self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); + return self.lex(); + } + // Association (before operator) + let association_match_res = self + .reader + .borrow_mut() + .matches(&AssociationToken::subpattern()); + if let Match(token) = association_match_res { + if token.chars().last().unwrap() == '=' { + // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); + return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO + } else { + return Token(Tokens::Association(AssociationToken::from_unprefixed())); + } + } + // Operator + let operator_match_res = self + .reader + .borrow_mut() + .matches(OperatorToken::subpattern()); + if let Match(token) = operator_match_res { + return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); + } + // Grouping symbols + if let Match(_) = self.reader.borrow_mut().matches(r"\(") { + return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + } + if let Match(_) = self.reader.borrow_mut().matches(r"\)") { + return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); + } + + let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); + match unknown_word { + Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), + NoMatch() => { + println!("END {:?}", self.reader.borrow()); // TODO + panic!("Do not know how to proceed with parsing") + } + EOF() => { + // TODO: also dedent and end statement here + End + } + } + } + } + } + + fn get_reader(&self) -> Rc> { + match self.reader_or_delegate { + ReaderOrDelegate::Reader() => self.reader.clone(), + ReaderOrDelegate::Delegate(ref delegate) => delegate.get_reader(), + } + } +} + +#[cfg(test)] +mod tests { + use super::CodeLexer; + use mango::io::fortest::StringReader; + use mango::lexing::util::lex_all::{lex_all, LexList}; + use mango::token::tokens::AssociationToken; + use mango::token::tokens::EndBlockToken; + use mango::token::tokens::EndStatementToken; + use mango::token::tokens::IdentifierToken; + use mango::token::tokens::KeywordToken; + use mango::token::tokens::LiteralToken; + use mango::token::tokens::OperatorToken; + use mango::token::tokens::ParenthesisCloseToken; + use mango::token::tokens::ParenthesisOpenToken; + use mango::token::tokens::StartBlockToken; + use mango::token::Tokens; + use std::cell::RefCell; + use std::ops::Generator; + use std::rc::Rc; + + fn assert_text_to_tokens(text: &str, tokens: Vec) { + assert_eq!( + LexList::from_tokens(tokens), + lex_all(&mut CodeLexer::new(Rc::new(RefCell::new( + StringReader::new(text.to_owned()) + )))) + ) + } + + #[test] + fn test_lexing_individual() { + assert_text_to_tokens( + "if", + vec![Tokens::Keyword( + KeywordToken::from_str("if".to_owned()).unwrap(), + )], + ); + // todo: more + } + + #[test] + fn test_lexing_combined() { + assert_text_to_tokens( + "let x = 0\nfor x < 128\n\tx += 1", + vec![ + Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_unprefixed()), + Tokens::Literal(LiteralToken::Int(0)), 
Tokens::EndStatement(EndStatementToken::new_end_line()),
+                Tokens::Keyword(KeywordToken::from_str("for".to_owned()).unwrap()),
+                Tokens::Operator(OperatorToken::from_str("<").unwrap()),
+                Tokens::Literal(LiteralToken::Int(128)),
+                Tokens::EndStatement(EndStatementToken::new_end_line()),
+                Tokens::StartBlock(StartBlockToken::new()),
+                Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()),
+                Tokens::Association(AssociationToken::from_str("+".to_owned()).unwrap()),
+                Tokens::Literal(LiteralToken::Int(1)),
+                Tokens::EndBlock(EndBlockToken::new(true, false)),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_lexing_delegation() {}
+
+    #[test]
+    fn generators() {
+        let mut gen = || {
+            yield Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap());
+            yield Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap());
+            yield Tokens::Association(AssociationToken::from_unprefixed());
+            return;
+        };
+        let first = unsafe { gen.resume() };
+    }
+}
diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs
new file mode 100644
index 00000000..09d6a520
--- /dev/null
+++ b/src/mango/lexing/combi_lexer.rs
@@ -0,0 +1,123 @@
+use mango::io::typ::Reader;
+use mango::lexing::code_lexer::CodeLexer;
+use mango::lexing::typ::Lexer;
+use mango::lexing::typ::MaybeToken;
+use mango::lexing::typ::SubLexer;
+use mango::lexing::typ::SubLexerResult;
+use mango::token::Tokens;
+use mango::util::collection::Queue;
+use mango::util::collection::Stack;
+
+pub struct CombiLexer {
+    reader: Box<Reader>,
+    lexers: Stack<Box<SubLexer>>,
+    buffer: Queue<Tokens>,
+}
+
+impl CombiLexer {
+    pub fn new(reader: Box<Reader>) -> Self {
+        let mut lexers: Stack<Box<SubLexer>> = Stack::new();
+        lexers.push(Box::new(CodeLexer::new()));
+        CombiLexer {
+            reader: reader,
+            lexers: lexers,
+            buffer: Queue::new(),
+        }
+    }
+}
+
+impl Lexer for CombiLexer {
+    fn lex(&mut self) -> MaybeToken {
+        // If there are tokens in the buffer, return from there;
+        if let Option::Some(token) = self.buffer.pop() {
+            return MaybeToken::Token(token);
+        }
+
+        match self.lexers.borrow_mut() {
+            // No more lexers to delegate to; lexing is finished.
+            Option::None => MaybeToken::End,
+            Option::Some(ref mut lexer) => {
+                match lexer.lex_pass(&mut self.reader) {
+                    SubLexerResult::Result(tokens) => {
+                        if tokens.len() > 0 {
+                            // The sublexer produced tokens, queue them.
+                            self.buffer.append(tokens);
+                            self.lex() // TODO: if every branch does this, move it down
+                        } else {
+                            // No tokens were produced; make sure the reader has advanced to prevent infinite loops.
+                            // TODO: check reader state
+                            self.lex()
+                        }
+                    }
+                    SubLexerResult::Delegate(lexer) => {
+                        // Switch to a different delegate lexer.
+                        self.lexers.push(lexer);
+                        self.lex()
+                    }
+                    SubLexerResult::End => {
+                        // The sublexer is done, remove it from the stack and continue with the next.
+ self.lexers.pop(); // This needs non-lexical lifetimes + self.lex() + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::CombiLexer; + use mango::io::fortest::StringReader; + use mango::lexing::util::lex_all::{lex_all, LexList}; + use mango::token::tokens::AssociationToken; + use mango::token::tokens::EndBlockToken; + use mango::token::tokens::EndStatementToken; + use mango::token::tokens::IdentifierToken; + use mango::token::tokens::KeywordToken; + use mango::token::tokens::LiteralToken; + use mango::token::tokens::OperatorToken; + use mango::token::tokens::StartBlockToken; + use mango::token::Tokens; + use mango::util::encdec::to_text::ToText; + + fn assert_text_to_tokens(text: &str, tokens: Vec) { + let expected = LexList::from_tokens(tokens); + let actual = lex_all(&mut CombiLexer::new(Box::new(StringReader::new(text.to_owned())))); + assert_eq!( + expected, + actual, + "\nexpected:\n{}\nactual:\n{}", + expected.to_text(), + actual.to_text(), + ); + } + + #[test] + fn test_lexing_combined() { + assert_text_to_tokens( + "let x = 0\nfor x < 128\n\tx += 1", + vec![ + Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_unprefixed()), + Tokens::Literal(LiteralToken::Int(0)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::Keyword(KeywordToken::from_str("for".to_owned()).unwrap()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Operator(OperatorToken::from_str("<").unwrap()), + Tokens::Literal(LiteralToken::Int(128)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::StartBlock(StartBlockToken::new()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_str("+".to_owned()).unwrap()), + Tokens::Literal(LiteralToken::Int(1)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::EndBlock(EndBlockToken::new(true, false)), + ], + ); + } + + #[test] + fn test_lexing_delegation() {} +} diff --git a/src/mango/lexing/comment_lexer.rs b/src/mango/lexing/comment_lexer.rs new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/mango/lexing/comment_lexer.rs @@ -0,0 +1 @@ + diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs new file mode 100644 index 00000000..35d9737c --- /dev/null +++ b/src/mango/lexing/gen_code_lexer.rs @@ -0,0 +1,226 @@ + +// TODO: dead code, no longer used + +use mango::io::typ::Reader; +use mango::io::typ::ReaderResult::*; +use mango::lexing::string_lexer::StringLexer; +use mango::lexing::typ::Lexer; +use mango::lexing::typ::MaybeToken; +use mango::token::special::UnlexableToken; +use mango::token::Tokens; +use mango::token::tokens::AssociationToken; +use mango::token::tokens::EndBlockToken; +use mango::token::tokens::EndStatementToken; +use mango::token::tokens::IdentifierToken; +use mango::token::tokens::KeywordToken; +use mango::token::tokens::LiteralToken; +use mango::token::tokens::OperatorToken; +use mango::token::tokens::ParenthesisCloseToken; +use mango::token::tokens::ParenthesisOpenToken; +use mango::token::tokens::StartBlockToken; +use mango::util::collection::Queue; +use std::cell::RefCell; +use std::ops::{Generator, GeneratorState}; +use std::rc::Rc; +use std::borrow::BorrowMut; +use mango::util::strslice::charsliceto; +use mango::util::strslice::slice::glyphat; + +/// This generator does the real lexing work, but is 
wrapped in a normal +/// class to satisfy an interface that doesn't expose nightly or unsafe features. +//struct GenCodeLexer> { +// generator: G +//} +// +//impl> GenCodeLexer { +// pub fn new() -> Self { +// let mut reader: Rc>; +// GenCodeLexer{ generator: 0 } +// } +//} + +// TODO: this is problematic because the generator wants references to the container, +// TODO: and the container obviously stores the generator + +// TODO: use generator: Box> directory +struct CodeLexer> { + indent: i32, + delegate: Option>, + reader: Rc>, + // TODO: https://stackoverflow.com/questions/50895121/rust-expects-two-levels-of-boxing-for-generator-while-i-only-specified-one + generator: G, +} + +impl CodeLexer>> { + + fn lex_indents(&mut self) -> Vec { + let mut line_indent = 0; + let mut res = Vec::with_capacity(12); + // TODO: I don't need * in MWE but I do here (and other places), can I get rid of it? + while let Match(_) = (*self.reader).borrow_mut().matches("\\t") { + line_indent += 1; + } + for _ in line_indent..self.indent { + // This line is dedented, make end tokens. + // TODO: turn this "new" into a constant + if let Match(_) = (*self.reader).borrow_mut().matches("end") { + // If this is followed by an 'end' keyword, then that 'end' is redundant. + res.push(Tokens::EndBlock(EndBlockToken::new(true, true))); + } else { + res.push(Tokens::EndBlock(EndBlockToken::new(true, false))); + } + } + for _ in self.indent..line_indent { + // This line is indented, make start tokens. + res.push(Tokens::StartBlock(StartBlockToken::new())); + } + self.indent = line_indent; + res + } + + pub fn new(reader: Rc>) -> Box { + let generator: Box + 'static> = Box::new(|| { + loop { + // Delegate to another lexer if one is set. + if let Option::Some(ref mut delegate) = self.delegate { + match delegate.lex() { + MaybeToken::Token(token) => { + yield token; + continue; + } + MaybeToken::End => { + self.delegate = Option::None; + } + } + } + +// // TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes) +// let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); +// if let Match(_) = continue_match_res { +// // Line continuation has no token, it just continues on the next line. +// let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); +// if let Match(_) = newline_match_res { +// // There should always be a newline after continuations, so that they can be ignored together. +// } else { +// // All the text between ... and the end of the line is unlexable. +// let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); +// if let Match(word) = newline_match_res { +// yield Tokens::Unlexable(UnlexableToken::new(word)); +// // This is a new line, so there may be indents. +// // TODO: is there any yield-from like Python? +// for res in self.lex_indents() { +// yield res; +// } +// } else { +// // TODO: I don't know yet how to deal with '...' followed by end-of-file +// panic!() +// } +// } +// // TODO: are continues necessary? it seems more state-independent to restart for each token +// continue; +// } +// let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); +// if let Match(_) = newline_match_res { +// // Newline WITHOUT line continuation. +// // This is a new line, so there may be indents. 
+// yield Tokens::EndStatement(EndStatementToken::new_end_line()); +// for res in self.lex_indents() { +// yield res; +// } +// continue; +// } +// let end_statement_match_res = self.reader.borrow_mut().matches(";"); +// if let Match(_) = end_statement_match_res { +// // Semicolon, which ends a statement. +// // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. +// yield Tokens::EndStatement(EndStatementToken::new_semicolon()); +// let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); +// if let Match(_) = end_line_match_res { +// // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). +// // This will return the queue of tokens, including the semicolon. +// for res in self.lex_indents() { +// yield res; +// } +// } +// // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). +// continue; +// } +// +// // +// // Indentation done; do the rest of lexing. +// // +// // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. +// let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern()); +// if let Match(word) = word_match_res { +// // Check if it is a keyword. +// // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... +// if word == "end" { +// yield Tokens::EndBlock(EndBlockToken::new(false, true)); +// } else if let Ok(keyword) = KeywordToken::from_str(word.clone()) { +// yield Tokens::Keyword(keyword); +// } +// yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap()); +// continue; +// } +// // String literal (delegated). +// let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); +// if let Match(_) = string_match_res { +// let sublexer: Box = Box::new(StringLexer::new_double_quoted(self.reader.clone())); +// self.delegate = Option::Some(sublexer); +// continue; +// } +// // Association (before operator). +// let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern()); +// if let Match(token) = association_match_res { +// if glyphat(token, -1) == "=" { +// yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO +// } else { +// yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap()); +// } +// continue; +// } +// // Operator. 
+// let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern()); +// if let Match(token) = operator_match_res { +// yield Tokens::Operator(OperatorToken::from_str(&token).unwrap()); +// continue; +// } +// // Grouping symbols +// if let Match(_) = self.reader.borrow_mut().matches(r"\(") { +// yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new()); +// continue; +// } +// if let Match(_) = self.reader.borrow_mut().matches(r"\)") { +// yield Tokens::ParenthesisClose(ParenthesisCloseToken::new()); +// continue; +// } +// +// +// let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); +// match unknown_word { +// Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)), +// NoMatch() => panic!("Do not know how to proceed with parsing"), +// EOF() => { +// // TODO: also dedent and end statement here +// return +// } +// } + } + + }); + Box::new(CodeLexer { + indent: 0, + reader: reader, + delegate: Option::None, + generator: generator, + }) + } + +// pub fn next(&mut self) -> Option { +// // Hide the unsafe part. +// match unsafe { self.generator.resume() } { +// GeneratorState::Yielded(nr) => Option::Some(nr), +// GeneratorState::Complete(_) => Option::None, +// } +// } +} diff --git a/src/mango/lexing/mod.rs b/src/mango/lexing/mod.rs index 8b137891..9d251b70 100644 --- a/src/mango/lexing/mod.rs +++ b/src/mango/lexing/mod.rs @@ -1 +1,11 @@ +mod typ; +mod combi_lexer; + +mod code_lexer; + +mod comment_lexer; + +mod string_lexer; + +mod util; diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs new file mode 100644 index 00000000..270bced4 --- /dev/null +++ b/src/mango/lexing/string_lexer.rs @@ -0,0 +1,42 @@ +use mango::io::typ::Reader; +use mango::io::typ::ReaderResult::*; +use mango::lexing::typ::SubLexer; +use mango::lexing::typ::SubLexerResult; +use mango::token::tokens::LiteralToken; +use mango::token::Tokens; + +#[allow(dead_code)] // TODO: TMP +pub enum StringType { + SingleQuotedInline, + DoubleQuotedInline, + MultiLine, +} + +/// Lexes a string literal token. +// Starts after the opening quote and expected to consume until closing quote. 
+#[allow(dead_code)] // TODO: TMP
+pub struct StringLexer {
+    typ: StringType,
+}
+
+impl StringLexer {
+    // TODO: support other types of strings
+    pub fn new_double_quoted() -> Self {
+        StringLexer {
+            typ: StringType::DoubleQuotedInline,
+        }
+    }
+}
+
+impl SubLexer for StringLexer {
+    fn lex_pass(&mut self, reader: &mut Box<Reader>) -> SubLexerResult {
+        // TODO: perhaps there's a library that does parsing a string with escape characters
+        // TODO: doesn't handle escaping etc at all now
+        // TODO: this is going to have a problem if `matches` automatically eats whitespace
+        match reader.matches("[^\"\\n]*") {
+            Match(value) => return SubLexerResult::single(Tokens::Literal(LiteralToken::string(value))),
+            NoMatch() => panic!("failed to parse string"), // This can't really go wrong since empty pattern matches
+            EOF() => return SubLexerResult::single(Tokens::Literal(LiteralToken::string("".to_owned()))), // Unclosed string literal, let code parser deal with it
+        }
+    }
+}
diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs
new file mode 100644
index 00000000..6911c479
--- /dev/null
+++ b/src/mango/lexing/typ.rs
@@ -0,0 +1,32 @@
+use mango::io::typ::Reader;
+use mango::token::Tokens;
+
+// TODO: I don't want this to be public outside the crate
+pub enum SubLexerResult {
+    Result(Vec<Tokens>),
+    Delegate(Box<SubLexer>),
+    End,
+}
+
+impl SubLexerResult {
+    pub fn single(token: Tokens) -> Self {
+        SubLexerResult::Result(vec![token])
+    }
+}
+
+// TODO: I don't want this to be public outside the crate
+pub trait SubLexer {
+    /// Does one iteration of a sublexer, which should either delegate or return tokens.
+    /// If an empty vector of tokens is returned, the reader should have advanced (to prevent infinite loops).
+    fn lex_pass(&mut self, reader: &mut Box<Reader>) -> SubLexerResult;
+}
+
+pub enum MaybeToken {
+    Token(Tokens),
+    End,
+}
+
+pub trait Lexer {
+    /// Every call to lex returns a token until the end of the input.
+    fn lex(&mut self) -> MaybeToken;
+}
diff --git a/src/mango/lexing/util/lex_all.rs b/src/mango/lexing/util/lex_all.rs
new file mode 100644
index 00000000..a6aee685
--- /dev/null
+++ b/src/mango/lexing/util/lex_all.rs
@@ -0,0 +1,36 @@
+use mango::lexing::typ::Lexer;
+use mango::lexing::typ::MaybeToken;
+use mango::token::Tokens;
+use mango::util::encdec::ToText;
+
+/// Represents all the lex tokens in a source.
+#[derive(PartialEq, Eq, Debug)]
+pub struct LexList {
+    tokens: Vec<Tokens>,
+}
+
+impl LexList {
+    pub fn from_tokens(tokens: Vec<Tokens>) -> Self {
+        LexList { tokens }
+    }
+
+    #[allow(unused)]
+    pub fn from_reader(lexer: &mut Lexer) -> Self {
+        lex_all(lexer)
+    }
+}
+
+impl ToText for LexList {
+    fn to_text(&self) -> String {
+        self.tokens.iter().map(|token| token.to_text()).collect::<Vec<String>>().join(" ")
+    }
+}
+
+pub fn lex_all(lexer: &mut Lexer) -> LexList {
+    let mut list = Vec::with_capacity(512);
+    while let MaybeToken::Token(token) = lexer.lex() {
+        list.push(token)
+    }
+    list.shrink_to_fit();
+    LexList::from_tokens(list)
+}
diff --git a/src/mango/lexing/util/mod.rs b/src/mango/lexing/util/mod.rs
new file mode 100644
index 00000000..37351b8a
--- /dev/null
+++ b/src/mango/lexing/util/mod.rs
@@ -0,0 +1,3 @@
+pub mod lex_all;
+
+pub mod test_util;
diff --git a/src/mango/lexing/util/test_util.rs b/src/mango/lexing/util/test_util.rs
new file mode 100644
index 00000000..aa5c0ece
--- /dev/null
+++ b/src/mango/lexing/util/test_util.rs
@@ -0,0 +1,19 @@
+use mango::io::fortest::stringreader::StringReader;
+use mango::lexing::combi_lexer::CombiLexer;
+use mango::lexing::util::lex_all::lex_all;
+use mango::lexing::util::lex_all::LexList;
+use mango::token::Tokens;
+use mango::util::encdec::to_text::ToText;
+
+#[allow(dead_code)]
+pub fn assert_text_to_tokens(text: &str, tokens: Vec<Tokens>) {
+    let expected = LexList::from_tokens(tokens);
+    let actual = lex_all(&mut CombiLexer::new(Box::new(StringReader::new(text.to_owned()))));
+    assert_eq!(
+        expected,
+        actual,
+        "\nexpected:\n{}\nactual:\n{}",
+        expected.to_text(),
+        actual.to_text(),
+    );
+}
diff --git a/src/mango/token/collect/all.rs b/src/mango/token/collect/all.rs
index 37cd7853..70576f40 100644
--- a/src/mango/token/collect/all.rs
+++ b/src/mango/token/collect/all.rs
@@ -1,5 +1,6 @@
 use mango::token::special::UnlexableToken;
 use mango::token::tokens::AssociationToken;
+use mango::token::tokens::EndBlockToken;
 use mango::token::tokens::EndStatementToken;
 use mango::token::tokens::IdentifierToken;
 use mango::token::tokens::KeywordToken;
@@ -7,6 +8,7 @@ use mango::token::tokens::LiteralToken;
 use mango::token::tokens::OperatorToken;
 use mango::token::tokens::ParenthesisCloseToken;
 use mango::token::tokens::ParenthesisOpenToken;
+use mango::token::tokens::StartBlockToken;
 use mango::util::encdec::ToText;
 
 /// Collection of all possible tokens.
@@ -21,6 +23,8 @@ pub enum Tokens {
     ParenthesisClose(ParenthesisCloseToken),
     EndStatement(EndStatementToken),
     Unlexable(UnlexableToken),
+    StartBlock(StartBlockToken),
+    EndBlock(EndBlockToken),
 }
 
 impl ToText for Tokens {
@@ -38,6 +42,19 @@ impl ToText for Tokens {
             ParenthesisClose(token) => token.to_text(),
             EndStatement(token) => token.to_text(),
             Unlexable(token) => token.to_text(),
+            StartBlock(token) => token.to_text(),
+            EndBlock(token) => token.to_text(),
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use mango::token::Tokens;
+    use std::mem::size_of;
+
+    #[test]
+    fn test_tokens_size() {
+        assert!(size_of::<Tokens>() <= 40, size_of::<Tokens>());
+    }
+}
diff --git a/src/mango/token/mod.rs b/src/mango/token/mod.rs
index 1d15baa1..94479f40 100644
--- a/src/mango/token/mod.rs
+++ b/src/mango/token/mod.rs
@@ -1,10 +1,10 @@
-mod tokens;
+pub mod tokens;
 pub use self::tokens::*;
 
-mod special;
+pub mod special;
 pub use self::special::*;
 
-mod collect;
+pub mod collect;
 pub use self::collect::Token;
 pub use self::collect::Tokens;
 
diff --git a/src/mango/token/special/mod.rs b/src/mango/token/special/mod.rs
index 7c269fdb..e3ebf8d5 100644
--- a/src/mango/token/special/mod.rs
+++ b/src/mango/token/special/mod.rs
@@ -1,2 +1,2 @@
-mod unlexable;
+pub mod unlexable;
 pub use self::unlexable::UnlexableToken;
diff --git a/src/mango/token/tests.rs b/src/mango/token/tests.rs
index 732d01a3..6b4ce6b8 100644
--- a/src/mango/token/tests.rs
+++ b/src/mango/token/tests.rs
@@ -15,7 +15,7 @@ fn test_tokens_eq() {
         Keyword(KeywordToken::from_str("let").unwrap()),
         Keyword(KeywordToken::from_str("mut").unwrap()),
         Identifier(IdentifierToken::from_name(my_var)),
-        Association(AssociationToken::from_unmutated()),
+        Association(AssociationToken::from_unprefixed()),
         Literal(LiteralToken::int(21)),
         EndStatement(EndStatementToken::new_semicolon()),
         Identifier(IdentifierToken::from_name(my_var)),
diff --git a/src/mango/token/tokens/association.rs b/src/mango/token/tokens/association.rs
index a8c44da2..4f086166 100644
--- a/src/mango/token/tokens/association.rs
+++ b/src/mango/token/tokens/association.rs
@@ -11,10 +11,8 @@ pub struct AssociationToken {
 }
 
 impl AssociationToken {
-    pub fn from_unmutated() -> Self {
-        AssociationToken {
-            symbol: Option::None,
-        }
+    pub fn from_unprefixed() -> Self {
+        AssociationToken { symbol: Option::None }
     }
 
     pub fn from_str<S: Into<String>>(symbol_txt: S) -> Result<AssociationToken, Msg> {
@@ -26,6 +24,10 @@ impl AssociationToken {
             symbol: Option::Some(symbol),
         }
     }
+
+    pub fn subpattern() -> String {
+        format!(r"(?:{})?=", Symbol::subpattern())
+    }
 }
 
 impl ToText for AssociationToken {
diff --git a/src/mango/token/tokens/block.rs b/src/mango/token/tokens/block.rs
new file mode 100644
index 00000000..c538cf2a
--- /dev/null
+++ b/src/mango/token/tokens/block.rs
@@ -0,0 +1,43 @@
+use mango::token::Token;
+use mango::util::encdec::ToText;
+
+/// Start and end of blocks, signalled e.g. by indentation.
+#[derive(Debug, PartialEq, Eq, Hash, Clone)] +pub struct StartBlockToken {} + +#[derive(Debug, PartialEq, Eq, Hash, Clone)] +pub struct EndBlockToken { + is_dedent: bool, + is_end_keyword: bool, +} + +impl StartBlockToken { + pub fn new() -> Self { + StartBlockToken {} + } +} + +impl EndBlockToken { + pub fn new(is_dedent: bool, is_end_keyword: bool) -> Self { + assert!(is_dedent || is_end_keyword); + EndBlockToken { is_dedent, is_end_keyword } + } +} + +impl ToText for StartBlockToken { + // TODO: needs context information to render indents + fn to_text(&self) -> String { + " { ".to_owned() + } +} + +impl ToText for EndBlockToken { + // TODO: needs context information to render indents + fn to_text(&self) -> String { + " } ".to_owned() + } +} + +impl Token for StartBlockToken {} + +impl Token for EndBlockToken {} diff --git a/src/mango/token/tokens/identifier.rs b/src/mango/token/tokens/identifier.rs index c83555d3..2485fe61 100644 --- a/src/mango/token/tokens/identifier.rs +++ b/src/mango/token/tokens/identifier.rs @@ -19,6 +19,10 @@ impl IdentifierToken { pub fn from_name(name: Name) -> Self { IdentifierToken { name } } + + pub fn subpattern() -> &'static str { + Name::subpattern() + } } impl ToText for IdentifierToken { diff --git a/src/mango/token/tokens/literal.rs b/src/mango/token/tokens/literal.rs index a0953882..ba1259b3 100644 --- a/src/mango/token/tokens/literal.rs +++ b/src/mango/token/tokens/literal.rs @@ -1,6 +1,8 @@ use mango::token::Token; use mango::util::encdec::ToText; use mango::util::numtype::f64eq; +use mango::util::parsetxt::int::parse_int; +use mango::util::parsetxt::real::parse_real; // LATER: it is likely that this will be refactored when the type system is in place. @@ -25,6 +27,34 @@ impl LiteralToken { pub fn real(value: f64) -> LiteralToken { LiteralToken::Real(f64eq::new(value)) } + + /// This matches integer literals, either just numbers in base 10, or base 2-36 with prefix. + /// The syntax for -37 in base 16 is -16b25 and 2748 is 16bABC. + /// Incorrect values like 4b7 or 0b0 are not handled at the lexing stage. + pub fn subpattern_int() -> &'static str { + r"(?:\+|-*)(?:[1-9][0-9]*b(?:_?[0-9a-zA-Z])+|[0-9](?:_?[0-9])*)" + } + + /// This matches real literals (base 10), which look like this: + /// sign / int1 / period / int2 / e / sign / int + /// Here int is a series of 0-9 digits separated by at most one underscore. + /// Signs are optional, everything from 'e' is optional, and int1 OR int2 is optional. + pub fn subpattern_real() -> &'static str { + // TODO: do I want to allow numbers to start with a period? + // TODO: for now, only base10 for reals (would 8b11e2 be 9*8^2 or 9*10^2?) + // TODO: does not deal with NaN of infinity + r"(?:\+|-*)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*)(?:e(?:\+|-?)\d(?:_?\d)*)?" + } + + /// Parse a string matching [subpattern_int] to an i64 integer. Overflow is possible. + pub fn parse_int(text: String) -> i64 { + parse_int(text).unwrap() + } + + /// Parse a string matching [subpattern_real] to a f64 real. Loss of precision or overflow are possible. 
+ pub fn parse_real(text: String) -> f64eq { + f64eq::new(parse_real(text).unwrap()) + } } impl ToText for LiteralToken { diff --git a/src/mango/token/tokens/mod.rs b/src/mango/token/tokens/mod.rs index 4508d768..3dfa133a 100644 --- a/src/mango/token/tokens/mod.rs +++ b/src/mango/token/tokens/mod.rs @@ -22,3 +22,6 @@ pub use self::keyword::KeywordToken; pub mod end_statement; pub use self::end_statement::EndStatementToken; + +pub mod block; +pub use self::block::{EndBlockToken, StartBlockToken}; diff --git a/src/mango/token/tokens/operator.rs b/src/mango/token/tokens/operator.rs index eb19db24..4515887e 100644 --- a/src/mango/token/tokens/operator.rs +++ b/src/mango/token/tokens/operator.rs @@ -34,6 +34,10 @@ impl OperatorToken { pub fn is_mult_div(&self) -> bool { self.symbol == Symbol::Asterisk || self.symbol == Symbol::Slash } + + pub fn subpattern() -> &'static str { + Symbol::subpattern() + } } impl ToText for OperatorToken { diff --git a/src/mango/towasm/control/block.rs b/src/mango/towasm/control/block.rs index 63ac5a06..770555aa 100644 --- a/src/mango/towasm/control/block.rs +++ b/src/mango/towasm/control/block.rs @@ -64,11 +64,7 @@ impl Block { impl Wasm for Block { fn as_wat(&self) -> String { - format!( - "(block {0:}\n{1:}\n) ;; block {0:}", - self.name.as_wat(), - self.group.as_wat() - ) + format!("(block {0:}\n{1:}\n) ;; block {0:}", self.name.as_wat(), self.group.as_wat()) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { diff --git a/src/mango/towasm/control/repeat.rs b/src/mango/towasm/control/repeat.rs index 3672a981..594d974c 100644 --- a/src/mango/towasm/control/repeat.rs +++ b/src/mango/towasm/control/repeat.rs @@ -34,11 +34,7 @@ impl Loop { impl Wasm for Loop { fn as_wat(&self) -> String { - format!( - "loop {0:}\n{1:}\nend ;; loop {0:}", - self.name.as_wat(), - self.group.as_wat() - ) + format!("loop {0:}\n{1:}\nend ;; loop {0:}", self.name.as_wat(), self.group.as_wat()) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { diff --git a/src/mango/towasm/numeric/arithmetic.rs b/src/mango/towasm/numeric/arithmetic.rs index 3b2fcede..7faac2bd 100644 --- a/src/mango/towasm/numeric/arithmetic.rs +++ b/src/mango/towasm/numeric/arithmetic.rs @@ -13,10 +13,7 @@ pub struct Add { impl Add { pub fn new(left: Box, right: Box) -> Box { assert!(left.typ() == right.typ()); - Box::new(Add { - left: left, - right: right, - }) + Box::new(Add { left: left, right: right }) } pub fn typ(&self) -> &Type { @@ -26,12 +23,7 @@ impl Add { impl Wasm for Add { fn as_wat(&self) -> String { - format!( - "{}\n{}\n{}.add", - self.left.as_wat(), - self.right.as_wat(), - self.typ().as_wat(), - ) + format!("{}\n{}\n{}.add", self.left.as_wat(), self.right.as_wat(), self.typ().as_wat(),) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { @@ -54,10 +46,7 @@ pub struct Mul { impl Mul { pub fn new(left: Box, right: Box) -> Box { assert!(left.typ() == right.typ()); - Box::new(Mul { - left: left, - right: right, - }) + Box::new(Mul { left: left, right: right }) } pub fn typ(&self) -> &Type { @@ -67,12 +56,7 @@ impl Mul { impl Wasm for Mul { fn as_wat(&self) -> String { - format!( - "{}\n{}\n{}.mul", - self.left.as_wat(), - self.right.as_wat(), - self.typ().as_wat(), - ) + format!("{}\n{}\n{}.mul", self.left.as_wat(), self.right.as_wat(), self.typ().as_wat(),) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { diff --git a/src/mango/towasm/numeric/logic.rs b/src/mango/towasm/numeric/logic.rs index eeb5bd51..92fa2d99 100644 --- a/src/mango/towasm/numeric/logic.rs +++ 
b/src/mango/towasm/numeric/logic.rs @@ -13,21 +13,13 @@ pub struct Gt { impl Gt { pub fn new(left: Box, right: Box) -> Box { assert!(left.typ() == right.typ()); - Box::new(Gt { - left: left, - right: right, - }) + Box::new(Gt { left: left, right: right }) } } impl Wasm for Gt { fn as_wat(&self) -> String { - format!( - "{}\n{}\n{}.gt_s", - self.left.as_wat(), - self.right.as_wat(), - self.typ().as_wat(), - ) + format!("{}\n{}\n{}.gt_s", self.left.as_wat(), self.right.as_wat(), self.typ().as_wat(),) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { @@ -50,10 +42,7 @@ pub struct Lt { impl Lt { pub fn new(left: Box, right: Box) -> Box { assert!(left.typ() == right.typ()); - Box::new(Lt { - left: left, - right: right, - }) + Box::new(Lt { left: left, right: right }) } pub fn typ(&self) -> &Type { @@ -63,12 +52,7 @@ impl Lt { impl Wasm for Lt { fn as_wat(&self) -> String { - format!( - "{}\n{}\n{}.lt_s", - self.left.as_wat(), - self.right.as_wat(), - self.typ().as_wat(), - ) + format!("{}\n{}\n{}.lt_s", self.left.as_wat(), self.right.as_wat(), self.typ().as_wat(),) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { diff --git a/src/mango/towasm/scope/function.rs b/src/mango/towasm/scope/function.rs index 3847c4f9..61c7690c 100644 --- a/src/mango/towasm/scope/function.rs +++ b/src/mango/towasm/scope/function.rs @@ -75,11 +75,7 @@ pub struct FunctionSignature { impl FunctionSignature { pub fn new(name: Rc, parameters: Vec>, results: Vec>) -> Self { assert!(results.len() <= 1); // - FunctionSignature { - name, - parameters, - results, - } + FunctionSignature { name, parameters, results } } } @@ -89,16 +85,8 @@ impl Wasm for FunctionSignature { "func {} (export \"{}\") {} {}", self.name.as_wat(), self.name.pure_name(), - self.parameters - .iter() - .map(|func| func.as_wat()) - .collect::>() - .join("\n"), - self.results - .iter() - .map(|func| func.as_wat()) - .collect::>() - .join("\n") + self.parameters.iter().map(|func| func.as_wat()).collect::>().join("\n"), + self.results.iter().map(|func| func.as_wat()).collect::>().join("\n") ) } @@ -114,12 +102,7 @@ pub struct Function { impl Function { // This uses group, so it has a label, but this isn't final... It might be useless. 
- pub fn new( - name: Rc, - parameters: Vec>, - results: Vec>, - statements_gen: F, - ) -> Box + pub fn new(name: Rc, parameters: Vec>, results: Vec>, statements_gen: F) -> Box where F: FnOnce(Label) -> Vec>, { diff --git a/src/mango/towasm/scope/module.rs b/src/mango/towasm/scope/module.rs index 95bcc155..c46cbe50 100644 --- a/src/mango/towasm/scope/module.rs +++ b/src/mango/towasm/scope/module.rs @@ -18,11 +18,7 @@ impl Wasm for Module { fn as_wat(&self) -> String { format!( "(module\n{}\n) ;; module", - self.functions - .iter() - .map(|func| func.as_wat()) - .collect::>() - .join("\n") + self.functions.iter().map(|func| func.as_wat()).collect::>().join("\n") ) } diff --git a/src/mango/towasm/tests.rs b/src/mango/towasm/tests.rs index 18f08dc2..f5941bde 100644 --- a/src/mango/towasm/tests.rs +++ b/src/mango/towasm/tests.rs @@ -1,6 +1,5 @@ use mango::towasm::arithmetic::Add; use mango::towasm::collect::datatype::Value; -use mango::towasm::collect::typ::Wasm; use mango::towasm::collect::Type; use mango::towasm::control::BranchIf; use mango::towasm::control::Label; @@ -18,6 +17,7 @@ use mango::towasm::values::Const; use mango::towasm::values::DeclareLocal; #[test] +#[allow(unused_variables)] fn test_example_1() { let param_n = Parameter::new(Name::new("n".to_owned()).unwrap(), Type::Int32); let var_n = param_n.local(); @@ -26,41 +26,27 @@ fn test_example_1() { vec![param_n], vec![Output::new(Type::Int32)], |func_label: Label| { - let fac_result_decl = - DeclareLocal::new(Name::new("fac_result".to_owned()).unwrap(), Type::Int32); + let fac_result_decl = DeclareLocal::new(Name::new("fac_result".to_owned()).unwrap(), Type::Int32); let fac_result = fac_result_decl.local(); - let loop_condition_decl = - DeclareLocal::new(Name::new("loop_condition".to_owned()).unwrap(), Type::Bool); + let loop_condition_decl = DeclareLocal::new(Name::new("loop_condition".to_owned()).unwrap(), Type::Bool); let loop_condition = loop_condition_decl.local(); vec![ // Function body fac_result_decl, loop_condition_decl, Assign::new(fac_result.clone(), Const::new(Type::Int32, Value::Int(1))), - Loop::new_named( - Name::new("fac_loop".to_owned()).unwrap(), - |loop_label: Label| { - vec![ - Assign::new( - fac_result.clone(), - Mul::new(fac_result.get(), var_n.get()), - ), - Assign::new( - loop_condition.clone(), - Gt::new(var_n.get(), Const::new(Type::Int32, Value::Int(2))), - ), - Assign::new( - var_n.clone(), - Add::new(var_n.get(), Const::new(Type::Int32, Value::Int(-1))), - ), - BranchIf::new(loop_condition.get(), loop_label), - ] - }, - ), + Loop::new_named(Name::new("fac_loop".to_owned()).unwrap(), |loop_label: Label| { + vec![ + Assign::new(fac_result.clone(), Mul::new(fac_result.get(), var_n.get())), + Assign::new(loop_condition.clone(), Gt::new(var_n.get(), Const::new(Type::Int32, Value::Int(2)))), + Assign::new(var_n.clone(), Add::new(var_n.get(), Const::new(Type::Int32, Value::Int(-1)))), + BranchIf::new(loop_condition.get(), loop_label), + ] + }), Return::new(func_label, fac_result.get()), ] }, )]); - println!("WAT:\n{}\n", module.as_wat()); + // println!("WAT:\n{}\n", module.as_wat()); } diff --git a/src/mango/towasm/values/assign.rs b/src/mango/towasm/values/assign.rs index 2044fefc..b4fd64bb 100644 --- a/src/mango/towasm/values/assign.rs +++ b/src/mango/towasm/values/assign.rs @@ -18,11 +18,7 @@ impl Assign { impl Wasm for Assign { fn as_wat(&self) -> String { - format!( - "{}\nset_local {}", - self.value.as_wat(), - self.assignee.as_wat() - ) + format!("{}\nset_local {}", self.value.as_wat(), 
self.assignee.as_wat()) // set_local $fac_result } diff --git a/src/mango/towasm/values/localvar.rs b/src/mango/towasm/values/localvar.rs index 6af061bc..47536f1d 100644 --- a/src/mango/towasm/values/localvar.rs +++ b/src/mango/towasm/values/localvar.rs @@ -40,11 +40,7 @@ impl DeclareLocal { impl Wasm for DeclareLocal { fn as_wat(&self) -> String { - format!( - "(local {} {})", - self.local.name().as_wat(), - self.local.typ().as_wat() - ) + format!("(local {} {})", self.local.name().as_wat(), self.local.typ().as_wat()) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { @@ -69,9 +65,7 @@ pub struct Local { impl Local { pub fn get(&self) -> Box { Box::new(GetLocal { - local: Local { - inner: self.inner.clone(), - }, + local: Local { inner: self.inner.clone() }, }) } diff --git a/src/mango/util/codeparts/keyword.rs b/src/mango/util/codeparts/keyword.rs index 0998b74b..92605849 100644 --- a/src/mango/util/codeparts/keyword.rs +++ b/src/mango/util/codeparts/keyword.rs @@ -94,6 +94,8 @@ impl Keyword { "int" => Ok(Reserved("int".to_owned())), "interface" => Ok(Reserved("interface".to_owned())), "internal" => Ok(Reserved("internal".to_owned())), + "intersect" => Ok(Reserved("intersect".to_owned())), + "intersection" => Ok(Reserved("intersection".to_owned())), "is" => Ok(Reserved("is".to_owned())), "it" => Ok(Reserved("it".to_owned())), "lambda" => Ok(Reserved("lambda".to_owned())), @@ -149,6 +151,8 @@ impl Keyword { "try" => Ok(Reserved("try".to_owned())), "type" => Ok(Reserved("type".to_owned())), "unsafe" => Ok(Reserved("unsafe".to_owned())), + "unite" => Ok(Reserved("unite".to_owned())), + "union" => Ok(Reserved("union".to_owned())), "until" => Ok(Reserved("until".to_owned())), "use" => Ok(Reserved("use".to_owned())), "val" => Ok(Reserved("val".to_owned())), @@ -162,10 +166,7 @@ impl Keyword { "xor" => Ok(Reserved("xor".to_owned())), "yield" => Ok(Reserved("yield".to_owned())), - _ => Err(Msg::from_valid(&format!( - "Unknown keywords: '{}'", - ssymbol_txt - ))), + _ => Err(Msg::from_valid(&format!("Unknown keywords: '{}'", ssymbol_txt))), } } } diff --git a/src/mango/util/codeparts/operator.rs b/src/mango/util/codeparts/operator.rs index 00dae4f0..625331ca 100644 --- a/src/mango/util/codeparts/operator.rs +++ b/src/mango/util/codeparts/operator.rs @@ -11,34 +11,60 @@ pub enum Symbol { Dash, Asterisk, Slash, + LT, + GT, + Eq, + LE, + GE, + Exclamation, + Question, } impl Symbol { pub fn new>(symbol_txt: S) -> Result { + use self::Symbol::*; let ssymbol_txt = symbol_txt.into(); match &*ssymbol_txt { - "+" => Ok(Symbol::Plus), - "-" => Ok(Symbol::Dash), - "*" => Ok(Symbol::Asterisk), - "/" => Ok(Symbol::Slash), - _ => Err(Msg::from_valid(&format!( - "Unknown symbol: '{}'", - ssymbol_txt - ))), + "+" => Ok(Plus), + "-" => Ok(Dash), + "*" => Ok(Asterisk), + "/" => Ok(Slash), + // TODO: how do I know < is an operator, rather than e.g. a generic? + "<" => Ok(LT), + ">" => Ok(GT), + "==" => Ok(Eq), + "<=" => Ok(LE), + ">=" => Ok(GE), + "!" => Ok(Exclamation), + "?" => Ok(Question), + _ => Err(Msg::from_valid(&format!("Unknown symbol: '{}'", ssymbol_txt))), } } + + /// Generate an eager subpattern to match tokens, that can be composed in a regular expression. 
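The Local::get() change above clones an inner handle into a fresh GetLocal. Assuming that inner is a shared, cheaply clonable pointer such as Rc (the field's type is not shown in this hunk), the pattern looks roughly like this sketch:

use std::rc::Rc;

// Hypothetical stand-ins: the real Local presumably wraps the variable's name and type.
struct VarData {
    name: String,
}

struct Local {
    inner: Rc<VarData>, // assumption: some shared handle, so clones stay cheap
}

struct GetLocal {
    local: Local,
}

impl Local {
    // Mirrors get() above: every GetLocal refers to the same underlying variable data.
    fn get(&self) -> GetLocal {
        GetLocal {
            local: Local { inner: self.inner.clone() },
        }
    }
}

fn main() {
    let n = Local { inner: Rc::new(VarData { name: "n".to_owned() }) };
    let read = n.get();
    assert_eq!("n", read.local.inner.name);
}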
+ pub fn subpattern() -> &'static str { + r"(?:\+|-|\*|/|<=|>=|==|>|<)" + } } impl Display for Symbol { fn fmt(&self, f: &mut Formatter) -> fResult { + use self::Symbol::*; write!( f, "{}", match *self { - Symbol::Plus => "+", - Symbol::Dash => "-", - Symbol::Asterisk => "*", - Symbol::Slash => "/", + Plus => "+", + Dash => "-", + Asterisk => "*", + Slash => "/", + LT => "<", + GT => ">", + Eq => "==", + LE => "<=", + GE => ">=", + Exclamation => "!", + Question => "?", } ) } diff --git a/src/mango/util/collection/mod.rs b/src/mango/util/collection/mod.rs new file mode 100644 index 00000000..f31304ed --- /dev/null +++ b/src/mango/util/collection/mod.rs @@ -0,0 +1,5 @@ +pub mod queue; +pub use self::queue::Queue; + +pub mod stack; +pub use self::stack::Stack; diff --git a/src/mango/util/collection/queue.rs b/src/mango/util/collection/queue.rs new file mode 100644 index 00000000..3f18fad4 --- /dev/null +++ b/src/mango/util/collection/queue.rs @@ -0,0 +1,45 @@ +use std::collections::VecDeque; + +/// A one-ended queue. See also [Stack]. +/// This is just a wrapper around vec so nobody pushes or pops the wrong end. +pub struct Queue { + items: VecDeque, +} + +impl Queue { + pub fn new() -> Self { + Queue { + items: VecDeque::with_capacity(16), + } + } + + pub fn push(&mut self, value: T) { + self.items.push_back(value) + } + + pub fn pop(&mut self) -> Option { + self.items.pop_front() + } + + /// Moves all the elements from a vector into the queue. + pub fn append(&mut self, other: Vec) { + for item in other.into_iter() { + self.items.push_back(item); + } + } +} + +#[cfg(test)] +mod tests { + use super::Queue; + + #[test] + fn test_queue() { + let mut queue: Queue = Queue::new(); + queue.push(1); + queue.push(2); + assert_eq!(1, queue.pop().unwrap()); + assert_eq!(2, queue.pop().unwrap()); + assert!(queue.pop().is_none()); + } +} diff --git a/src/mango/util/collection/stack.rs b/src/mango/util/collection/stack.rs new file mode 100644 index 00000000..942e43cd --- /dev/null +++ b/src/mango/util/collection/stack.rs @@ -0,0 +1,42 @@ +use std::collections::VecDeque; + +/// A one-ended stack. See also [Queue]. +/// This is just a wrapper around deque so nobody pushes or pops the wrong end. 
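The ordering inside Symbol::subpattern() above is deliberate: regex alternation is tried left to right, so the two-character operators <=, >= and == must be listed before < and > or they would never match in full. Note that ! and ? are in the enum but not (yet) in the subpattern. A small check, using the regex crate the surrounding code already depends on:

use regex::Regex;

fn main() {
    // The same alternation as Symbol::subpattern(), anchored at the start of the input.
    let op = Regex::new(r"^(?:\+|-|\*|/|<=|>=|==|>|<)").unwrap();

    // "<=" wins because it is listed before "<".
    assert_eq!("<=", op.find("<= 5").unwrap().as_str());
    // A lone "<" still matches via the later alternative.
    assert_eq!("<", op.find("<5").unwrap().as_str());
    // '!' and '?' are not covered by the subpattern yet.
    assert!(op.find("?x").is_none());
}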
+pub struct Stack { + items: VecDeque, +} + +impl Stack { + pub fn new() -> Self { + Stack { + items: VecDeque::with_capacity(16), + } + } + + pub fn push(&mut self, value: T) { + self.items.push_back(value) + } + + pub fn pop(&mut self) -> Option { + self.items.pop_back() + } + + pub fn borrow_mut(&mut self) -> Option<&mut T> { + self.items.back_mut() + } +} + +#[cfg(test)] +mod tests { + use super::Stack; + + #[test] + fn test_stack() { + let mut stack: Stack = Stack::new(); + stack.push(1); + stack.push(2); + assert_eq!(2, stack.pop().unwrap()); + assert_eq!(1, stack.pop().unwrap()); + assert!(stack.pop().is_none()); + } +} diff --git a/src/mango/util/errors/code_problem.rs b/src/mango/util/errors/code_problem.rs index 6919824a..f28d20a5 100644 --- a/src/mango/util/errors/code_problem.rs +++ b/src/mango/util/errors/code_problem.rs @@ -109,17 +109,8 @@ mod tests { #[test] fn test_new_problem() { - CodeProblem::error( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); - CodeProblem::warning( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); - CodeProblem::debug( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); + CodeProblem::error(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); + CodeProblem::warning(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); + CodeProblem::debug(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); } } diff --git a/src/mango/util/errors/collector.rs b/src/mango/util/errors/collector.rs index 619ed139..86a18828 100644 --- a/src/mango/util/errors/collector.rs +++ b/src/mango/util/errors/collector.rs @@ -49,33 +49,17 @@ mod tests { #[test] fn test_iter_collector() { let mut collector = ProblemCollector::new(); - collector.error( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); + collector.error(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); let cnt = collector.into_iter().count(); assert_eq!(1, cnt, "No item in ProblemCollector"); - assert_eq!( - cnt, - collector.into_iter().count(), - "Failed to iterate over ProblemCollector twice" - ) + assert_eq!(cnt, collector.into_iter().count(), "Failed to iterate over ProblemCollector twice") } #[test] fn test_new_problem() { let mut collector = ProblemCollector::new(); - collector.error( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); - collector.warning( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); - collector.debug( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); + collector.error(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); + collector.warning(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); + collector.debug(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); } } diff --git a/src/mango/util/format/strings.rs b/src/mango/util/format/strings.rs index f4aab63c..bcacf3ee 100644 --- a/src/mango/util/format/strings.rs +++ b/src/mango/util/format/strings.rs @@ -2,7 +2,8 @@ /// string when parsed by a typical language. pub fn to_double_quoted_str(txt: &str) -> String { // todo: performance? 
mostly I'd like to add the quotes as part of the stream, but it seems difficult - let esc: String = txt.chars() + let esc: String = txt + .chars() .map(|c| match c { '\\' => r"\\".to_string(), '\"' => "\\\"".to_string(), @@ -25,10 +26,7 @@ mod tests { assert_eq!("\"hello\\nworld\"", to_double_quoted_str("hello\nworld")); assert_eq!("\"hello\\\\ world\"", to_double_quoted_str("hello\\ world")); assert_eq!("\"hello\\\"world\"", to_double_quoted_str("hello\"world")); - assert_eq!( - "\"\\\"\\\"\\\"\\n\\\\\"", - to_double_quoted_str("\"\"\"\n\\") - ); + assert_eq!("\"\\\"\\\"\\\"\\n\\\\\"", to_double_quoted_str("\"\"\"\n\\")); assert_eq!("\"\\\\n\"", to_double_quoted_str("\\n")); assert_eq!("\"\\\\\\n\"", to_double_quoted_str("\\\n")); } diff --git a/src/mango/util/mod.rs b/src/mango/util/mod.rs index 11224671..a6371011 100644 --- a/src/mango/util/mod.rs +++ b/src/mango/util/mod.rs @@ -1,5 +1,9 @@ +pub mod collection; + pub mod strtype; +pub mod strslice; + pub mod numtype; pub mod signaltype; @@ -11,3 +15,5 @@ pub mod encdec; pub mod errors; pub mod codeparts; + +pub mod parsetxt; diff --git a/src/mango/util/numtype/eqfloat.rs b/src/mango/util/numtype/eqfloat.rs index df09318d..eb570d45 100644 --- a/src/mango/util/numtype/eqfloat.rs +++ b/src/mango/util/numtype/eqfloat.rs @@ -165,14 +165,8 @@ mod tests { assert_eq!(get_hash(f64eq::new(PI)), get_hash(f64eq::new(PI))); assert_ne!(get_hash(f64eq::new(42.)), get_hash(f64eq::new(-42.))); assert_eq!(get_hash(f64eq::new(0.)), get_hash(f64eq::new(-0.))); - assert_eq!( - get_hash(f64eq::new(INFINITY)), - get_hash(f64eq::new(INFINITY)) - ); - assert_ne!( - get_hash(f64eq::new(INFINITY)), - get_hash(f64eq::new(NEG_INFINITY)) - ); + assert_eq!(get_hash(f64eq::new(INFINITY)), get_hash(f64eq::new(INFINITY))); + assert_ne!(get_hash(f64eq::new(INFINITY)), get_hash(f64eq::new(NEG_INFINITY))); assert_ne!(get_hash(f64eq::new(42.)), get_hash(f64eq::new(NAN))); assert_ne!(get_hash(f64eq::new(NAN)), get_hash(f64eq::new(42.))); assert_eq!(get_hash(f64eq::new(NAN)), get_hash(f64eq::new(NAN))); diff --git a/src/mango/util/parsetxt/int.rs b/src/mango/util/parsetxt/int.rs new file mode 100644 index 00000000..89911ecf --- /dev/null +++ b/src/mango/util/parsetxt/int.rs @@ -0,0 +1,91 @@ +use mango::util::strslice::char_ops::CharOps; +use regex::Regex; + +#[derive(Debug)] +pub enum IntParseFailReason { + Invalid, + Overflow, + Underflow, +} + +/// This matches integer literals, either just numbers in base 10, or base 2-36 with prefix. +/// The syntax for -37 in base 16 is -16b25 and 2748 is 16bABC. +/// Incorrect values like 4b7 or 0b0 are not handled at the lexing stage. +pub fn int_pattern() -> &'static str { + r"(?:(?P(?:\+|-?)[1-9][0-9]*)b(?P(?:_?[0-9a-zA-Z])+)|(?P(?:\+|-?)[0-9](?:_?[0-9])*))" +} + +/// Convert a String that matches [int_pattern] to an i64 integer. Overflow is possible. +pub fn parse_int>(text: S) -> Result { + let text = text.into(); + match Regex::new(&format!("^{}$", int_pattern())).unwrap().captures(&text) { + None => return Err(IntParseFailReason::Invalid), + Some(captures) => { + // // Sign + // let sign_str = captures.name("sign").unwrap().as_str(); + // let sign = if sign_str == "+" || sign_str == "" { + // 1 // positive + // } else { + // -1 // negative + // }; + // Check if base10 or special + match captures.name("b10_val") { + None => { + // There is a base provided. 
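As an aside on int_pattern() above: parse_int distinguishes the two branches through named capture groups, looked up as "base", "reb_val" and "b10_val". A small demonstration of what each branch captures, with the named groups written out (the group names are taken from those lookups):

use regex::Regex;

fn main() {
    // int_pattern() with its named capture groups spelled out; the names match
    // the captures.name(...) calls in parse_int.
    let pat = Regex::new(
        r"^(?:(?P<base>(?:\+|-?)[1-9][0-9]*)b(?P<reb_val>(?:_?[0-9a-zA-Z])+)|(?P<b10_val>(?:\+|-?)[0-9](?:_?[0-9])*))$",
    )
    .unwrap();

    // 2748 written in base 16: the "base"/"reb_val" branch matches.
    let caps = pat.captures("16bABC").unwrap();
    assert_eq!("16", caps.name("base").unwrap().as_str());
    assert_eq!("ABC", caps.name("reb_val").unwrap().as_str());

    // A plain base-10 literal (underscores allowed): the "b10_val" branch matches.
    let caps = pat.captures("-4_2").unwrap();
    assert_eq!("-4_2", caps.name("b10_val").unwrap().as_str());
}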
+ if let Some(base) = captures.name("base") { + if let Some(value) = captures.name("reb_val") { + // TODO: implement + panic!(format!( + "Do not yet know how to deal with {} in base {}", + value.as_str().without_char(&'_'), + base.as_str() + )) + } else { + panic!("Expected 'reb_val' match in regex") + } + } else { + panic!("Expected 'base' match in regex") + } + } + Some(value) => { + // This is a 'normal' (base10) value. + // TODO: check for over/underflow + return Ok(value.as_str().without_char(&'_').parse::().unwrap()); + } + } + } + } +} + +// TODO: possibly add a i32 version? + +#[cfg(test)] +mod tests { + use super::parse_int; + + #[test] + fn test_parse_b10_int() { + assert_eq!(42, parse_int("42").unwrap()); + assert_eq!(42, parse_int("4_2").unwrap()); + assert_eq!(123456789, parse_int("+1_2_3_4_5_6_7_8_9").unwrap()); + assert_eq!(-123456789, parse_int("-123456789").unwrap()); + assert_eq!(0, parse_int("-0").unwrap()); + assert_eq!(-1, parse_int("-1").unwrap()); + // Weird bases with 0 prefix are not supported. + assert_eq!(9, parse_int("09").unwrap()); + } + + #[test] + fn test_invalid_b10_ints() { + assert!(parse_int("0x9").is_err()); + assert!(parse_int("A").is_err()); + assert!(parse_int("_0").is_err()); + assert!(parse_int("0_").is_err()); + // TODO: over/underflow + } + + #[test] + fn test_parse_based_ints() { + // TODO: not implemented yet + } +} diff --git a/src/mango/util/parsetxt/mod.rs b/src/mango/util/parsetxt/mod.rs new file mode 100644 index 00000000..04611574 --- /dev/null +++ b/src/mango/util/parsetxt/mod.rs @@ -0,0 +1,3 @@ +pub mod int; + +pub mod real; diff --git a/src/mango/util/parsetxt/real.rs b/src/mango/util/parsetxt/real.rs new file mode 100644 index 00000000..20e805c7 --- /dev/null +++ b/src/mango/util/parsetxt/real.rs @@ -0,0 +1,99 @@ +use mango::util::strslice::char_ops::CharOps; +use regex::Regex; + +#[derive(Debug)] +pub enum RealParseFailReason { + Invalid, + Overflow, + Underflow, + PrecisionLoss(f64), +} + +/// This matches real literals (base 10), which look like this: +/// sign / int1 / period / int2 / e / sign / int +/// Here int is a series of 0-9 digits separated by at most one underscore. +/// Signs are optional, everything from 'e' is optional, and int1 OR int2 is optional. +pub fn real_pattern() -> &'static str { + // TODO: do I really want to allow numbers to start with a period? + // TODO: for now, only base10 for reals (would 8b11e2 be 9*8^2 or 9*10^2?) + // TODO: does not deal with NaN of infinity + r"(?P(?:\+|-?)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*))(?:e(?P(?:\+|-?)\d(?:_?\d)*))?" +} + +/// Convert a String that matches [real_pattern] to an f64 real. Overflow and loss of precision is possible. +pub fn parse_real>(text: S) -> Result { + let text = text.into(); + match Regex::new(&format!("^{}$", real_pattern())).unwrap().captures(&text) { + None => return Err(RealParseFailReason::Invalid), + Some(captures) => { + let multiplier = captures + .name("multiplier") + .unwrap() + .as_str() + .without_char(&'_') + .parse::() + .unwrap(); + match captures.name("exponent") { + None => { + // This is a 'normal' real, no exponential notation + return Ok(multiplier); + } + Some(exponent_match) => { + // This real is in exponential notation + let exponent = exponent_match.as_str().without_char(&'_').parse::().unwrap(); + // TODO: is there a numerically smarter way to do this? + return Ok(10f64.powf(exponent) * multiplier); + } + } + } + } +} + +// TODO: possibly add a f32 version? 
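parse_real above splits a literal into the named groups "multiplier" and "exponent" and recombines them as multiplier times 10 to the exponent. A worked check of that recombination for one input (the group names are taken from the captures.name(...) lookups; the underscore stripping is done here with str::replace rather than CharOps):

use regex::Regex;

fn main() {
    // real_pattern() with its two named groups spelled out.
    let pat = Regex::new(
        r"^(?P<multiplier>(?:\+|-?)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*))(?:e(?P<exponent>(?:\+|-?)\d(?:_?\d)*))?$",
    )
    .unwrap();

    let caps = pat.captures("1_2.5e-2").unwrap();
    let multiplier: f64 = caps.name("multiplier").unwrap().as_str().replace('_', "").parse().unwrap();
    let exponent: f64 = caps.name("exponent").unwrap().as_str().replace('_', "").parse().unwrap();

    // Same recombination as parse_real: 12.5 * 10^-2 == 0.125.
    assert!((10f64.powf(exponent) * multiplier - 0.125).abs() < 1e-8);
}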
+ +#[cfg(test)] +mod tests { + use super::parse_real; + + fn close(x: f64, y: f64) -> bool { + (x - y).abs() < 1e-8 + } + + #[test] + fn test_parse_nonexp_real() { + assert!(close(42., parse_real("42.0").unwrap())); + assert!(close(-0.1, parse_real("-.1").unwrap())); + assert!(close(-1., parse_real("-1.").unwrap())); + assert!(close(12345.6789, parse_real("1_2_3_4_5.6_7_8_9").unwrap())); + } + + #[test] + fn test_parse_exp_real() { + assert!(close(42., parse_real("42.0e0").unwrap())); + assert!(close(-0.1, parse_real("-.1e0").unwrap())); + assert!(close(-1., parse_real("-1.e0").unwrap())); + assert!(close(42., parse_real("42.0e+0").unwrap())); + assert!(close(12345.6789, parse_real("1_2_3_4_5.6_7_8_9e0").unwrap())); + assert!(close(0.42, parse_real("42.0e-2").unwrap())); + assert!(close(-0.001, parse_real("-.1e-2").unwrap())); + assert!(close(-0.01, parse_real("-1.e-2").unwrap())); + assert!(close(123.456789, parse_real("1_2_3_4_5.6_7_8_9e-2").unwrap())); + assert!(close(42.0, parse_real("42.0e-0_0_0").unwrap())); + } + + #[test] + fn test_invalid_real() { + assert!(parse_real("+_42.0").is_err()); + assert!(parse_real("-_42.0").is_err()); + assert!(parse_real("_42.0").is_err()); + assert!(parse_real("42_.0").is_err()); + assert!(parse_real("42._0").is_err()); + assert!(parse_real("42.0_").is_err()); + assert!(parse_real("42.0e_0").is_err()); + assert!(parse_real("42.0e0_").is_err()); + assert!(parse_real("42.0e0b0").is_err()); + } + + // TODO: over/underflow + // TODO: loss of precision +} diff --git a/src/mango/util/strslice/char_ops.rs b/src/mango/util/strslice/char_ops.rs new file mode 100644 index 00000000..894994aa --- /dev/null +++ b/src/mango/util/strslice/char_ops.rs @@ -0,0 +1,27 @@ +pub trait CharOps { + /// Remove all matching characters from the string. + // Signature may be changed to support a set of characters, if the need arises. + fn without_char(&self, strip: &char) -> String; + + fn char_len(&self) -> usize; +} + +impl<'a> CharOps for &'a str { + fn without_char(&self, strip: &char) -> String { + self.chars().filter(|chr| chr != strip).collect() + } + + fn char_len(&self) -> usize { + self.chars().count() + } +} + +impl CharOps for String { + fn without_char(&self, strip: &char) -> String { + self.chars().filter(|chr| chr != strip).collect() + } + + fn char_len(&self) -> usize { + self.chars().count() + } +} diff --git a/src/mango/util/strslice/mod.rs b/src/mango/util/strslice/mod.rs new file mode 100644 index 00000000..08a0519c --- /dev/null +++ b/src/mango/util/strslice/mod.rs @@ -0,0 +1,6 @@ +pub mod slice; +pub use self::slice::charslice; +pub use self::slice::charslicefrom; +pub use self::slice::charsliceto; + +pub mod char_ops; diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs new file mode 100644 index 00000000..35fb70da --- /dev/null +++ b/src/mango/util/strslice/slice.rs @@ -0,0 +1,68 @@ +/// Take a character-based slice of a string (as opposed to the default byte-slice). +/// Allows negative indices to slice from the end (but start must be before end). +/// This may not be very fast. +pub fn charslice>(text: S, start: isize, end: isize) -> String { + let stext = text.into(); + let from: usize; + let length: usize; + let charcount = stext.chars().count(); + if start < 0 { + // LATER: may remove this check and just default to 0 in the future. 
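The CharOps helpers above work on characters rather than UTF-8 bytes, which is the same distinction charslice below is built around. A short usage sketch, assuming the crate paths exported in strslice/mod.rs:

use mango::util::strslice::char_ops::CharOps;

fn main() {
    // str::len() counts UTF-8 bytes; char_len() counts characters.
    assert_eq!(6, "你好".len());
    assert_eq!(2, "你好".char_len());

    // without_char() drops every occurrence, as used for '_' in the number parsers.
    assert_eq!("1234", "1_2_3_4".without_char(&'_'));
}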
+ assert!( + -start as usize <= charcount, + "charslice: if 'start' is negative, the magnitude may not exceed the length" + ); + from = (charcount as isize + start) as usize; + } else { + from = start as usize; + } + if end < 0 { + // LATER: may remove this check and just default to 0 in the future. + assert!( + -end as usize <= charcount, + "charslice: if 'end' is negative, the magnitude may not exceed the length" + ); + let new_end = (charcount as isize + end) as usize; + assert!(new_end >= from, "charslice: 'start' may not be before 'end' (end was negative)"); + length = new_end - from; + } else { + assert!( + end >= from as isize, + "charslice: 'start' may not be before 'end' (end was positive)" + ); + length = end as usize - from; + } + stext.chars().skip(from).take(length).collect() +} + +pub fn charslicefrom>(text: S, start: isize) -> String { + let stext = text.into(); + let len = stext.chars().count() as isize; + charslice(stext, start, len) +} + +pub fn charsliceto>(text: S, end: isize) -> String { + charslice(text, 0, end) +} + +pub fn glyphat>(text: S, pos: isize) -> String { + charslice(text, pos, pos + 1) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_slice() { + assert_eq!(42isize as usize, 42usize); + assert_eq!("你好", charslice("你好!", 0, 2)); + assert_eq!("!", charslicefrom("你好!", 2)); + assert_eq!("你好", charsliceto("你好!", 2)); + assert_eq!("好", glyphat("你好!", 1)); + // Negative indices should match Python 3 behaviour: + assert_eq!("你好", charslice("你好!", -3, -1)); + assert_eq!("!", charslicefrom("你好!", -1)); + assert_eq!("好", glyphat("你好!", -2)); + } +} diff --git a/src/mango/util/strtype/msg.rs b/src/mango/util/strtype/msg.rs index 4f20b0ae..d2e2f28b 100644 --- a/src/mango/util/strtype/msg.rs +++ b/src/mango/util/strtype/msg.rs @@ -34,9 +34,7 @@ impl StrType for Msg { fn validate(msg: &str) -> Result<(), Msg> { if !VALID_MESSAGE.is_match(&msg.to_string()) { // Make sure this is a valid string, otherwise it causes an infinite loop making error messages for it! - return Err(Msg::from_valid( - "Messages should consist of printable text.", - )); + return Err(Msg::from_valid("Messages should consist of printable text.")); } Ok(()) } diff --git a/src/mango/util/strtype/name.rs b/src/mango/util/strtype/name.rs index 0405d927..23533c4c 100644 --- a/src/mango/util/strtype/name.rs +++ b/src/mango/util/strtype/name.rs @@ -6,8 +6,9 @@ use std::fmt; use std::sync::Mutex; use string_interner::StringInterner; +const VALID_IDENTIFIER_SUBPATTERN: &'static str = r"[a-zA-Z_][a-zA-Z0-9_]*"; lazy_static! { - static ref VALID_IDENTIFIER: Regex = Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*$").unwrap(); + static ref VALID_IDENTIFIER: Regex = Regex::new(&format!("{}{}{}", r"^", VALID_IDENTIFIER_SUBPATTERN, r"$")).unwrap(); } // TODO: this alias just for https://github.com/rust-lang-nursery/rustfmt/issues/2610 @@ -31,23 +32,19 @@ impl Name { pub fn value(&self) -> String { // Unwrap only fails if another thread panicked while locking, which shouldn't happen. // todo: I want this to return &str but that'd need the interner to be borrowed longer - INTERNER - .lock() - .unwrap() - .resolve(self.name_id) - .unwrap() - .to_string() + INTERNER.lock().unwrap().resolve(self.name_id).unwrap().to_string() + } + + /// Generate an eager subpattern to match names, that can be composed in a regular expression. 
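One motivation for charslice above that its tests do not spell out: Rust's built-in range indexing works on bytes and panics on an index that is not a character boundary, whereas charslice counts characters. A small illustration, using the re-export from mango::util::strslice:

use mango::util::strslice::charslice;

fn main() {
    let text = "你好!";
    assert_eq!("你", charslice(text, 0, 1));
    // By contrast, &text[0..1] would panic at runtime:
    // byte index 1 falls inside the first (three-byte) character.
}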
+    pub fn subpattern() -> &'static str {
+        &VALID_IDENTIFIER_SUBPATTERN.clone()
+    }
 }
 
 impl fmt::Display for Name {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // Use interner directly instead of .value(), because that creates a copy
-        write!(
-            f,
-            "{}",
-            INTERNER.lock().unwrap().resolve(self.name_id).unwrap()
-        )
+        write!(f, "{}", INTERNER.lock().unwrap().resolve(self.name_id).unwrap())
     }
 }
 
@@ -66,9 +63,7 @@ impl StrType for Name {
     fn validate(name: &str) -> Result<(), Msg> {
         match name.chars().next() {
             Some(chr) => if chr.is_digit(10) {
-                return Err(Msg::from_valid(
-                    "Identifier names may not start with a digit.",
-                ));
+                return Err(Msg::from_valid("Identifier names may not start with a digit."));
             },
             None => return Ok(()), // empty string
         }
@@ -162,13 +157,7 @@ mod tests {
 
     #[test]
     fn test_name_interning() {
-        assert_eq!(
-            Name::copy_new("Hello").unwrap(),
-            Name::copy_new("Hello").unwrap()
-        );
-        assert_ne!(
-            Name::copy_new("Hello").unwrap(),
-            Name::copy_new("Goodbye").unwrap()
-        );
+        assert_eq!(Name::copy_new("Hello").unwrap(), Name::copy_new("Hello").unwrap());
+        assert_ne!(Name::copy_new("Hello").unwrap(), Name::copy_new("Goodbye").unwrap());
     }
 }
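Since both Symbol::subpattern() and Name::subpattern() exist to be composed into larger expressions, here is a sketch of how a lexer might combine them; the combined token regex itself is hypothetical and not part of this diff:

use regex::Regex;

fn main() {
    // The two subpatterns as defined above.
    let name = r"[a-zA-Z_][a-zA-Z0-9_]*";
    let operator = r"(?:\+|-|\*|/|<=|>=|==|>|<)";

    // Hypothetical composition: try a name first, then an operator.
    let token = Regex::new(&format!("^(?:(?P<name>{})|(?P<op>{}))", name, operator)).unwrap();

    let caps = token.captures("x<=1").unwrap();
    assert_eq!("x", caps.name("name").unwrap().as_str());
    assert!(caps.name("op").is_none());
}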