From 6d657ef8b5a8865370909163cb9e66b73f749667 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 15 May 2018 22:03:55 +0200 Subject: [PATCH 01/49] Add a fake io reader to test lexing #56 --- src/lib.rs | 1 + src/mango/io/fortest/fromstr.rs | 29 +++++++++++++++++++++++++++++ src/mango/io/fortest/mod.rs | 2 ++ src/mango/io/mod.rs | 3 +++ src/mango/io/typ.rs | 14 ++++++++++++++ 5 files changed, 49 insertions(+) create mode 100644 src/mango/io/fortest/fromstr.rs create mode 100644 src/mango/io/fortest/mod.rs create mode 100644 src/mango/io/mod.rs create mode 100644 src/mango/io/typ.rs diff --git a/src/lib.rs b/src/lib.rs index c7fe180e..163b6c11 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ extern crate derive_new; pub mod mango { // Utilities pub mod cli; + pub mod io; pub mod jit; pub mod util; diff --git a/src/mango/io/fortest/fromstr.rs b/src/mango/io/fortest/fromstr.rs new file mode 100644 index 00000000..3a39b80c --- /dev/null +++ b/src/mango/io/fortest/fromstr.rs @@ -0,0 +1,29 @@ +use mango::io::typ::Reader; +use regex::Regex; + +/// Implementation of [Reader] that reads from a pre-provided string. +/// Mostly for testing purposes. +pub struct StringReader { + code: String, + index: usize, +} + +impl StringReader { + pub fn new(code: String) -> Self { + StringReader { code, index: 0 } + } +} + +impl Reader for StringReader { + fn equals(&mut self, text: &str) -> bool { + if &self.code[self.index..self.index + text.len()] == text { + self.index += text.len(); + return true; + } + false + } + + fn matches(&mut self, pattern: Regex) -> Option { + unimplemented!() // TODO + } +} diff --git a/src/mango/io/fortest/mod.rs b/src/mango/io/fortest/mod.rs new file mode 100644 index 00000000..9aa88ab0 --- /dev/null +++ b/src/mango/io/fortest/mod.rs @@ -0,0 +1,2 @@ +pub mod fromstr; +pub use self::fromstr::*; diff --git a/src/mango/io/mod.rs b/src/mango/io/mod.rs new file mode 100644 index 00000000..6dbbd816 --- /dev/null +++ b/src/mango/io/mod.rs @@ -0,0 +1,3 @@ +pub mod typ; + +pub mod fortest; diff --git a/src/mango/io/typ.rs b/src/mango/io/typ.rs new file mode 100644 index 00000000..00aa6f7a --- /dev/null +++ b/src/mango/io/typ.rs @@ -0,0 +1,14 @@ +use regex::Regex; + +/// A reader represents a source 'file', which may be a file, webpage, string, ... +pub trait Reader { + /// Checks whether the `text` is found starting from the current position. + fn equals(&mut self, text: &str) -> bool; + + /// Checks whether the code from the current position matches a regex pattern. + fn matches(&mut self, pattern: Regex) -> Option; +} + +pub trait Writer { + // TODO +} From 90bfe6bad7c2c32840f87894c277006691082b18 Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 16 May 2018 22:02:49 +0200 Subject: [PATCH 02/49] Add a regex cache (again; lost my work) #56 --- src/mango/io/fortest/fromstr.rs | 10 +++++++--- src/mango/io/mod.rs | 2 ++ src/mango/io/typ.rs | 2 +- src/mango/io/util.rs | 20 ++++++++++++++++++++ 4 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 src/mango/io/util.rs diff --git a/src/mango/io/fortest/fromstr.rs b/src/mango/io/fortest/fromstr.rs index 3a39b80c..7c70c0d2 100644 --- a/src/mango/io/fortest/fromstr.rs +++ b/src/mango/io/fortest/fromstr.rs @@ -1,5 +1,5 @@ use mango::io::typ::Reader; -use regex::Regex; +use mango::io::util::REXCACHE; /// Implementation of [Reader] that reads from a pre-provided string. /// Mostly for testing purposes. @@ -23,7 +23,11 @@ impl Reader for StringReader { false } - fn matches(&mut self, pattern: Regex) -> Option { - unimplemented!() // TODO + fn matches(&mut self, subpattern: String) -> Option { + REXCACHE.with(|rl| { + let mut rexlib = rl.borrow_mut(); + // let rex = rexlib.make_or_get(subpattern); + }); + Option::None // TODO } } diff --git a/src/mango/io/mod.rs b/src/mango/io/mod.rs index 6dbbd816..d9a219cf 100644 --- a/src/mango/io/mod.rs +++ b/src/mango/io/mod.rs @@ -1,3 +1,5 @@ pub mod typ; pub mod fortest; + +pub mod util; diff --git a/src/mango/io/typ.rs b/src/mango/io/typ.rs index 00aa6f7a..a50008ce 100644 --- a/src/mango/io/typ.rs +++ b/src/mango/io/typ.rs @@ -6,7 +6,7 @@ pub trait Reader { fn equals(&mut self, text: &str) -> bool; /// Checks whether the code from the current position matches a regex pattern. - fn matches(&mut self, pattern: Regex) -> Option; + fn matches(&mut self, subpattern: String) -> Option; } pub trait Writer { diff --git a/src/mango/io/util.rs b/src/mango/io/util.rs new file mode 100644 index 00000000..e847d602 --- /dev/null +++ b/src/mango/io/util.rs @@ -0,0 +1,20 @@ +use regex::Regex; +use std::cell::RefCell; +use std::collections::HashMap; + +pub struct RegexCache { + cache: HashMap, +} + +impl RegexCache { + // Not public to prevent having more than one instance. + pub fn new() -> Self { + RegexCache { + cache: HashMap::new(), + } + } +} + +thread_local! { + pub static REXCACHE: RefCell = RefCell::new(RegexCache::new()) +} From c6ec834990332d3d162ca84c8217f35d593fa5aa Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 18 May 2018 22:08:46 +0200 Subject: [PATCH 03/49] Make the regex cache work #56 --- src/mango/io/fortest/fromstr.rs | 4 ++-- src/mango/io/typ.rs | 4 +--- src/mango/io/util.rs | 11 ++++++++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/mango/io/fortest/fromstr.rs b/src/mango/io/fortest/fromstr.rs index 7c70c0d2..0afaf280 100644 --- a/src/mango/io/fortest/fromstr.rs +++ b/src/mango/io/fortest/fromstr.rs @@ -23,10 +23,10 @@ impl Reader for StringReader { false } - fn matches(&mut self, subpattern: String) -> Option { + fn matches(&mut self, subpattern: &str) -> Option { REXCACHE.with(|rl| { let mut rexlib = rl.borrow_mut(); - // let rex = rexlib.make_or_get(subpattern); + let rex = rexlib.make_or_get(subpattern); }); Option::None // TODO } diff --git a/src/mango/io/typ.rs b/src/mango/io/typ.rs index a50008ce..62667ad8 100644 --- a/src/mango/io/typ.rs +++ b/src/mango/io/typ.rs @@ -1,12 +1,10 @@ -use regex::Regex; - /// A reader represents a source 'file', which may be a file, webpage, string, ... pub trait Reader { /// Checks whether the `text` is found starting from the current position. fn equals(&mut self, text: &str) -> bool; /// Checks whether the code from the current position matches a regex pattern. - fn matches(&mut self, subpattern: String) -> Option; + fn matches(&mut self, subpattern: &str) -> Option; } pub trait Writer { diff --git a/src/mango/io/util.rs b/src/mango/io/util.rs index e847d602..66e1053d 100644 --- a/src/mango/io/util.rs +++ b/src/mango/io/util.rs @@ -1,3 +1,4 @@ +use regex::Error; use regex::Regex; use std::cell::RefCell; use std::collections::HashMap; @@ -8,11 +9,19 @@ pub struct RegexCache { impl RegexCache { // Not public to prevent having more than one instance. - pub fn new() -> Self { + fn new() -> Self { RegexCache { cache: HashMap::new(), } } + + pub fn make_or_get(&mut self, subpattern: &str) -> Result<&Regex, Error> { + if !self.cache.contains_key(subpattern) { + let regex = Regex::new(&format!("^ *{}", subpattern))?; + self.cache.insert(subpattern.to_owned(), regex); + } + Result::Ok(self.cache.get(subpattern).unwrap()) + } } thread_local! { From 97f7f3041a944fbcd094e1f486d4b709629659a1 Mon Sep 17 00:00:00 2001 From: Mark Date: Mon, 21 May 2018 12:40:53 +0200 Subject: [PATCH 04/49] Trying to let the lexer deal with indentation somewhat elegantly #56 --- README.rst | 6 ++++ src/mango/io/fortest/fromstr.rs | 21 ++++++----- src/mango/io/typ.rs | 12 +++++-- src/mango/lexing/code_lexer.rs | 59 +++++++++++++++++++++++++++++++ src/mango/lexing/comment_lexer.rs | 1 + src/mango/lexing/mod.rs | 4 +++ src/mango/lexing/typ.rs | 15 ++++++++ src/mango/token/mod.rs | 6 ++-- src/mango/token/special/mod.rs | 2 +- 9 files changed, 111 insertions(+), 15 deletions(-) create mode 100644 src/mango/lexing/code_lexer.rs create mode 100644 src/mango/lexing/comment_lexer.rs create mode 100644 src/mango/lexing/typ.rs diff --git a/README.rst b/README.rst index 2c822695..88110eb0 100644 --- a/README.rst +++ b/README.rst @@ -65,6 +65,12 @@ These instructions were tested on Ubuntu 18.4 (using Bash). It should also work cargo test --all cargo run --bin mango-cli + or to build a fast, release-mode native binary: + +.. code:: bash + + RUSTFLAGS="-C target-cpu=native" cargo build --release + * To deploy the web version in release mode: .. code:: bash diff --git a/src/mango/io/fortest/fromstr.rs b/src/mango/io/fortest/fromstr.rs index 0afaf280..2ca183b9 100644 --- a/src/mango/io/fortest/fromstr.rs +++ b/src/mango/io/fortest/fromstr.rs @@ -1,4 +1,5 @@ use mango::io::typ::Reader; +use mango::io::typ::ReaderResult; use mango::io::util::REXCACHE; /// Implementation of [Reader] that reads from a pre-provided string. @@ -15,19 +16,21 @@ impl StringReader { } impl Reader for StringReader { - fn equals(&mut self, text: &str) -> bool { - if &self.code[self.index..self.index + text.len()] == text { - self.index += text.len(); - return true; - } - false - } + // fn equals(&mut self, texts: Vec<&str>) -> ReaderResult { + // for text in texts { + // if &self.code[self.index..self.index + text.len()] == text { + // self.index += text.len(); + // return ReaderResult::Match(self.code[self.index..self.index + text.len()]) + // } + // } + // ReaderResult::NoMatch() + // } - fn matches(&mut self, subpattern: &str) -> Option { + fn matches(&mut self, subpattern: &str) -> ReaderResult { REXCACHE.with(|rl| { let mut rexlib = rl.borrow_mut(); let rex = rexlib.make_or_get(subpattern); }); - Option::None // TODO + ReaderResult::NoMatch() // TODO } } diff --git a/src/mango/io/typ.rs b/src/mango/io/typ.rs index 62667ad8..ebe523a5 100644 --- a/src/mango/io/typ.rs +++ b/src/mango/io/typ.rs @@ -1,10 +1,18 @@ +// TODO: I should perhaps separate the splitting that happens here from the actual reading + +pub enum ReaderResult { + Match(String), + NoMatch(), + EOF(), +} + /// A reader represents a source 'file', which may be a file, webpage, string, ... pub trait Reader { /// Checks whether the `text` is found starting from the current position. - fn equals(&mut self, text: &str) -> bool; + // fn equals(&mut self, texts: Vec<&str>) -> ReaderResult; /// Checks whether the code from the current position matches a regex pattern. - fn matches(&mut self, subpattern: &str) -> Option; + fn matches(&mut self, subpattern: &str) -> ReaderResult; } pub trait Writer { diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs new file mode 100644 index 00000000..1fc3521f --- /dev/null +++ b/src/mango/lexing/code_lexer.rs @@ -0,0 +1,59 @@ +use mango::io::typ::Reader; +use mango::io::typ::ReaderResult::*; +use mango::lexing::typ::Lexer; +use mango::lexing::typ::MaybeToken; +use mango::token::special::UnlexableToken; +use mango::token::tokens::ParenthesisCloseToken; +use mango::token::tokens::ParenthesisOpenToken; +use mango::token::Tokens; +use mango::util::codeparts::Keyword; +use std::collections::VecDeque; + +pub struct CodeLexer<'r> { + reader: &'r mut Reader, + indent: i32, + // This is unfortunate, would not be needed with 'yield' but is now for indents + buffer: VecDeque, +} + +impl<'r> CodeLexer<'r> { + fn new(reader: &'r mut Reader) -> Self { + CodeLexer { + reader, + indent: 0, + buffer: VecDeque::with_capacity(16), + } + } +} + +impl<'r> Lexer<'r> for CodeLexer<'r> { + fn lex(&mut self) -> MaybeToken { + // If there is a buffer due to indentation or continuations, return from that. + if !self.buffer.is_empty() { + return MaybeToken::Token(self.buffer.pop_front().unwrap()); + } + if let Match(word) = self.reader.matches("\\.\\.\\.") { + // Line continuation has no token, it just continues on the next line. + if let Match(word) = self.reader.matches("\\n\\r?") { + // There should always be a newline after continuations, so that they can be ignored together. + } else if let Match(word) = self.reader.matches("[^\\n]*\\n\\r?") { + return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new(word))); + } else { + // TODO: I don't know yet how to deal with continuation followed by end of file + panic!() + } + } + // Indentation done; do the rest of lexing. + if let Match(word) = self.reader.matches("(") { + return MaybeToken::Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + } + if let Match(word) = self.reader.matches(")") { + return MaybeToken::Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); + } + + // TODO: a lot more + + // TODO: specify the unlexable word + return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned()))); + } +} diff --git a/src/mango/lexing/comment_lexer.rs b/src/mango/lexing/comment_lexer.rs new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/mango/lexing/comment_lexer.rs @@ -0,0 +1 @@ + diff --git a/src/mango/lexing/mod.rs b/src/mango/lexing/mod.rs index 8b137891..7becc2f3 100644 --- a/src/mango/lexing/mod.rs +++ b/src/mango/lexing/mod.rs @@ -1 +1,5 @@ +pub mod typ; +pub mod code_lexer; + +pub mod comment_lexer; diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs new file mode 100644 index 00000000..864db178 --- /dev/null +++ b/src/mango/lexing/typ.rs @@ -0,0 +1,15 @@ +use mango::io::typ::Reader; +use mango::token::Tokens; + +pub enum MaybeToken { + Token(Tokens), + End(), +} + +pub trait Lexer<'r> { + // /// Create a new lexer from a reader instance. + // fn new(reader: &'r mut Reader) -> Self; + + /// Every call to lex returns a token until the end of the input. + fn lex(&mut self) -> MaybeToken; +} diff --git a/src/mango/token/mod.rs b/src/mango/token/mod.rs index 1d15baa1..94479f40 100644 --- a/src/mango/token/mod.rs +++ b/src/mango/token/mod.rs @@ -1,10 +1,10 @@ -mod tokens; +pub mod tokens; pub use self::tokens::*; -mod special; +pub mod special; pub use self::special::*; -mod collect; +pub mod collect; pub use self::collect::Token; pub use self::collect::Tokens; diff --git a/src/mango/token/special/mod.rs b/src/mango/token/special/mod.rs index 7c269fdb..e3ebf8d5 100644 --- a/src/mango/token/special/mod.rs +++ b/src/mango/token/special/mod.rs @@ -1,2 +1,2 @@ -mod unlexable; +pub mod unlexable; pub use self::unlexable::UnlexableToken; From f33692b5b84a1595137c86c2eaf61a0b86bdbfe7 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 22 May 2018 13:09:21 +0200 Subject: [PATCH 05/49] Add a Queue type (which just wraps deque) #so --- src/mango/util/collection/mod.rs | 2 ++ src/mango/util/collection/queue.rs | 23 +++++++++++++++++++++++ src/mango/util/mod.rs | 2 ++ 3 files changed, 27 insertions(+) create mode 100644 src/mango/util/collection/mod.rs create mode 100644 src/mango/util/collection/queue.rs diff --git a/src/mango/util/collection/mod.rs b/src/mango/util/collection/mod.rs new file mode 100644 index 00000000..5f327a57 --- /dev/null +++ b/src/mango/util/collection/mod.rs @@ -0,0 +1,2 @@ +pub mod queue; +pub use self::queue::Queue; diff --git a/src/mango/util/collection/queue.rs b/src/mango/util/collection/queue.rs new file mode 100644 index 00000000..02996c44 --- /dev/null +++ b/src/mango/util/collection/queue.rs @@ -0,0 +1,23 @@ +use std::collections::VecDeque; + +/// A one-ended queue. +/// This is just a wrapper around deque so nobody pushes or pops the wrong end. +pub struct Queue { + deque: VecDeque, +} + +impl Queue { + pub fn new() -> Self { + Queue { + deque: VecDeque::with_capacity(16), + } + } + + pub fn push(&mut self, value: T) { + self.deque.push_back(value) + } + + pub fn pop(&mut self) -> Option { + self.deque.pop_front() + } +} diff --git a/src/mango/util/mod.rs b/src/mango/util/mod.rs index 11224671..e6c649ab 100644 --- a/src/mango/util/mod.rs +++ b/src/mango/util/mod.rs @@ -1,3 +1,5 @@ +pub mod collection; + pub mod strtype; pub mod numtype; From 3728f4e7d27c88be05f7bf41e50617ae31666be8 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 22 May 2018 14:23:23 +0200 Subject: [PATCH 06/49] Add block start/end token and lexing #56 --- src/mango/lexing/code_lexer.rs | 63 +++++++++++++++++++++++++++++---- src/mango/lexing/typ.rs | 1 - src/mango/token/collect/all.rs | 6 ++++ src/mango/token/tokens/block.rs | 46 ++++++++++++++++++++++++ src/mango/token/tokens/mod.rs | 3 ++ 5 files changed, 111 insertions(+), 8 deletions(-) create mode 100644 src/mango/token/tokens/block.rs diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 1fc3521f..97a88b1c 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -3,17 +3,20 @@ use mango::io::typ::ReaderResult::*; use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; use mango::token::special::UnlexableToken; +use mango::token::tokens::EndBlockToken; +use mango::token::tokens::EndStatementToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; +use mango::token::tokens::StartBlockToken; use mango::token::Tokens; use mango::util::codeparts::Keyword; -use std::collections::VecDeque; +use mango::util::collection::Queue; pub struct CodeLexer<'r> { reader: &'r mut Reader, indent: i32, - // This is unfortunate, would not be needed with 'yield' but is now for indents - buffer: VecDeque, + // This is unfortunate, would not be needed with 'yield' but is now for indents. + buffer: Queue, } impl<'r> CodeLexer<'r> { @@ -21,17 +24,42 @@ impl<'r> CodeLexer<'r> { CodeLexer { reader, indent: 0, - buffer: VecDeque::with_capacity(16), + buffer: Queue::new(), } } + + fn lex_indents(&mut self) -> MaybeToken { + let mut line_indent = 0; + while let Match(_) = self.reader.matches("\\t") { + line_indent += 1; + } + for _ in line_indent..self.indent { + // This line is dedented, make end tokens. + if let Match(_) = self.reader.matches("end") { + // If this is followed by an 'end' keyword, then that 'end' is redundant. + self.buffer + .push(Tokens::EndBlock(EndBlockToken::new(true, true))); + } else { + self.buffer + .push(Tokens::EndBlock(EndBlockToken::new(true, false))); + } + } + for _ in self.indent..line_indent { + // This line is indented, make start tokens. + self.buffer.push(Tokens::StartBlock(StartBlockToken::new())); + } + self.indent = line_indent; + self.lex() + } } impl<'r> Lexer<'r> for CodeLexer<'r> { fn lex(&mut self) -> MaybeToken { // If there is a buffer due to indentation or continuations, return from that. - if !self.buffer.is_empty() { - return MaybeToken::Token(self.buffer.pop_front().unwrap()); + if let Some(token) = self.buffer.pop() { + return MaybeToken::Token(token); } + // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. if let Match(word) = self.reader.matches("\\.\\.\\.") { // Line continuation has no token, it just continues on the next line. if let Match(word) = self.reader.matches("\\n\\r?") { @@ -39,11 +67,32 @@ impl<'r> Lexer<'r> for CodeLexer<'r> { } else if let Match(word) = self.reader.matches("[^\\n]*\\n\\r?") { return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new(word))); } else { - // TODO: I don't know yet how to deal with continuation followed by end of file + // TODO: I don't know yet how to deal with ... followed by end-of-file panic!() } + // This is a new line, so there may be indents. + return self.lex_indents(); + } + if let Match(word) = self.reader.matches("\\n\\r?") { + // Newline WITHOUT line continuation. + return MaybeToken::Token(Tokens::EndStatement(EndStatementToken::new_end_line())); + } + if let Match(word) = self.reader.matches(";") { + // Semicolon, which ends a statement. + // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. + self.buffer + .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); + if let Match(word) = self.reader.matches("\\n\\r?") { + // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). + // This will return the queue of tokens, including the semicolon. + return self.lex_indents(); + } + // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). + return MaybeToken::Token(self.buffer.pop().unwrap()); } + // // Indentation done; do the rest of lexing. + // if let Match(word) = self.reader.matches("(") { return MaybeToken::Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); } diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs index 864db178..39295f76 100644 --- a/src/mango/lexing/typ.rs +++ b/src/mango/lexing/typ.rs @@ -1,4 +1,3 @@ -use mango::io::typ::Reader; use mango::token::Tokens; pub enum MaybeToken { diff --git a/src/mango/token/collect/all.rs b/src/mango/token/collect/all.rs index 37cd7853..17571b09 100644 --- a/src/mango/token/collect/all.rs +++ b/src/mango/token/collect/all.rs @@ -1,5 +1,6 @@ use mango::token::special::UnlexableToken; use mango::token::tokens::AssociationToken; +use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; use mango::token::tokens::IdentifierToken; use mango::token::tokens::KeywordToken; @@ -7,6 +8,7 @@ use mango::token::tokens::LiteralToken; use mango::token::tokens::OperatorToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; +use mango::token::tokens::StartBlockToken; use mango::util::encdec::ToText; /// Collection of all possible tokens. @@ -21,6 +23,8 @@ pub enum Tokens { ParenthesisClose(ParenthesisCloseToken), EndStatement(EndStatementToken), Unlexable(UnlexableToken), + StartBlock(StartBlockToken), + EndBlock(EndBlockToken), } impl ToText for Tokens { @@ -38,6 +42,8 @@ impl ToText for Tokens { ParenthesisClose(token) => token.to_text(), EndStatement(token) => token.to_text(), Unlexable(token) => token.to_text(), + StartBlock(token) => token.to_text(), + EndBlock(token) => token.to_text(), } } } diff --git a/src/mango/token/tokens/block.rs b/src/mango/token/tokens/block.rs new file mode 100644 index 00000000..64a3041f --- /dev/null +++ b/src/mango/token/tokens/block.rs @@ -0,0 +1,46 @@ +use mango::token::Token; +use mango::util::encdec::ToText; + +/// Start and end of blocks, signalled e.g. by indentation. +#[derive(Debug, PartialEq, Eq, Hash, Clone)] +pub struct StartBlockToken {} + +#[derive(Debug, PartialEq, Eq, Hash, Clone)] +pub struct EndBlockToken { + is_dedent: bool, + is_end_keyword: bool, +} + +impl StartBlockToken { + pub fn new() -> Self { + StartBlockToken {} + } +} + +impl EndBlockToken { + pub fn new(is_dedent: bool, is_end_keyword: bool) -> Self { + assert!(is_dedent || is_end_keyword); + EndBlockToken { + is_dedent, + is_end_keyword, + } + } +} + +impl ToText for StartBlockToken { + // TODO: needs context information to render indents + fn to_text(&self) -> String { + " { ".to_owned() + } +} + +impl ToText for EndBlockToken { + // TODO: needs context information to render indents + fn to_text(&self) -> String { + " } ".to_owned() + } +} + +impl Token for StartBlockToken {} + +impl Token for EndBlockToken {} diff --git a/src/mango/token/tokens/mod.rs b/src/mango/token/tokens/mod.rs index 4508d768..3dfa133a 100644 --- a/src/mango/token/tokens/mod.rs +++ b/src/mango/token/tokens/mod.rs @@ -22,3 +22,6 @@ pub use self::keyword::KeywordToken; pub mod end_statement; pub use self::end_statement::EndStatementToken; + +pub mod block; +pub use self::block::{EndBlockToken, StartBlockToken}; From ea5a4cd40801d2f964b71e255074b1cce9ed5514 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 22 May 2018 17:39:30 +0200 Subject: [PATCH 07/49] Add identifier and keyword lexing #56 --- src/mango/lexing/code_lexer.rs | 54 +++++++++++++++++++--------- src/mango/token/tokens/identifier.rs | 4 +++ src/mango/util/strtype/name.rs | 9 ++++- 3 files changed, 49 insertions(+), 18 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 97a88b1c..00eda5b9 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -5,11 +5,12 @@ use mango::lexing::typ::MaybeToken; use mango::token::special::UnlexableToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; +use mango::token::tokens::IdentifierToken; +use mango::token::tokens::KeywordToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; use mango::token::Tokens; -use mango::util::codeparts::Keyword; use mango::util::collection::Queue; pub struct CodeLexer<'r> { @@ -35,6 +36,7 @@ impl<'r> CodeLexer<'r> { } for _ in line_indent..self.indent { // This line is dedented, make end tokens. + // TODO: turn this "new" into a constant if let Match(_) = self.reader.matches("end") { // If this is followed by an 'end' keyword, then that 'end' is redundant. self.buffer @@ -55,17 +57,19 @@ impl<'r> CodeLexer<'r> { impl<'r> Lexer<'r> for CodeLexer<'r> { fn lex(&mut self) -> MaybeToken { + use self::MaybeToken::*; + // If there is a buffer due to indentation or continuations, return from that. if let Some(token) = self.buffer.pop() { - return MaybeToken::Token(token); + return Token(token); } // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. - if let Match(word) = self.reader.matches("\\.\\.\\.") { + if let Match(_) = self.reader.matches("\\.\\.\\.") { // Line continuation has no token, it just continues on the next line. - if let Match(word) = self.reader.matches("\\n\\r?") { + if let Match(_) = self.reader.matches("\\n\\r?") { // There should always be a newline after continuations, so that they can be ignored together. } else if let Match(word) = self.reader.matches("[^\\n]*\\n\\r?") { - return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new(word))); + return Token(Tokens::Unlexable(UnlexableToken::new(word))); } else { // TODO: I don't know yet how to deal with ... followed by end-of-file panic!() @@ -73,36 +77,52 @@ impl<'r> Lexer<'r> for CodeLexer<'r> { // This is a new line, so there may be indents. return self.lex_indents(); } - if let Match(word) = self.reader.matches("\\n\\r?") { + if let Match(_) = self.reader.matches("\\n\\r?") { // Newline WITHOUT line continuation. - return MaybeToken::Token(Tokens::EndStatement(EndStatementToken::new_end_line())); + return Token(Tokens::EndStatement(EndStatementToken::new_end_line())); } - if let Match(word) = self.reader.matches(";") { + if let Match(_) = self.reader.matches(";") { // Semicolon, which ends a statement. // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. self.buffer .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); - if let Match(word) = self.reader.matches("\\n\\r?") { + if let Match(_) = self.reader.matches("\\n\\r?") { // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). // This will return the queue of tokens, including the semicolon. return self.lex_indents(); } // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). - return MaybeToken::Token(self.buffer.pop().unwrap()); + return Token(self.buffer.pop().unwrap()); } // // Indentation done; do the rest of lexing. // - if let Match(word) = self.reader.matches("(") { - return MaybeToken::Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + // Parse identifers and keywords. This assumes that keywords are a subset of identifiers. + if let Match(word) = self.reader.matches(IdentifierToken::subpattern()) { + // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + return Token(Tokens::Keyword(keyword)); + } + return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); } - if let Match(word) = self.reader.matches(")") { - return MaybeToken::Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); + // Literal + // todo + // if let Match(word) = self.reader.matches(LiteralToken::subpattern()) { + // return Token(LiteralToken::Literal(IdentifierToken::from_str(word).unwrap())); + // } + // Operator + // todo + // Association + // todo + // Grouping symbols + if let Match(_) = self.reader.matches("(") { + return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + } + if let Match(_) = self.reader.matches(")") { + return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); } - - // TODO: a lot more // TODO: specify the unlexable word - return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned()))); + return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned()))); } } diff --git a/src/mango/token/tokens/identifier.rs b/src/mango/token/tokens/identifier.rs index c83555d3..2485fe61 100644 --- a/src/mango/token/tokens/identifier.rs +++ b/src/mango/token/tokens/identifier.rs @@ -19,6 +19,10 @@ impl IdentifierToken { pub fn from_name(name: Name) -> Self { IdentifierToken { name } } + + pub fn subpattern() -> &'static str { + Name::subpattern() + } } impl ToText for IdentifierToken { diff --git a/src/mango/util/strtype/name.rs b/src/mango/util/strtype/name.rs index 0405d927..5940158a 100644 --- a/src/mango/util/strtype/name.rs +++ b/src/mango/util/strtype/name.rs @@ -6,8 +6,10 @@ use std::fmt; use std::sync::Mutex; use string_interner::StringInterner; +const VALID_IDENTIFIER_SUBPATTERN: &'static str = r"[a-zA-Z_][a-zA-Z0-9_]*"; lazy_static! { - static ref VALID_IDENTIFIER: Regex = Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*$").unwrap(); + static ref VALID_IDENTIFIER: Regex = + Regex::new(&format!("{}{}{}", r"^", VALID_IDENTIFIER_SUBPATTERN, r"$")).unwrap(); } // TODO: this alias just for https://github.com/rust-lang-nursery/rustfmt/issues/2610 @@ -38,6 +40,11 @@ impl Name { .unwrap() .to_string() } + + /// Generate an eager subpattern to match names, that can be composed in a regular expression. + pub fn subpattern() -> &'static str { + &VALID_IDENTIFIER_SUBPATTERN.clone() + } } impl fmt::Display for Name { From 03b152a090346de3faab1211f20d0c3498fdecf7 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 22 May 2018 21:44:11 +0200 Subject: [PATCH 08/49] Progress on sublexers and new tokens, fighting borrow rules #56 --- src/mango/lexing/code_lexer.rs | 184 ++++++++++++++++++------------ src/mango/lexing/mod.rs | 2 + src/mango/lexing/string_lexer.rs | 45 ++++++++ src/mango/lexing/typ.rs | 7 +- src/mango/token/tokens/literal.rs | 4 + 5 files changed, 170 insertions(+), 72 deletions(-) create mode 100644 src/mango/lexing/string_lexer.rs diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 00eda5b9..1fc8bb29 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -1,5 +1,6 @@ use mango::io::typ::Reader; use mango::io::typ::ReaderResult::*; +use mango::lexing::string_lexer::StringLexer; use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; use mango::token::special::UnlexableToken; @@ -13,31 +14,49 @@ use mango::token::tokens::StartBlockToken; use mango::token::Tokens; use mango::util::collection::Queue; -pub struct CodeLexer<'r> { - reader: &'r mut Reader, +enum ReaderOrDelegate { + Reader(Box), + Delegate(Box), +} + +impl ReaderOrDelegate { + fn end_delegation(&mut self) { + *self = match self { + ReaderOrDelegate::Delegate(delegate) => ReaderOrDelegate::Reader(delegate.consume()), + ReaderOrDelegate::Reader(reader) => ReaderOrDelegate::Reader(*reader), + } + } +} + +pub struct CodeLexer { + // reader: Option<&'r mut Reader>, indent: i32, + // TODO: both of the next two would be unnecessary with generators... + // This delegate deals with nested structures, like string literals and comments. + // delegate: Option<&'r mut Lexer<'r>>, + reader_or_delegate: ReaderOrDelegate, // This is unfortunate, would not be needed with 'yield' but is now for indents. buffer: Queue, } -impl<'r> CodeLexer<'r> { - fn new(reader: &'r mut Reader) -> Self { +impl CodeLexer { + fn new(reader: Box) -> Self { CodeLexer { - reader, + reader_or_delegate: ReaderOrDelegate::Reader(reader), indent: 0, buffer: Queue::new(), } } - fn lex_indents(&mut self) -> MaybeToken { + fn lex_indents(&mut self, reader: &mut Box) -> MaybeToken { let mut line_indent = 0; - while let Match(_) = self.reader.matches("\\t") { + while let Match(_) = reader.matches("\\t") { line_indent += 1; } for _ in line_indent..self.indent { // This line is dedented, make end tokens. // TODO: turn this "new" into a constant - if let Match(_) = self.reader.matches("end") { + if let Match(_) = reader.matches("end") { // If this is followed by an 'end' keyword, then that 'end' is redundant. self.buffer .push(Tokens::EndBlock(EndBlockToken::new(true, true))); @@ -55,74 +74,99 @@ impl<'r> CodeLexer<'r> { } } -impl<'r> Lexer<'r> for CodeLexer<'r> { +impl Lexer for CodeLexer { fn lex(&mut self) -> MaybeToken { use self::MaybeToken::*; - // If there is a buffer due to indentation or continuations, return from that. - if let Some(token) = self.buffer.pop() { - return Token(token); - } - // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. - if let Match(_) = self.reader.matches("\\.\\.\\.") { - // Line continuation has no token, it just continues on the next line. - if let Match(_) = self.reader.matches("\\n\\r?") { - // There should always be a newline after continuations, so that they can be ignored together. - } else if let Match(word) = self.reader.matches("[^\\n]*\\n\\r?") { - return Token(Tokens::Unlexable(UnlexableToken::new(word))); - } else { - // TODO: I don't know yet how to deal with ... followed by end-of-file - panic!() - } - // This is a new line, so there may be indents. - return self.lex_indents(); - } - if let Match(_) = self.reader.matches("\\n\\r?") { - // Newline WITHOUT line continuation. - return Token(Tokens::EndStatement(EndStatementToken::new_end_line())); - } - if let Match(_) = self.reader.matches(";") { - // Semicolon, which ends a statement. - // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. - self.buffer - .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); - if let Match(_) = self.reader.matches("\\n\\r?") { - // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). - // This will return the queue of tokens, including the semicolon. - return self.lex_indents(); + // If currently delegating to a sub-lexer, return from that. + match self.reader_or_delegate { + ReaderOrDelegate::Delegate(ref mut delegate) => { + match delegate.lex() { + Token(token) => Token(token), + End => self.lex(), + } + // Code to stop delegation cannot be here, because `self` is still mutably borrowed through `delegate` } - // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). - return Token(self.buffer.pop().unwrap()); - } - // - // Indentation done; do the rest of lexing. - // - // Parse identifers and keywords. This assumes that keywords are a subset of identifiers. - if let Match(word) = self.reader.matches(IdentifierToken::subpattern()) { - // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... - if let Ok(keyword) = KeywordToken::from_str(word.clone()) { - return Token(Tokens::Keyword(keyword)); + ReaderOrDelegate::Reader(mut reader) => { + // todo: maybe this branch could be a separate function? + + // If there is a buffer due to indentation or continuations, return from that. + if let Some(token) = self.buffer.pop() { + return Token(token); + } + // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. + if let Match(_) = reader.matches("\\.\\.\\.") { + // Line continuation has no token, it just continues on the next line. + if let Match(_) = reader.matches("\\n\\r?") { + // There should always be a newline after continuations, so that they can be ignored together. + } else if let Match(word) = reader.matches("[^\\n]*\\n\\r?") { + return Token(Tokens::Unlexable(UnlexableToken::new(word))); + } else { + // TODO: I don't know yet how to deal with ... followed by end-of-file + panic!() + } + // This is a new line, so there may be indents. + return self.lex_indents(&mut reader); + } + if let Match(_) = reader.matches("\\n\\r?") { + // Newline WITHOUT line continuation. + return Token(Tokens::EndStatement(EndStatementToken::new_end_line())); + } + if let Match(_) = reader.matches(";") { + // Semicolon, which ends a statement. + // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. + self.buffer + .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); + if let Match(_) = reader.matches("\\n\\r?") { + // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). + // This will return the queue of tokens, including the semicolon. + return self.lex_indents(&mut reader); + } + // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). + return Token(self.buffer.pop().unwrap()); + } + // + // Indentation done; do the rest of lexing. + // + // Parse identifers and keywords. This assumes that keywords are a subset of identifiers. + if let Match(word) = reader.matches(IdentifierToken::subpattern()) { + // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + return Token(Tokens::Keyword(keyword)); + } + return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); + } + // Literal + if let Match(word) = reader.matches("[a-z]?\"") { + // TODO: need to keep delegating to this until it exhausts, how to do that? + self.reader_or_delegate = ReaderOrDelegate::Delegate(Box::new( + StringLexer::new_double_quoted(reader), + )); + return self.lex(); + } + // Operator + // todo + // Association + // todo + // Grouping symbols + if let Match(_) = reader.matches("(") { + return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + } + if let Match(_) = reader.matches(")") { + return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); + } + + // TODO: specify the unlexable word + return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned()))); } - return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); - } - // Literal - // todo - // if let Match(word) = self.reader.matches(LiteralToken::subpattern()) { - // return Token(LiteralToken::Literal(IdentifierToken::from_str(word).unwrap())); - // } - // Operator - // todo - // Association - // todo - // Grouping symbols - if let Match(_) = self.reader.matches("(") { - return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); - } - if let Match(_) = self.reader.matches(")") { - return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); } + } - // TODO: specify the unlexable word - return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned()))); + fn consume(self) -> Box { + assert!(false, "I do not think this is ever called, is it?"); + match self.reader_or_delegate { + ReaderOrDelegate::Reader(reader) => reader, + ReaderOrDelegate::Delegate(delegate) => delegate.consume(), + } } } diff --git a/src/mango/lexing/mod.rs b/src/mango/lexing/mod.rs index 7becc2f3..bb88a815 100644 --- a/src/mango/lexing/mod.rs +++ b/src/mango/lexing/mod.rs @@ -3,3 +3,5 @@ pub mod typ; pub mod code_lexer; pub mod comment_lexer; + +pub mod string_lexer; diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs new file mode 100644 index 00000000..d3f0cf2b --- /dev/null +++ b/src/mango/lexing/string_lexer.rs @@ -0,0 +1,45 @@ +use mango::io::typ::Reader; +use mango::io::typ::ReaderResult::*; +use mango::lexing::typ::Lexer; +use mango::lexing::typ::MaybeToken; +use mango::token::tokens::LiteralToken; +use mango::token::Tokens; + +pub enum StringType { + SingleQuotedInline, + DoubleQuotedInline, + MultiLine, +} + +/// Lexes a string literal token. +// Starts after the opening quote and expected to consume until closing quote. +pub struct StringLexer { + reader: Box, + typ: StringType, +} + +impl StringLexer { + // TODO: support other types of strings + pub fn new_double_quoted(reader: Box) -> Self { + StringLexer { + reader, + typ: StringType::DoubleQuotedInline, + } + } +} + +impl Lexer for StringLexer { + fn lex(&mut self) -> MaybeToken { + // TODO: doesn't handle escaping etc at all now + // TODO: this is going to have a problem if `matches` automatically eats whitespace + match self.reader.matches("[^\"\\n]*") { + Match(value) => return MaybeToken::Token(Tokens::Literal(LiteralToken::string(value))), + NoMatch() => panic!("failed to parse string"), // This can't really go wrong since empty pattern matches + EOF() => return MaybeToken::Token(Tokens::Literal(LiteralToken::string("".to_owned()))), // Unclosed string literal, let code parser deal with it + } + } + + fn consume(self) -> Box { + self.reader + } +} diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs index 39295f76..063389e4 100644 --- a/src/mango/lexing/typ.rs +++ b/src/mango/lexing/typ.rs @@ -1,14 +1,17 @@ +use mango::io::typ::Reader; use mango::token::Tokens; pub enum MaybeToken { Token(Tokens), - End(), + End, } -pub trait Lexer<'r> { +pub trait Lexer { // /// Create a new lexer from a reader instance. // fn new(reader: &'r mut Reader) -> Self; /// Every call to lex returns a token until the end of the input. fn lex(&mut self) -> MaybeToken; + + fn consume(self) -> Box; } diff --git a/src/mango/token/tokens/literal.rs b/src/mango/token/tokens/literal.rs index a0953882..2065dcda 100644 --- a/src/mango/token/tokens/literal.rs +++ b/src/mango/token/tokens/literal.rs @@ -25,6 +25,10 @@ impl LiteralToken { pub fn real(value: f64) -> LiteralToken { LiteralToken::Real(f64eq::new(value)) } + + pub fn subpattern_int() -> &'static str { + "[a-z]?\"" + } } impl ToText for LiteralToken { From 219463b30c2f6172e8cc7db0a0f8bd6a6fca3e10 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 25 May 2018 18:37:14 +0200 Subject: [PATCH 09/49] Trying to make delegated lexer compile but not yet #56 --- src/mango/lexing/code_lexer.rs | 11 ++++++----- src/mango/lexing/string_lexer.rs | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 1fc8bb29..26fccf7c 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -20,10 +20,11 @@ enum ReaderOrDelegate { } impl ReaderOrDelegate { - fn end_delegation(&mut self) { - *self = match self { - ReaderOrDelegate::Delegate(delegate) => ReaderOrDelegate::Reader(delegate.consume()), - ReaderOrDelegate::Reader(reader) => ReaderOrDelegate::Reader(*reader), + fn end_delegation(self) -> Self { + use self::ReaderOrDelegate::*; + match self { + Delegate(delegate) => Reader(delegate.consume()), + read => read, } } } @@ -87,7 +88,7 @@ impl Lexer for CodeLexer { } // Code to stop delegation cannot be here, because `self` is still mutably borrowed through `delegate` } - ReaderOrDelegate::Reader(mut reader) => { + ReaderOrDelegate::Reader(ref mut reader) => { // todo: maybe this branch could be a separate function? // If there is a buffer due to indentation or continuations, return from that. diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs index d3f0cf2b..de5c72da 100644 --- a/src/mango/lexing/string_lexer.rs +++ b/src/mango/lexing/string_lexer.rs @@ -30,6 +30,7 @@ impl StringLexer { impl Lexer for StringLexer { fn lex(&mut self) -> MaybeToken { + // TODO: perhaps there's a library that does parsing a string with escape characters // TODO: doesn't handle escaping etc at all now // TODO: this is going to have a problem if `matches` automatically eats whitespace match self.reader.matches("[^\"\\n]*") { From ed266860a4632e809aeffb94ffcdaf82343d353e Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 29 May 2018 07:20:36 +0200 Subject: [PATCH 10/49] Partially rewrite the lexer delegation to use Rc #52 --- src/mango/lexing/code_lexer.rs | 61 ++++++++++++++++---------------- src/mango/lexing/string_lexer.rs | 12 ++++--- src/mango/lexing/typ.rs | 6 +++- 3 files changed, 42 insertions(+), 37 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 26fccf7c..8a2e3b1b 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -13,35 +13,29 @@ use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; use mango::token::Tokens; use mango::util::collection::Queue; +use std::cell::RefCell; +use std::rc::Rc; + +// TODO: Preferably there'd be only one Lexer at a time which has a Reader, but I did not get that to work, +// TODO: see this SO question: https://stackoverflow.com/questions/50535022/borrow-checker-problems-for-parser-that-can-delegate enum ReaderOrDelegate { - Reader(Box), + Reader(Rc>), Delegate(Box), } -impl ReaderOrDelegate { - fn end_delegation(self) -> Self { - use self::ReaderOrDelegate::*; - match self { - Delegate(delegate) => Reader(delegate.consume()), - read => read, - } - } -} - pub struct CodeLexer { - // reader: Option<&'r mut Reader>, + // reader: Rc>, indent: i32, - // TODO: both of the next two would be unnecessary with generators... + // This delegate deals with nested structures, like string literals and comments. - // delegate: Option<&'r mut Lexer<'r>>, reader_or_delegate: ReaderOrDelegate, // This is unfortunate, would not be needed with 'yield' but is now for indents. buffer: Queue, } impl CodeLexer { - fn new(reader: Box) -> Self { + fn new(reader: Rc>) -> Self { CodeLexer { reader_or_delegate: ReaderOrDelegate::Reader(reader), indent: 0, @@ -49,7 +43,7 @@ impl CodeLexer { } } - fn lex_indents(&mut self, reader: &mut Box) -> MaybeToken { + fn lex_indents(&mut self, reader: &mut Reader) -> MaybeToken { let mut line_indent = 0; while let Match(_) = reader.matches("\\t") { line_indent += 1; @@ -82,13 +76,20 @@ impl Lexer for CodeLexer { // If currently delegating to a sub-lexer, return from that. match self.reader_or_delegate { ReaderOrDelegate::Delegate(ref mut delegate) => { - match delegate.lex() { + let delegated_token = delegate.lex(); + match delegated_token { + End => { + // Swap back from delegation to direct mode. + let reader = delegate.get_reader().clone(); + self.reader_or_delegate = ReaderOrDelegate::Reader(reader); + self.lex() + } Token(token) => Token(token), - End => self.lex(), } // Code to stop delegation cannot be here, because `self` is still mutably borrowed through `delegate` } - ReaderOrDelegate::Reader(ref mut reader) => { + ReaderOrDelegate::Reader(ref mut reader_refcell) => { + let mut reader = reader_refcell.borrow_mut(); // todo: maybe this branch could be a separate function? // If there is a buffer due to indentation or continuations, return from that. @@ -103,11 +104,11 @@ impl Lexer for CodeLexer { } else if let Match(word) = reader.matches("[^\\n]*\\n\\r?") { return Token(Tokens::Unlexable(UnlexableToken::new(word))); } else { - // TODO: I don't know yet how to deal with ... followed by end-of-file + // TODO: I don't know yet how to deal with '...' followed by end-of-file panic!() } // This is a new line, so there may be indents. - return self.lex_indents(&mut reader); + return self.lex_indents(reader); } if let Match(_) = reader.matches("\\n\\r?") { // Newline WITHOUT line continuation. @@ -121,7 +122,7 @@ impl Lexer for CodeLexer { if let Match(_) = reader.matches("\\n\\r?") { // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). // This will return the queue of tokens, including the semicolon. - return self.lex_indents(&mut reader); + return self.lex_indents(reader); } // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). return Token(self.buffer.pop().unwrap()); @@ -138,11 +139,10 @@ impl Lexer for CodeLexer { return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); } // Literal - if let Match(word) = reader.matches("[a-z]?\"") { - // TODO: need to keep delegating to this until it exhausts, how to do that? - self.reader_or_delegate = ReaderOrDelegate::Delegate(Box::new( - StringLexer::new_double_quoted(reader), - )); + if let Match(_) = reader.matches("[a-z]?\"") { + let sublexer: Box = + Box::new(StringLexer::new_double_quoted(reader_refcell.clone())); + self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); return self.lex(); } // Operator @@ -163,11 +163,10 @@ impl Lexer for CodeLexer { } } - fn consume(self) -> Box { - assert!(false, "I do not think this is ever called, is it?"); + fn get_reader(&self) -> &Rc> { match self.reader_or_delegate { - ReaderOrDelegate::Reader(reader) => reader, - ReaderOrDelegate::Delegate(delegate) => delegate.consume(), + ReaderOrDelegate::Reader(reader) => &reader, + ReaderOrDelegate::Delegate(delegate) => delegate.get_reader(), } } } diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs index de5c72da..4218b871 100644 --- a/src/mango/lexing/string_lexer.rs +++ b/src/mango/lexing/string_lexer.rs @@ -4,6 +4,8 @@ use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; use mango::token::tokens::LiteralToken; use mango::token::Tokens; +use std::cell::RefCell; +use std::rc::Rc; pub enum StringType { SingleQuotedInline, @@ -14,13 +16,13 @@ pub enum StringType { /// Lexes a string literal token. // Starts after the opening quote and expected to consume until closing quote. pub struct StringLexer { - reader: Box, + reader: Rc>, typ: StringType, } impl StringLexer { // TODO: support other types of strings - pub fn new_double_quoted(reader: Box) -> Self { + pub fn new_double_quoted(reader: Rc>) -> Self { StringLexer { reader, typ: StringType::DoubleQuotedInline, @@ -33,14 +35,14 @@ impl Lexer for StringLexer { // TODO: perhaps there's a library that does parsing a string with escape characters // TODO: doesn't handle escaping etc at all now // TODO: this is going to have a problem if `matches` automatically eats whitespace - match self.reader.matches("[^\"\\n]*") { + match self.reader.borrow_mut().matches("[^\"\\n]*") { Match(value) => return MaybeToken::Token(Tokens::Literal(LiteralToken::string(value))), NoMatch() => panic!("failed to parse string"), // This can't really go wrong since empty pattern matches EOF() => return MaybeToken::Token(Tokens::Literal(LiteralToken::string("".to_owned()))), // Unclosed string literal, let code parser deal with it } } - fn consume(self) -> Box { - self.reader + fn get_reader(&self) -> &Rc> { + &self.reader } } diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs index 063389e4..d82b19e1 100644 --- a/src/mango/lexing/typ.rs +++ b/src/mango/lexing/typ.rs @@ -1,5 +1,7 @@ use mango::io::typ::Reader; use mango::token::Tokens; +use std::cell::RefCell; +use std::rc::Rc; pub enum MaybeToken { Token(Tokens), @@ -10,8 +12,10 @@ pub trait Lexer { // /// Create a new lexer from a reader instance. // fn new(reader: &'r mut Reader) -> Self; + // fn new(reader: Rc>); + /// Every call to lex returns a token until the end of the input. fn lex(&mut self) -> MaybeToken; - fn consume(self) -> Box; + fn get_reader(&self) -> &Rc>; } From d3426ec4f27ef884bfeb6e0387af6f9f430e71d7 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 29 May 2018 07:56:43 +0200 Subject: [PATCH 11/49] Trying to solve everything for Rc mode, but still lifetime problems #52 --- src/mango/lexing/code_lexer.rs | 12 +++++++----- src/mango/lexing/string_lexer.rs | 4 ++-- src/mango/lexing/typ.rs | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 8a2e3b1b..9e3505eb 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -1,3 +1,5 @@ +#![feature(nll)] + use mango::io::typ::Reader; use mango::io::typ::ReaderResult::*; use mango::lexing::string_lexer::StringLexer; @@ -108,7 +110,7 @@ impl Lexer for CodeLexer { panic!() } // This is a new line, so there may be indents. - return self.lex_indents(reader); + return self.lex_indents(&mut *reader); } if let Match(_) = reader.matches("\\n\\r?") { // Newline WITHOUT line continuation. @@ -122,7 +124,7 @@ impl Lexer for CodeLexer { if let Match(_) = reader.matches("\\n\\r?") { // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). // This will return the queue of tokens, including the semicolon. - return self.lex_indents(reader); + return self.lex_indents(&mut *reader); } // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). return Token(self.buffer.pop().unwrap()); @@ -163,10 +165,10 @@ impl Lexer for CodeLexer { } } - fn get_reader(&self) -> &Rc> { + fn get_reader(&self) -> Rc> { match self.reader_or_delegate { - ReaderOrDelegate::Reader(reader) => &reader, - ReaderOrDelegate::Delegate(delegate) => delegate.get_reader(), + ReaderOrDelegate::Reader(ref reader) => reader.clone(), + ReaderOrDelegate::Delegate(ref delegate) => delegate.get_reader(), } } } diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs index 4218b871..8e4adc83 100644 --- a/src/mango/lexing/string_lexer.rs +++ b/src/mango/lexing/string_lexer.rs @@ -42,7 +42,7 @@ impl Lexer for StringLexer { } } - fn get_reader(&self) -> &Rc> { - &self.reader + fn get_reader(&self) -> Rc> { + self.reader.clone() } } diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs index d82b19e1..8ea53ba5 100644 --- a/src/mango/lexing/typ.rs +++ b/src/mango/lexing/typ.rs @@ -17,5 +17,5 @@ pub trait Lexer { /// Every call to lex returns a token until the end of the input. fn lex(&mut self) -> MaybeToken; - fn get_reader(&self) -> &Rc>; + fn get_reader(&self) -> Rc>; } From 7aeb9ff75da3acfb22b33f24621e3f734d080381 Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 30 May 2018 21:43:58 +0200 Subject: [PATCH 12/49] I really liked the idea of enum approach but very much fed up with fighting the borrow checker #52 --- src/lib.rs | 2 ++ src/mango/lexing/code_lexer.rs | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 163b6c11..a7b08154 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +#![feature(nll)] + extern crate core; #[macro_use] extern crate lazy_static; diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 9e3505eb..c4351f23 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -1,5 +1,3 @@ -#![feature(nll)] - use mango::io::typ::Reader; use mango::io::typ::ReaderResult::*; use mango::lexing::string_lexer::StringLexer; From 12bee403999cc42b26b4fc9bcaadda06d5b19fe9 Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 30 May 2018 21:58:36 +0200 Subject: [PATCH 13/49] Compiles but still overlapping borrow at runtime #52 --- src/mango/lexing/code_lexer.rs | 48 +++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index c4351f23..f410028d 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -20,7 +20,7 @@ use std::rc::Rc; // TODO: see this SO question: https://stackoverflow.com/questions/50535022/borrow-checker-problems-for-parser-that-can-delegate enum ReaderOrDelegate { - Reader(Rc>), + Reader(), Delegate(Box), } @@ -28,6 +28,7 @@ pub struct CodeLexer { // reader: Rc>, indent: i32, + reader: Rc>, // This delegate deals with nested structures, like string literals and comments. reader_or_delegate: ReaderOrDelegate, // This is unfortunate, would not be needed with 'yield' but is now for indents. @@ -37,21 +38,22 @@ pub struct CodeLexer { impl CodeLexer { fn new(reader: Rc>) -> Self { CodeLexer { - reader_or_delegate: ReaderOrDelegate::Reader(reader), + reader: reader, + reader_or_delegate: ReaderOrDelegate::Reader(), indent: 0, buffer: Queue::new(), } } - fn lex_indents(&mut self, reader: &mut Reader) -> MaybeToken { + fn lex_indents(&mut self) -> MaybeToken { let mut line_indent = 0; - while let Match(_) = reader.matches("\\t") { + while let Match(_) = self.reader.borrow_mut().matches("\\t") { line_indent += 1; } for _ in line_indent..self.indent { // This line is dedented, make end tokens. // TODO: turn this "new" into a constant - if let Match(_) = reader.matches("end") { + if let Match(_) = self.reader.borrow_mut().matches("end") { // If this is followed by an 'end' keyword, then that 'end' is redundant. self.buffer .push(Tokens::EndBlock(EndBlockToken::new(true, true))); @@ -81,15 +83,14 @@ impl Lexer for CodeLexer { End => { // Swap back from delegation to direct mode. let reader = delegate.get_reader().clone(); - self.reader_or_delegate = ReaderOrDelegate::Reader(reader); + self.reader_or_delegate = ReaderOrDelegate::Reader(); self.lex() } Token(token) => Token(token), } // Code to stop delegation cannot be here, because `self` is still mutably borrowed through `delegate` } - ReaderOrDelegate::Reader(ref mut reader_refcell) => { - let mut reader = reader_refcell.borrow_mut(); + ReaderOrDelegate::Reader() => { // todo: maybe this branch could be a separate function? // If there is a buffer due to indentation or continuations, return from that. @@ -97,32 +98,31 @@ impl Lexer for CodeLexer { return Token(token); } // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. - if let Match(_) = reader.matches("\\.\\.\\.") { + if let Match(_) = self.reader.borrow_mut().matches("\\.\\.\\.") { // Line continuation has no token, it just continues on the next line. - if let Match(_) = reader.matches("\\n\\r?") { + if let Match(_) = self.reader.borrow_mut().matches("\\n\\r?") { // There should always be a newline after continuations, so that they can be ignored together. - } else if let Match(word) = reader.matches("[^\\n]*\\n\\r?") { + } else if let Match(word) = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?") { return Token(Tokens::Unlexable(UnlexableToken::new(word))); } else { // TODO: I don't know yet how to deal with '...' followed by end-of-file panic!() } // This is a new line, so there may be indents. - return self.lex_indents(&mut *reader); + return self.lex_indents(); } - if let Match(_) = reader.matches("\\n\\r?") { + if let Match(_) = self.reader.borrow_mut().matches("\\n\\r?") { // Newline WITHOUT line continuation. return Token(Tokens::EndStatement(EndStatementToken::new_end_line())); } - if let Match(_) = reader.matches(";") { + if let Match(_) = self.reader.borrow_mut().matches(";") { // Semicolon, which ends a statement. // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. - self.buffer - .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); - if let Match(_) = reader.matches("\\n\\r?") { + self.buffer.push(Tokens::EndStatement(EndStatementToken::new_semicolon())); + if let Match(_) = self.reader.borrow_mut().matches("\\n\\r?") { // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). // This will return the queue of tokens, including the semicolon. - return self.lex_indents(&mut *reader); + return self.lex_indents(); } // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). return Token(self.buffer.pop().unwrap()); @@ -131,7 +131,7 @@ impl Lexer for CodeLexer { // Indentation done; do the rest of lexing. // // Parse identifers and keywords. This assumes that keywords are a subset of identifiers. - if let Match(word) = reader.matches(IdentifierToken::subpattern()) { + if let Match(word) = self.reader.borrow_mut().matches(IdentifierToken::subpattern()) { // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... if let Ok(keyword) = KeywordToken::from_str(word.clone()) { return Token(Tokens::Keyword(keyword)); @@ -139,9 +139,9 @@ impl Lexer for CodeLexer { return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); } // Literal - if let Match(_) = reader.matches("[a-z]?\"") { + if let Match(_) = self.reader.borrow_mut().matches("[a-z]?\"") { let sublexer: Box = - Box::new(StringLexer::new_double_quoted(reader_refcell.clone())); + Box::new(StringLexer::new_double_quoted(self.reader.clone())); self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); return self.lex(); } @@ -150,10 +150,10 @@ impl Lexer for CodeLexer { // Association // todo // Grouping symbols - if let Match(_) = reader.matches("(") { + if let Match(_) = self.reader.borrow_mut().matches("(") { return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); } - if let Match(_) = reader.matches(")") { + if let Match(_) = self.reader.borrow_mut().matches(")") { return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); } @@ -165,7 +165,7 @@ impl Lexer for CodeLexer { fn get_reader(&self) -> Rc> { match self.reader_or_delegate { - ReaderOrDelegate::Reader(ref reader) => reader.clone(), + ReaderOrDelegate::Reader() => self.reader.clone(), ReaderOrDelegate::Delegate(ref delegate) => delegate.get_reader(), } } From 58d9e67bd416f7f022a54b25a6d60dac652d03e7 Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 31 May 2018 20:55:29 +0200 Subject: [PATCH 14/49] Update test settings to match Rust upgrade #so --- .travis.yml | 1 + dev/hooks/pre-commit | 2 +- dev/hooks/utils/run_on_staged.py | 2 +- rustfmt.toml | 4 ---- 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 45b78e45..bf8b2cc6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,4 +6,5 @@ before_script: sudo: false cache: cargo script: + - cargo +nightly fmt --all -- --check - cargo test --all diff --git a/dev/hooks/pre-commit b/dev/hooks/pre-commit index a4b1873c..e99e8975 100755 --- a/dev/hooks/pre-commit +++ b/dev/hooks/pre-commit @@ -8,4 +8,4 @@ set -o pipefail util_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils" # Check that the formatting is correct -PYTHONPATH="$util_dir":$PYTHONPATH python3 "$util_dir/run_on_staged.py" 'cargo +nightly fmt --verbose --all -- --write-mode=diff' 'cargo test --all' +PYTHONPATH="$util_dir":$PYTHONPATH python3 "$util_dir/run_on_staged.py" 'cargo +nightly fmt --all -- --check' 'cargo test --all' diff --git a/dev/hooks/utils/run_on_staged.py b/dev/hooks/utils/run_on_staged.py index 036114de..a813b50f 100644 --- a/dev/hooks/utils/run_on_staged.py +++ b/dev/hooks/utils/run_on_staged.py @@ -21,7 +21,7 @@ def do_cmds(cmds): run(cmd, allow_stderr=True, log=True) except Exception as err: stderr.write(str(err)) - stderr.write('FAILED, cancelling commit\n') + stderr.write('\nFAILED, cancelling commit\n') return 1 return 0 diff --git a/rustfmt.toml b/rustfmt.toml index ce4866d5..44148a2d 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,5 +1 @@ -reorder_extern_crates = true -reorder_extern_crates_in_group = true reorder_imports = true -reorder_imports_in_group = true -reorder_imported_names = true From 0be80ebeae069df1f95afd34db03e8989f71ef9e Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 31 May 2018 20:56:39 +0200 Subject: [PATCH 15/49] With sacrifices to design and brevity, it now works! #52 --- src/mango/lexing/code_lexer.rs | 21 +++++++++++++++------ src/mango/util/format/strings.rs | 3 ++- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index f410028d..41af68c9 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -98,7 +98,8 @@ impl Lexer for CodeLexer { return Token(token); } // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. - if let Match(_) = self.reader.borrow_mut().matches("\\.\\.\\.") { + let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); + if let Match(_) = continue_match_res { // Line continuation has no token, it just continues on the next line. if let Match(_) = self.reader.borrow_mut().matches("\\n\\r?") { // There should always be a newline after continuations, so that they can be ignored together. @@ -115,11 +116,14 @@ impl Lexer for CodeLexer { // Newline WITHOUT line continuation. return Token(Tokens::EndStatement(EndStatementToken::new_end_line())); } - if let Match(_) = self.reader.borrow_mut().matches(";") { + let end_statement_match_res = self.reader.borrow_mut().matches(";"); + if let Match(_) = end_statement_match_res { // Semicolon, which ends a statement. // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. - self.buffer.push(Tokens::EndStatement(EndStatementToken::new_semicolon())); - if let Match(_) = self.reader.borrow_mut().matches("\\n\\r?") { + self.buffer + .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); + let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = end_line_match_res { // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). // This will return the queue of tokens, including the semicolon. return self.lex_indents(); @@ -131,7 +135,11 @@ impl Lexer for CodeLexer { // Indentation done; do the rest of lexing. // // Parse identifers and keywords. This assumes that keywords are a subset of identifiers. - if let Match(word) = self.reader.borrow_mut().matches(IdentifierToken::subpattern()) { + if let Match(word) = self + .reader + .borrow_mut() + .matches(IdentifierToken::subpattern()) + { // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... if let Ok(keyword) = KeywordToken::from_str(word.clone()) { return Token(Tokens::Keyword(keyword)); @@ -139,7 +147,8 @@ impl Lexer for CodeLexer { return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); } // Literal - if let Match(_) = self.reader.borrow_mut().matches("[a-z]?\"") { + let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); + if let Match(_) = string_match_res { let sublexer: Box = Box::new(StringLexer::new_double_quoted(self.reader.clone())); self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); diff --git a/src/mango/util/format/strings.rs b/src/mango/util/format/strings.rs index f4aab63c..7161c5a9 100644 --- a/src/mango/util/format/strings.rs +++ b/src/mango/util/format/strings.rs @@ -2,7 +2,8 @@ /// string when parsed by a typical language. pub fn to_double_quoted_str(txt: &str) -> String { // todo: performance? mostly I'd like to add the quotes as part of the stream, but it seems difficult - let esc: String = txt.chars() + let esc: String = txt + .chars() .map(|c| match c { '\\' => r"\\".to_string(), '\"' => "\\\"".to_string(), From d3a555b14627f4d886c3fe99b9afdb4213e57a72 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 1 Jun 2018 21:51:27 +0200 Subject: [PATCH 16/49] Add operator lexing #52 --- src/mango/lexing/code_lexer.rs | 11 +++++++++-- src/mango/token/tokens/operator.rs | 4 ++++ src/mango/util/codeparts/operator.rs | 5 +++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 41af68c9..5574f253 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -8,6 +8,7 @@ use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; use mango::token::tokens::IdentifierToken; use mango::token::tokens::KeywordToken; +use mango::token::tokens::OperatorToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; @@ -134,7 +135,7 @@ impl Lexer for CodeLexer { // // Indentation done; do the rest of lexing. // - // Parse identifers and keywords. This assumes that keywords are a subset of identifiers. + // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. if let Match(word) = self .reader .borrow_mut() @@ -155,7 +156,13 @@ impl Lexer for CodeLexer { return self.lex(); } // Operator - // todo + let operator_match_res = self + .reader + .borrow_mut() + .matches(OperatorToken::subpattern()); + if let Match(token) = operator_match_res { + return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); + } // Association // todo // Grouping symbols diff --git a/src/mango/token/tokens/operator.rs b/src/mango/token/tokens/operator.rs index eb19db24..4515887e 100644 --- a/src/mango/token/tokens/operator.rs +++ b/src/mango/token/tokens/operator.rs @@ -34,6 +34,10 @@ impl OperatorToken { pub fn is_mult_div(&self) -> bool { self.symbol == Symbol::Asterisk || self.symbol == Symbol::Slash } + + pub fn subpattern() -> &'static str { + Symbol::subpattern() + } } impl ToText for OperatorToken { diff --git a/src/mango/util/codeparts/operator.rs b/src/mango/util/codeparts/operator.rs index 00dae4f0..71d37b33 100644 --- a/src/mango/util/codeparts/operator.rs +++ b/src/mango/util/codeparts/operator.rs @@ -27,6 +27,11 @@ impl Symbol { ))), } } + + /// Generate an eager subpattern to match tokens, that can be composed in a regular expression. + pub fn subpattern() -> &'static str { + r"(\+|\-|\*|\/)" + } } impl Display for Symbol { From a60d1444bc74e9b38242db81495d6746859d24f2 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 1 Jun 2018 23:25:34 +0200 Subject: [PATCH 17/49] Add string slicing by character #52 --- src/mango/util/mod.rs | 2 + src/mango/util/strslice/mod.rs | 4 ++ src/mango/util/strslice/slice.rs | 64 ++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 src/mango/util/strslice/mod.rs create mode 100644 src/mango/util/strslice/slice.rs diff --git a/src/mango/util/mod.rs b/src/mango/util/mod.rs index e6c649ab..8d0156e9 100644 --- a/src/mango/util/mod.rs +++ b/src/mango/util/mod.rs @@ -2,6 +2,8 @@ pub mod collection; pub mod strtype; +pub mod strslice; + pub mod numtype; pub mod signaltype; diff --git a/src/mango/util/strslice/mod.rs b/src/mango/util/strslice/mod.rs new file mode 100644 index 00000000..5846d94a --- /dev/null +++ b/src/mango/util/strslice/mod.rs @@ -0,0 +1,4 @@ +pub mod slice; +pub use self::slice::charslice; +pub use self::slice::charslicefrom; +pub use self::slice::charsliceto; diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs new file mode 100644 index 00000000..97bb04e7 --- /dev/null +++ b/src/mango/util/strslice/slice.rs @@ -0,0 +1,64 @@ +/// Take a character-based slice of a string (as opposed to the default byte-slice). +/// Allows negative indices to slice from the end (but start must be before end). +/// This may not be very fast. +pub fn charslice>(text: S, start: isize, end: isize) -> String { + let stext = text.into(); + let from: usize; + let length: usize; + if start < 0 { + // LATER: may remove this check and just default to 0 in the future. + assert!( + -start as usize <= stext.len(), + "charslice: if 'start' is negative, the magnitude may not exceed the length" + ); + // TODO: off by one? + from = (stext.len() as isize + start) as usize; + } else { + from = start as usize; + } + if end < 0 { + // LATER: may remove this check and just default to 0 in the future. + assert!( + -end as usize <= stext.len(), + "charslice: if 'end' is negative, the magnitude may not exceed the length" + ); + // TODO: off by one? + let new_end = (stext.len() as isize + end) as usize; + assert!( + new_end >= from, + "charslice: 'start' may not be before 'end' (end was positive)" + ); + length = new_end - from; + } else { + assert!( + end >= from as isize, + "charslice: 'start' may not be before 'end' (end was positive)" + ); + length = end as usize - from; + } + stext.chars().skip(from).take(length).collect() +} + +pub fn charslicefrom>(text: S, start: isize) -> String { + let stext = text.into(); + let len = stext.len() as isize; + charslice(stext, start, len) +} + +pub fn charsliceto>(text: S, end: isize) -> String { + charslice(text, 0, end) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_slice() { + assert_eq!(42isize as usize, 42usize); + assert_eq!("你好", charslice("你好!", 0, 2)); + assert_eq!("!", charslicefrom("你好!", 2)); + assert_eq!("你好", charsliceto("你好!", 2)); + // TODO: test negative values + } +} From cab34baeeb77dc0075efaf294b30985ef74af081 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 1 Jun 2018 23:32:33 +0200 Subject: [PATCH 18/49] Fix and add tests for negative slices #52 --- src/mango/util/strslice/slice.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs index 97bb04e7..88eea284 100644 --- a/src/mango/util/strslice/slice.rs +++ b/src/mango/util/strslice/slice.rs @@ -5,25 +5,28 @@ pub fn charslice>(text: S, start: isize, end: isize) -> String { let stext = text.into(); let from: usize; let length: usize; + let charcount = stext.chars().count(); if start < 0 { // LATER: may remove this check and just default to 0 in the future. assert!( - -start as usize <= stext.len(), + -start as usize <= charcount, "charslice: if 'start' is negative, the magnitude may not exceed the length" ); - // TODO: off by one? - from = (stext.len() as isize + start) as usize; + println!( + ">> charcount as isize + start = {} + {}", + charcount as isize, start + ); + from = (charcount as isize + start) as usize; } else { from = start as usize; } if end < 0 { // LATER: may remove this check and just default to 0 in the future. assert!( - -end as usize <= stext.len(), + -end as usize <= charcount, "charslice: if 'end' is negative, the magnitude may not exceed the length" ); - // TODO: off by one? - let new_end = (stext.len() as isize + end) as usize; + let new_end = (charcount as isize + end) as usize; assert!( new_end >= from, "charslice: 'start' may not be before 'end' (end was positive)" @@ -36,12 +39,13 @@ pub fn charslice>(text: S, start: isize, end: isize) -> String { ); length = end as usize - from; } + println!("from: {}, length: {}", from, length); stext.chars().skip(from).take(length).collect() } pub fn charslicefrom>(text: S, start: isize) -> String { let stext = text.into(); - let len = stext.len() as isize; + let len = stext.chars().count() as isize; charslice(stext, start, len) } @@ -59,6 +63,9 @@ mod tests { assert_eq!("你好", charslice("你好!", 0, 2)); assert_eq!("!", charslicefrom("你好!", 2)); assert_eq!("你好", charsliceto("你好!", 2)); - // TODO: test negative values + // Negative indices should match Python 3 behaviour: + assert_eq!("你好", charslice("你好!", -3, -1)); + assert_eq!("!", charslicefrom("你好!", -1)); + assert_eq!("你好", charsliceto("你好!", -1)); } } From f945ff2eb5eaeca2a71ecd6bd821b982a292c824 Mon Sep 17 00:00:00 2001 From: Mark Date: Sat, 2 Jun 2018 12:54:04 +0200 Subject: [PATCH 19/49] Reserve a few more keywords #so --- src/mango/util/codeparts/keyword.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mango/util/codeparts/keyword.rs b/src/mango/util/codeparts/keyword.rs index 0998b74b..f3f176d4 100644 --- a/src/mango/util/codeparts/keyword.rs +++ b/src/mango/util/codeparts/keyword.rs @@ -94,6 +94,8 @@ impl Keyword { "int" => Ok(Reserved("int".to_owned())), "interface" => Ok(Reserved("interface".to_owned())), "internal" => Ok(Reserved("internal".to_owned())), + "intersect" => Ok(Reserved("intersect".to_owned())), + "intersection" => Ok(Reserved("intersection".to_owned())), "is" => Ok(Reserved("is".to_owned())), "it" => Ok(Reserved("it".to_owned())), "lambda" => Ok(Reserved("lambda".to_owned())), @@ -149,6 +151,8 @@ impl Keyword { "try" => Ok(Reserved("try".to_owned())), "type" => Ok(Reserved("type".to_owned())), "unsafe" => Ok(Reserved("unsafe".to_owned())), + "unite" => Ok(Reserved("unite".to_owned())), + "union" => Ok(Reserved("union".to_owned())), "until" => Ok(Reserved("until".to_owned())), "use" => Ok(Reserved("use".to_owned())), "val" => Ok(Reserved("val".to_owned())), From 73bcf82d1e1364fe2df32a746a2482f1f6a4e679 Mon Sep 17 00:00:00 2001 From: Mark Date: Mon, 4 Jun 2018 21:49:44 +0200 Subject: [PATCH 20/49] More lexing possibilities #52 --- src/mango/lexing/code_lexer.rs | 16 ++++++++++++++-- src/mango/token/tests.rs | 2 +- src/mango/token/tokens/association.rs | 6 +++++- src/mango/util/strslice/slice.rs | 2 +- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 5574f253..a371a7dc 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -4,6 +4,7 @@ use mango::lexing::string_lexer::StringLexer; use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; use mango::token::special::UnlexableToken; +use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; use mango::token::tokens::IdentifierToken; @@ -155,6 +156,19 @@ impl Lexer for CodeLexer { self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); return self.lex(); } + // Association (before operator) + let association_match_res = self + .reader + .borrow_mut() + .matches(&AssociationToken::subpattern()); + if let Match(token) = association_match_res { + if token.chars().last().unwrap() == '=' { + // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); + return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO + } else { + return Token(Tokens::Association(AssociationToken::from_unprefixed())); + } + } // Operator let operator_match_res = self .reader @@ -163,8 +177,6 @@ impl Lexer for CodeLexer { if let Match(token) = operator_match_res { return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); } - // Association - // todo // Grouping symbols if let Match(_) = self.reader.borrow_mut().matches("(") { return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); diff --git a/src/mango/token/tests.rs b/src/mango/token/tests.rs index 732d01a3..6b4ce6b8 100644 --- a/src/mango/token/tests.rs +++ b/src/mango/token/tests.rs @@ -15,7 +15,7 @@ fn test_tokens_eq() { Keyword(KeywordToken::from_str("let").unwrap()), Keyword(KeywordToken::from_str("mut").unwrap()), Identifier(IdentifierToken::from_name(my_var)), - Association(AssociationToken::from_unmutated()), + Association(AssociationToken::from_unprefixed()), Literal(LiteralToken::int(21)), EndStatement(EndStatementToken::new_semicolon()), Identifier(IdentifierToken::from_name(my_var)), diff --git a/src/mango/token/tokens/association.rs b/src/mango/token/tokens/association.rs index a8c44da2..1c857ee2 100644 --- a/src/mango/token/tokens/association.rs +++ b/src/mango/token/tokens/association.rs @@ -11,7 +11,7 @@ pub struct AssociationToken { } impl AssociationToken { - pub fn from_unmutated() -> Self { + pub fn from_unprefixed() -> Self { AssociationToken { symbol: Option::None, } @@ -26,6 +26,10 @@ impl AssociationToken { symbol: Option::Some(symbol), } } + + pub fn subpattern() -> String { + format!("{}=", Symbol::subpattern()) + } } impl ToText for AssociationToken { diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs index 88eea284..3260c401 100644 --- a/src/mango/util/strslice/slice.rs +++ b/src/mango/util/strslice/slice.rs @@ -29,7 +29,7 @@ pub fn charslice>(text: S, start: isize, end: isize) -> String { let new_end = (charcount as isize + end) as usize; assert!( new_end >= from, - "charslice: 'start' may not be before 'end' (end was positive)" + "charslice: 'start' may not be before 'end' (end was negative)" ); length = new_end - from; } else { From 04ab16ab12db6a0869b146595254de8c911221a9 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 5 Jun 2018 19:24:47 +0200 Subject: [PATCH 21/49] Lexer infrastructure for testing #52 --- src/mango/io/fortest/fromstr.rs | 1 + src/mango/io/typ.rs | 2 ++ src/mango/lexing/code_lexer.rs | 34 ++++++++++++++++++++++++++++++-- src/mango/lexing/mod.rs | 2 ++ src/mango/lexing/util/lex_all.rs | 34 ++++++++++++++++++++++++++++++++ src/mango/lexing/util/mod.rs | 1 + src/mango/util/strslice/slice.rs | 4 ---- 7 files changed, 72 insertions(+), 6 deletions(-) create mode 100644 src/mango/lexing/util/lex_all.rs create mode 100644 src/mango/lexing/util/mod.rs diff --git a/src/mango/io/fortest/fromstr.rs b/src/mango/io/fortest/fromstr.rs index 2ca183b9..c4ee12b7 100644 --- a/src/mango/io/fortest/fromstr.rs +++ b/src/mango/io/fortest/fromstr.rs @@ -30,6 +30,7 @@ impl Reader for StringReader { REXCACHE.with(|rl| { let mut rexlib = rl.borrow_mut(); let rex = rexlib.make_or_get(subpattern); + println!("{:?}", rex); }); ReaderResult::NoMatch() // TODO } diff --git a/src/mango/io/typ.rs b/src/mango/io/typ.rs index ebe523a5..2c83c5ca 100644 --- a/src/mango/io/typ.rs +++ b/src/mango/io/typ.rs @@ -12,6 +12,8 @@ pub trait Reader { // fn equals(&mut self, texts: Vec<&str>) -> ReaderResult; /// Checks whether the code from the current position matches a regex pattern. + /// + /// This has to eventually return EOF, after which it should not be called again. fn matches(&mut self, subpattern: &str) -> ReaderResult; } diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index a371a7dc..bbf2bf86 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -38,7 +38,7 @@ pub struct CodeLexer { } impl CodeLexer { - fn new(reader: Rc>) -> Self { + pub fn new(reader: Rc>) -> Self { CodeLexer { reader: reader, reader_or_delegate: ReaderOrDelegate::Reader(), @@ -186,7 +186,13 @@ impl Lexer for CodeLexer { } // TODO: specify the unlexable word - return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned()))); + let unknown_word = self.reader.borrow_mut().matches(" *[^\\s]+"); + if let Match(word) = unknown_word { + return Token(Tokens::Unlexable(UnlexableToken::new(word))); + } else { + // todo: handle better someday + panic!("Do not know how to proceed with parsing"); + } } } } @@ -198,3 +204,27 @@ impl Lexer for CodeLexer { } } } + +#[cfg(test)] +mod tests { + use super::CodeLexer; + use mango::io::fortest::StringReader; + use mango::io::typ::Reader; + use mango::lexing::util::lex_all::{lex_all, LexList}; + use std::cell::RefCell; + use std::rc::Rc; + + #[test] + fn test_lexing() { + assert_eq!( + LexList::from_tokens(vec![]), + lex_all(Rc::new(RefCell::new(StringReader::new( + "let x = 0\nfor x < 128\n\tx += 1\n".to_owned(), + )))) + ) + // assert_eq!(1, cnt, "No item in ProblemCollector"); + } + + #[test] + fn test_lexing_delegation() {} +} diff --git a/src/mango/lexing/mod.rs b/src/mango/lexing/mod.rs index bb88a815..254d9a1c 100644 --- a/src/mango/lexing/mod.rs +++ b/src/mango/lexing/mod.rs @@ -5,3 +5,5 @@ pub mod code_lexer; pub mod comment_lexer; pub mod string_lexer; + +pub mod util; diff --git a/src/mango/lexing/util/lex_all.rs b/src/mango/lexing/util/lex_all.rs new file mode 100644 index 00000000..99481231 --- /dev/null +++ b/src/mango/lexing/util/lex_all.rs @@ -0,0 +1,34 @@ +use mango::io::typ::Reader; +use mango::lexing::code_lexer::CodeLexer; +use mango::lexing::typ::Lexer; +use mango::lexing::typ::MaybeToken; +use mango::token::Token; +use mango::token::Tokens; +use std::cell::RefCell; +use std::rc::Rc; + +/// Represents all the lex tokens in a source. +#[derive(PartialEq, Eq, Debug)] +pub struct LexList { + tokens: Vec, +} + +impl LexList { + pub fn from_tokens(tokens: Vec) -> Self { + LexList { tokens } + } + + pub fn from_reader(reader: Rc>) -> Self { + lex_all(reader) + } +} + +pub fn lex_all(reader: Rc>) -> LexList { + let mut list = Vec::with_capacity(512); + let mut lexer = CodeLexer::new(reader); + while let MaybeToken::Token(token) = lexer.lex() { + list.push(token) + } + list.shrink_to_fit(); + LexList { tokens: list } +} diff --git a/src/mango/lexing/util/mod.rs b/src/mango/lexing/util/mod.rs new file mode 100644 index 00000000..52be7fa1 --- /dev/null +++ b/src/mango/lexing/util/mod.rs @@ -0,0 +1 @@ +pub mod lex_all; diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs index 3260c401..58022f26 100644 --- a/src/mango/util/strslice/slice.rs +++ b/src/mango/util/strslice/slice.rs @@ -12,10 +12,6 @@ pub fn charslice>(text: S, start: isize, end: isize) -> String { -start as usize <= charcount, "charslice: if 'start' is negative, the magnitude may not exceed the length" ); - println!( - ">> charcount as isize + start = {} + {}", - charcount as isize, start - ); from = (charcount as isize + start) as usize; } else { from = start as usize; From 7e32cbea6a696839e3654a1126f64a122be7acd0 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 5 Jun 2018 19:43:52 +0200 Subject: [PATCH 22/49] More lexer infrastructure for testing #52 --- src/mango/io/fortest/fromstr.rs | 6 +++--- src/mango/io/typ.rs | 2 +- src/mango/io/util.rs | 13 +++++++++---- src/mango/lexing/code_lexer.rs | 14 ++++++-------- src/mango/util/codeparts/operator.rs | 2 +- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/mango/io/fortest/fromstr.rs b/src/mango/io/fortest/fromstr.rs index c4ee12b7..8ed0c9db 100644 --- a/src/mango/io/fortest/fromstr.rs +++ b/src/mango/io/fortest/fromstr.rs @@ -29,9 +29,9 @@ impl Reader for StringReader { fn matches(&mut self, subpattern: &str) -> ReaderResult { REXCACHE.with(|rl| { let mut rexlib = rl.borrow_mut(); - let rex = rexlib.make_or_get(subpattern); - println!("{:?}", rex); + let regex = rexlib.make_or_get(subpattern); + println!("{:?}", regex); }); - ReaderResult::NoMatch() // TODO + ReaderResult::EOF() // TODO } } diff --git a/src/mango/io/typ.rs b/src/mango/io/typ.rs index 2c83c5ca..13b4e0ba 100644 --- a/src/mango/io/typ.rs +++ b/src/mango/io/typ.rs @@ -13,7 +13,7 @@ pub trait Reader { /// Checks whether the code from the current position matches a regex pattern. /// - /// This has to eventually return EOF, after which it should not be called again. + /// This has to eventually return EOF, and keep returning EOF forever after that. fn matches(&mut self, subpattern: &str) -> ReaderResult; } diff --git a/src/mango/io/util.rs b/src/mango/io/util.rs index 66e1053d..022ec795 100644 --- a/src/mango/io/util.rs +++ b/src/mango/io/util.rs @@ -15,12 +15,17 @@ impl RegexCache { } } - pub fn make_or_get(&mut self, subpattern: &str) -> Result<&Regex, Error> { + pub fn make_or_get(&mut self, subpattern: &str) -> &Regex { if !self.cache.contains_key(subpattern) { - let regex = Regex::new(&format!("^ *{}", subpattern))?; - self.cache.insert(subpattern.to_owned(), regex); + match Regex::new(&format!("^ *{}", subpattern)) { + Err(err) => panic!(format!("Invalid regular expression while adding to library; this is a bug:\n{:?}", err)), + Ok(regex) => { + self.cache.insert(subpattern.to_owned(), regex); + } + } + } - Result::Ok(self.cache.get(subpattern).unwrap()) + self.cache.get(subpattern).unwrap() } } diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index bbf2bf86..44293df7 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -178,20 +178,18 @@ impl Lexer for CodeLexer { return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); } // Grouping symbols - if let Match(_) = self.reader.borrow_mut().matches("(") { + if let Match(_) = self.reader.borrow_mut().matches(r"\(") { return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); } - if let Match(_) = self.reader.borrow_mut().matches(")") { + if let Match(_) = self.reader.borrow_mut().matches(r"\)") { return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); } - // TODO: specify the unlexable word let unknown_word = self.reader.borrow_mut().matches(" *[^\\s]+"); - if let Match(word) = unknown_word { - return Token(Tokens::Unlexable(UnlexableToken::new(word))); - } else { - // todo: handle better someday - panic!("Do not know how to proceed with parsing"); + match unknown_word { + Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), + NoMatch() => panic!("Do not know how to proceed with parsing"), + EOF() => End, } } } diff --git a/src/mango/util/codeparts/operator.rs b/src/mango/util/codeparts/operator.rs index 71d37b33..1a7e59fd 100644 --- a/src/mango/util/codeparts/operator.rs +++ b/src/mango/util/codeparts/operator.rs @@ -30,7 +30,7 @@ impl Symbol { /// Generate an eager subpattern to match tokens, that can be composed in a regular expression. pub fn subpattern() -> &'static str { - r"(\+|\-|\*|\/)" + r"(\+|\-|\*|/)" } } From 8d6a86f24145a04e5ca4737c19a575a3511f7cd0 Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 6 Jun 2018 07:58:23 +0200 Subject: [PATCH 23/49] Make StringReader works #52 --- dev/playground/src/enumhash.rs | 6 ---- src/mango/io/fortest/fromstr.rs | 37 --------------------- src/mango/io/fortest/mod.rs | 4 +-- src/mango/io/fortest/stringreader.rs | 49 ++++++++++++++++++++++++++++ src/mango/io/util.rs | 7 ++-- src/mango/lexing/code_lexer.rs | 14 ++++---- src/mango/lexing/util/lex_all.rs | 12 ++----- src/mango/towasm/tests.rs | 4 +-- src/mango/util/strslice/slice.rs | 1 - 9 files changed, 66 insertions(+), 68 deletions(-) delete mode 100644 src/mango/io/fortest/fromstr.rs create mode 100644 src/mango/io/fortest/stringreader.rs diff --git a/dev/playground/src/enumhash.rs b/dev/playground/src/enumhash.rs index e6e46811..c64f240d 100644 --- a/dev/playground/src/enumhash.rs +++ b/dev/playground/src/enumhash.rs @@ -63,15 +63,9 @@ fn get_test_hash(x: &MyEnum) -> u64 { } fn main() { -<<<<<<< Updated upstream let a1: MyEnum = MyEnum::A(Alpha { val: "Hello World".to_owned() }); let a2: MyEnum = MyEnum::A(Alpha { val: "Bye World".to_owned() }); let a3: MyEnum = MyEnum::A(Alpha { val: "Bye World".to_owned() }); -======= - let a1: MyEnum = MyEnum::A(Alpha { val: "Hello World".to_string() }); - let a2: MyEnum = MyEnum::A(Alpha { val: "Bye World".to_string() }); - let a3: MyEnum = MyEnum::A(Alpha { val: "Bye World".to_string() }); ->>>>>>> Stashed changes let b: MyEnum = MyEnum::B(Beta { nr: 8, f: 2 }); let mut m = HashMap::new(); println!("{:?} {:?}", a1.to_text(), b.to_text()); diff --git a/src/mango/io/fortest/fromstr.rs b/src/mango/io/fortest/fromstr.rs deleted file mode 100644 index 8ed0c9db..00000000 --- a/src/mango/io/fortest/fromstr.rs +++ /dev/null @@ -1,37 +0,0 @@ -use mango::io::typ::Reader; -use mango::io::typ::ReaderResult; -use mango::io::util::REXCACHE; - -/// Implementation of [Reader] that reads from a pre-provided string. -/// Mostly for testing purposes. -pub struct StringReader { - code: String, - index: usize, -} - -impl StringReader { - pub fn new(code: String) -> Self { - StringReader { code, index: 0 } - } -} - -impl Reader for StringReader { - // fn equals(&mut self, texts: Vec<&str>) -> ReaderResult { - // for text in texts { - // if &self.code[self.index..self.index + text.len()] == text { - // self.index += text.len(); - // return ReaderResult::Match(self.code[self.index..self.index + text.len()]) - // } - // } - // ReaderResult::NoMatch() - // } - - fn matches(&mut self, subpattern: &str) -> ReaderResult { - REXCACHE.with(|rl| { - let mut rexlib = rl.borrow_mut(); - let regex = rexlib.make_or_get(subpattern); - println!("{:?}", regex); - }); - ReaderResult::EOF() // TODO - } -} diff --git a/src/mango/io/fortest/mod.rs b/src/mango/io/fortest/mod.rs index 9aa88ab0..100916ac 100644 --- a/src/mango/io/fortest/mod.rs +++ b/src/mango/io/fortest/mod.rs @@ -1,2 +1,2 @@ -pub mod fromstr; -pub use self::fromstr::*; +pub mod stringreader; +pub use self::stringreader::*; diff --git a/src/mango/io/fortest/stringreader.rs b/src/mango/io/fortest/stringreader.rs new file mode 100644 index 00000000..69984d72 --- /dev/null +++ b/src/mango/io/fortest/stringreader.rs @@ -0,0 +1,49 @@ +use mango::io::typ::Reader; +use mango::io::typ::ReaderResult; +use mango::io::util::REXCACHE; + +/// Implementation of [Reader] that reads from a pre-provided string. +/// Mostly for testing purposes. +pub struct StringReader { + code: String, + index: usize, +} + +impl StringReader { + pub fn new(code: String) -> Self { + StringReader { code, index: 0 } + } +} + +impl Reader for StringReader { + fn matches(&mut self, subpattern: &str) -> ReaderResult { + // Check for subpattern + REXCACHE.with(|rl| { + let mut rexlib = rl.borrow_mut(); + { + // Check for end of file + // TODO: is there a better/faster way for this? maybe try this after a match and set a flag? + let regex = rexlib.make_or_get(r"\s*"); + match regex.find(&self.code[self.index..]) { + Some(mtch) => { + self.index += mtch.as_str().len(); + return ReaderResult::EOF(); + } + None => (), + } + } + { + // Check for subpattern + let regex = rexlib.make_or_get(subpattern); + return match regex.find(&self.code[self.index..]) { + Some(mtch) => { + self.index += mtch.as_str().len(); + println!(">>> {}", mtch.as_str()); + ReaderResult::Match(mtch.as_str().to_owned()) + } + None => ReaderResult::NoMatch(), + }; + } + }) + } +} diff --git a/src/mango/io/util.rs b/src/mango/io/util.rs index 022ec795..9d8710ef 100644 --- a/src/mango/io/util.rs +++ b/src/mango/io/util.rs @@ -1,4 +1,3 @@ -use regex::Error; use regex::Regex; use std::cell::RefCell; use std::collections::HashMap; @@ -18,12 +17,14 @@ impl RegexCache { pub fn make_or_get(&mut self, subpattern: &str) -> &Regex { if !self.cache.contains_key(subpattern) { match Regex::new(&format!("^ *{}", subpattern)) { - Err(err) => panic!(format!("Invalid regular expression while adding to library; this is a bug:\n{:?}", err)), + Err(err) => panic!(format!( + "Invalid regular expression while adding to library; this is a bug:\n{:?}", + err + )), Ok(regex) => { self.cache.insert(subpattern.to_owned(), regex); } } - } self.cache.get(subpattern).unwrap() } diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 44293df7..112ff22e 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -84,7 +84,7 @@ impl Lexer for CodeLexer { match delegated_token { End => { // Swap back from delegation to direct mode. - let reader = delegate.get_reader().clone(); + // let reader = delegate.get_reader().clone(); self.reader_or_delegate = ReaderOrDelegate::Reader(); self.lex() } @@ -207,19 +207,17 @@ impl Lexer for CodeLexer { mod tests { use super::CodeLexer; use mango::io::fortest::StringReader; - use mango::io::typ::Reader; use mango::lexing::util::lex_all::{lex_all, LexList}; use std::cell::RefCell; use std::rc::Rc; #[test] fn test_lexing() { - assert_eq!( - LexList::from_tokens(vec![]), - lex_all(Rc::new(RefCell::new(StringReader::new( - "let x = 0\nfor x < 128\n\tx += 1\n".to_owned(), - )))) - ) + let lexed = lex_all(&mut CodeLexer::new(Rc::new(RefCell::new( + StringReader::new("let x = 0\nfor x < 128\n\tx += 1\n".to_owned()), + )))); + println!("LEXED: {:?}", lexed); + assert_eq!(LexList::from_tokens(vec![]), lexed) // assert_eq!(1, cnt, "No item in ProblemCollector"); } diff --git a/src/mango/lexing/util/lex_all.rs b/src/mango/lexing/util/lex_all.rs index 99481231..82ee0c1d 100644 --- a/src/mango/lexing/util/lex_all.rs +++ b/src/mango/lexing/util/lex_all.rs @@ -1,11 +1,6 @@ -use mango::io::typ::Reader; -use mango::lexing::code_lexer::CodeLexer; use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; -use mango::token::Token; use mango::token::Tokens; -use std::cell::RefCell; -use std::rc::Rc; /// Represents all the lex tokens in a source. #[derive(PartialEq, Eq, Debug)] @@ -18,14 +13,13 @@ impl LexList { LexList { tokens } } - pub fn from_reader(reader: Rc>) -> Self { - lex_all(reader) + pub fn from_reader(lexer: &mut Lexer) -> Self { + lex_all(lexer) } } -pub fn lex_all(reader: Rc>) -> LexList { +pub fn lex_all(lexer: &mut Lexer) -> LexList { let mut list = Vec::with_capacity(512); - let mut lexer = CodeLexer::new(reader); while let MaybeToken::Token(token) = lexer.lex() { list.push(token) } diff --git a/src/mango/towasm/tests.rs b/src/mango/towasm/tests.rs index 18f08dc2..0bf72f76 100644 --- a/src/mango/towasm/tests.rs +++ b/src/mango/towasm/tests.rs @@ -1,6 +1,5 @@ use mango::towasm::arithmetic::Add; use mango::towasm::collect::datatype::Value; -use mango::towasm::collect::typ::Wasm; use mango::towasm::collect::Type; use mango::towasm::control::BranchIf; use mango::towasm::control::Label; @@ -18,6 +17,7 @@ use mango::towasm::values::Const; use mango::towasm::values::DeclareLocal; #[test] +#[allow(unused_variables)] fn test_example_1() { let param_n = Parameter::new(Name::new("n".to_owned()).unwrap(), Type::Int32); let var_n = param_n.local(); @@ -62,5 +62,5 @@ fn test_example_1() { }, )]); - println!("WAT:\n{}\n", module.as_wat()); + // println!("WAT:\n{}\n", module.as_wat()); } diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs index 58022f26..acf32ef1 100644 --- a/src/mango/util/strslice/slice.rs +++ b/src/mango/util/strslice/slice.rs @@ -35,7 +35,6 @@ pub fn charslice>(text: S, start: isize, end: isize) -> String { ); length = end as usize - from; } - println!("from: {}, length: {}", from, length); stext.chars().skip(from).take(length).collect() } From 65a195aae48385e2ab7f11fbf963864861c12ff0 Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 6 Jun 2018 22:58:50 +0200 Subject: [PATCH 24/49] More progress on lexing tests, stuck on tab #52 --- src/mango/io/fortest/stringreader.rs | 44 ++++++++++++++++------------ src/mango/io/typ.rs | 4 ++- src/mango/lexing/code_lexer.rs | 17 ++++++----- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/src/mango/io/fortest/stringreader.rs b/src/mango/io/fortest/stringreader.rs index 69984d72..fd5a1d4c 100644 --- a/src/mango/io/fortest/stringreader.rs +++ b/src/mango/io/fortest/stringreader.rs @@ -4,6 +4,7 @@ use mango::io::util::REXCACHE; /// Implementation of [Reader] that reads from a pre-provided string. /// Mostly for testing purposes. +#[derive(Debug)] pub struct StringReader { code: String, index: usize, @@ -20,30 +21,37 @@ impl Reader for StringReader { // Check for subpattern REXCACHE.with(|rl| { let mut rexlib = rl.borrow_mut(); - { - // Check for end of file - // TODO: is there a better/faster way for this? maybe try this after a match and set a flag? - let regex = rexlib.make_or_get(r"\s*"); - match regex.find(&self.code[self.index..]) { - Some(mtch) => { + // Check for end of file + // TODO: is there a better/faster way for this? maybe try this after a match and set a flag? + let regex = rexlib.make_or_get(r"\s*$"); + match regex.find(&self.code[self.index..]) { + Some(mtch) => { + if self.index + mtch.as_str().len() == self.code.len() { self.index += mtch.as_str().len(); return ReaderResult::EOF(); } - None => (), } + None => (), } - { - // Check for subpattern - let regex = rexlib.make_or_get(subpattern); - return match regex.find(&self.code[self.index..]) { - Some(mtch) => { - self.index += mtch.as_str().len(); - println!(">>> {}", mtch.as_str()); - ReaderResult::Match(mtch.as_str().to_owned()) + // Check for subpattern + let regex = rexlib.make_or_get(subpattern); + return match regex.find(&self.code[self.index..]) { + Some(mtch) => { + self.index += mtch.as_str().len(); + // Remove leading spaces + let mut k = 0; + for (i, byt) in mtch.as_str().chars().enumerate() { + if byt != ' ' { + break; + } + k = i + 1; } - None => ReaderResult::NoMatch(), - }; - } + ReaderResult::Match((&mtch.as_str()[k..]).to_owned()) + } + None => ReaderResult::NoMatch(), + }; }) } } + +// TODO: tests (spaces, end) diff --git a/src/mango/io/typ.rs b/src/mango/io/typ.rs index 13b4e0ba..5f8fd9f0 100644 --- a/src/mango/io/typ.rs +++ b/src/mango/io/typ.rs @@ -1,5 +1,7 @@ // TODO: I should perhaps separate the splitting that happens here from the actual reading +use std::fmt::Debug; + pub enum ReaderResult { Match(String), NoMatch(), @@ -7,7 +9,7 @@ pub enum ReaderResult { } /// A reader represents a source 'file', which may be a file, webpage, string, ... -pub trait Reader { +pub trait Reader: Debug { /// Checks whether the `text` is found starting from the current position. // fn equals(&mut self, texts: Vec<&str>) -> ReaderResult; diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 112ff22e..0c5b6398 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -185,10 +185,13 @@ impl Lexer for CodeLexer { return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); } - let unknown_word = self.reader.borrow_mut().matches(" *[^\\s]+"); + let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); match unknown_word { Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), - NoMatch() => panic!("Do not know how to proceed with parsing"), + NoMatch() => { + println!("END {:?}", self.reader.borrow()); + panic!("Do not know how to proceed with parsing") + } EOF() => End, } } @@ -213,11 +216,11 @@ mod tests { #[test] fn test_lexing() { - let lexed = lex_all(&mut CodeLexer::new(Rc::new(RefCell::new( - StringReader::new("let x = 0\nfor x < 128\n\tx += 1\n".to_owned()), - )))); - println!("LEXED: {:?}", lexed); - assert_eq!(LexList::from_tokens(vec![]), lexed) + // let lexed = lex_all(&mut CodeLexer::new(Rc::new(RefCell::new( + // StringReader::new("let x = 0\nfor x < 128\n\tx += 1\n".to_owned()), + // )))); + // println!("LEXED: {:?}", lexed); + // assert_eq!(LexList::from_tokens(vec![]), lexed) // assert_eq!(1, cnt, "No item in ProblemCollector"); } From cb6723570668be9f09cb217f31c672607835a1d4 Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 7 Jun 2018 07:38:07 +0200 Subject: [PATCH 25/49] Example unit test, not all functionality implemented #52 --- src/mango/lexing/code_lexer.rs | 87 ++++++++++++++++++++-------- src/mango/util/codeparts/operator.rs | 5 ++ 2 files changed, 68 insertions(+), 24 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 0c5b6398..b4fa5be4 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -4,6 +4,7 @@ use mango::lexing::string_lexer::StringLexer; use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; use mango::token::special::UnlexableToken; +use mango::token::Tokens; use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; @@ -13,7 +14,6 @@ use mango::token::tokens::OperatorToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; -use mango::token::Tokens; use mango::util::collection::Queue; use std::cell::RefCell; use std::rc::Rc; @@ -103,20 +103,29 @@ impl Lexer for CodeLexer { let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); if let Match(_) = continue_match_res { // Line continuation has no token, it just continues on the next line. - if let Match(_) = self.reader.borrow_mut().matches("\\n\\r?") { + let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = newline_match_res { // There should always be a newline after continuations, so that they can be ignored together. - } else if let Match(word) = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?") { - return Token(Tokens::Unlexable(UnlexableToken::new(word))); } else { - // TODO: I don't know yet how to deal with '...' followed by end-of-file - panic!() + let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); + if let Match(word) = newline_match_res { + self.buffer.push(Tokens::Unlexable(UnlexableToken::new(word))); + // This is a new line, so there may be indents. + self.lex_indents(); + return self.lex(); + } else { + // TODO: I don't know yet how to deal with '...' followed by end-of-file + panic!() + } } - // This is a new line, so there may be indents. - return self.lex_indents(); } - if let Match(_) = self.reader.borrow_mut().matches("\\n\\r?") { + let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = newline_match_res { // Newline WITHOUT line continuation. - return Token(Tokens::EndStatement(EndStatementToken::new_end_line())); + // This is a new line, so there may be indents. + self.buffer.push(Tokens::EndStatement(EndStatementToken::new_end_line())); + self.lex_indents(); + return self.lex(); } let end_statement_match_res = self.reader.borrow_mut().matches(";"); if let Match(_) = end_statement_match_res { @@ -141,13 +150,13 @@ impl Lexer for CodeLexer { .reader .borrow_mut() .matches(IdentifierToken::subpattern()) - { - // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... - if let Ok(keyword) = KeywordToken::from_str(word.clone()) { - return Token(Tokens::Keyword(keyword)); + { + // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + return Token(Tokens::Keyword(keyword)); + } + return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); } - return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); - } // Literal let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); if let Match(_) = string_match_res { @@ -189,7 +198,7 @@ impl Lexer for CodeLexer { match unknown_word { Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), NoMatch() => { - println!("END {:?}", self.reader.borrow()); + println!("END {:?}", self.reader.borrow()); // TODO panic!("Do not know how to proceed with parsing") } EOF() => End, @@ -208,20 +217,50 @@ impl Lexer for CodeLexer { #[cfg(test)] mod tests { - use super::CodeLexer; use mango::io::fortest::StringReader; use mango::lexing::util::lex_all::{lex_all, LexList}; + use mango::token::Tokens; use std::cell::RefCell; use std::rc::Rc; + use super::CodeLexer; + + use mango::token::tokens::AssociationToken; + use mango::token::tokens::EndBlockToken; + use mango::token::tokens::EndStatementToken; + use mango::token::tokens::IdentifierToken; + use mango::token::tokens::KeywordToken; + use mango::token::tokens::OperatorToken; + use mango::token::tokens::ParenthesisCloseToken; + use mango::token::tokens::ParenthesisOpenToken; + use mango::token::tokens::StartBlockToken; + use mango::token::tokens::LiteralToken; #[test] fn test_lexing() { - // let lexed = lex_all(&mut CodeLexer::new(Rc::new(RefCell::new( - // StringReader::new("let x = 0\nfor x < 128\n\tx += 1\n".to_owned()), - // )))); - // println!("LEXED: {:?}", lexed); - // assert_eq!(LexList::from_tokens(vec![]), lexed) - // assert_eq!(1, cnt, "No item in ProblemCollector"); + + // TODO: do indenting as a decorator? I do already have the indent on CodeLexer, and if I do a decorator I need a new type... + + assert_eq!( + LexList::from_tokens(vec![ + Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_unprefixed()), + Tokens::Literal(LiteralToken::Int(0)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::Keyword(KeywordToken::from_str("for".to_owned()).unwrap()), + Tokens::Operator(OperatorToken::from_str("<").unwrap()), + Tokens::Literal(LiteralToken::Int(128)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::StartBlock(StartBlockToken::new()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_str("+".to_owned()).unwrap()), + Tokens::Literal(LiteralToken::Int(1)), + Tokens::EndBlock(EndBlockToken::new(true, false)), + ]), + lex_all(&mut CodeLexer::new(Rc::new(RefCell::new( + StringReader::new("let x = 0\nfor x < 128\n\tx += 1\n".to_owned()), + )))) + ) } #[test] diff --git a/src/mango/util/codeparts/operator.rs b/src/mango/util/codeparts/operator.rs index 1a7e59fd..22d39375 100644 --- a/src/mango/util/codeparts/operator.rs +++ b/src/mango/util/codeparts/operator.rs @@ -21,6 +21,11 @@ impl Symbol { "-" => Ok(Symbol::Dash), "*" => Ok(Symbol::Asterisk), "/" => Ok(Symbol::Slash), + "<" => Ok(Symbol::Slash), + ">" => Ok(Symbol::Slash), + "==" => Ok(Symbol::Slash), + ">=" => Ok(Symbol::Slash), + "<=" => Ok(Symbol::Slash), _ => Err(Msg::from_valid(&format!( "Unknown symbol: '{}'", ssymbol_txt From 19d69c98a29bc92c51c1820a525196adcf926f02 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 8 Jun 2018 22:48:45 +0200 Subject: [PATCH 26/49] Try to make generators work (but they dont yet) #52 --- src/lib.rs | 1 + src/mango/lexing/code_lexer.rs | 85 +++++++++++++++++++++--------- src/mango/lexing/gen_code_lexer.rs | 34 ++++++++++++ src/mango/lexing/mod.rs | 1 + src/mango/token/collect/all.rs | 11 ++++ 5 files changed, 106 insertions(+), 26 deletions(-) create mode 100644 src/mango/lexing/gen_code_lexer.rs diff --git a/src/lib.rs b/src/lib.rs index a7b08154..eb74ae95 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ #![feature(nll)] +#![feature(generators, generator_trait)] extern crate core; #[macro_use] diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index b4fa5be4..ddc06f3f 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -4,7 +4,6 @@ use mango::lexing::string_lexer::StringLexer; use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; use mango::token::special::UnlexableToken; -use mango::token::Tokens; use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; @@ -14,6 +13,7 @@ use mango::token::tokens::OperatorToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; +use mango::token::Tokens; use mango::util::collection::Queue; use std::cell::RefCell; use std::rc::Rc; @@ -74,6 +74,8 @@ impl CodeLexer { } impl Lexer for CodeLexer { + // TODO: TURN THIS AROUND: MAKE A FUNCTION THAT RETURNS FROM A QUEUE, AND CALLS ANOTHER TO FILL THE QUEUE IF NO RETURN + fn lex(&mut self) -> MaybeToken { use self::MaybeToken::*; @@ -109,7 +111,8 @@ impl Lexer for CodeLexer { } else { let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); if let Match(word) = newline_match_res { - self.buffer.push(Tokens::Unlexable(UnlexableToken::new(word))); + self.buffer + .push(Tokens::Unlexable(UnlexableToken::new(word))); // This is a new line, so there may be indents. self.lex_indents(); return self.lex(); @@ -123,7 +126,8 @@ impl Lexer for CodeLexer { if let Match(_) = newline_match_res { // Newline WITHOUT line continuation. // This is a new line, so there may be indents. - self.buffer.push(Tokens::EndStatement(EndStatementToken::new_end_line())); + self.buffer + .push(Tokens::EndStatement(EndStatementToken::new_end_line())); self.lex_indents(); return self.lex(); } @@ -150,13 +154,13 @@ impl Lexer for CodeLexer { .reader .borrow_mut() .matches(IdentifierToken::subpattern()) - { - // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... - if let Ok(keyword) = KeywordToken::from_str(word.clone()) { - return Token(Tokens::Keyword(keyword)); - } - return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); + { + // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + return Token(Tokens::Keyword(keyword)); } + return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); + } // Literal let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); if let Match(_) = string_match_res { @@ -201,7 +205,10 @@ impl Lexer for CodeLexer { println!("END {:?}", self.reader.borrow()); // TODO panic!("Do not know how to proceed with parsing") } - EOF() => End, + EOF() => { + // TODO: also dedent and end statement here + End + } } } } @@ -217,31 +224,49 @@ impl Lexer for CodeLexer { #[cfg(test)] mod tests { + use super::CodeLexer; use mango::io::fortest::StringReader; use mango::lexing::util::lex_all::{lex_all, LexList}; - use mango::token::Tokens; - use std::cell::RefCell; - use std::rc::Rc; - use super::CodeLexer; - use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; use mango::token::tokens::IdentifierToken; use mango::token::tokens::KeywordToken; + use mango::token::tokens::LiteralToken; use mango::token::tokens::OperatorToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; - use mango::token::tokens::LiteralToken; + use mango::token::Tokens; + use std::cell::RefCell; + use std::ops::Generator; + use std::rc::Rc; - #[test] - fn test_lexing() { + fn assert_text_to_tokens(text: &str, tokens: Vec) { + assert_eq!( + LexList::from_tokens(tokens), + lex_all(&mut CodeLexer::new(Rc::new(RefCell::new( + StringReader::new(text.to_owned()) + )))) + ) + } - // TODO: do indenting as a decorator? I do already have the indent on CodeLexer, and if I do a decorator I need a new type... + #[test] + fn test_lexing_individual() { + assert_text_to_tokens( + "if", + vec![Tokens::Keyword( + KeywordToken::from_str("if".to_owned()).unwrap(), + )], + ); + // todo: more + } - assert_eq!( - LexList::from_tokens(vec![ + #[test] + fn test_lexing_combined() { + assert_text_to_tokens( + "let x = 0\nfor x < 128\n\tx += 1", + vec![ Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()), Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), Tokens::Association(AssociationToken::from_unprefixed()), @@ -256,13 +281,21 @@ mod tests { Tokens::Association(AssociationToken::from_str("+".to_owned()).unwrap()), Tokens::Literal(LiteralToken::Int(1)), Tokens::EndBlock(EndBlockToken::new(true, false)), - ]), - lex_all(&mut CodeLexer::new(Rc::new(RefCell::new( - StringReader::new("let x = 0\nfor x < 128\n\tx += 1\n".to_owned()), - )))) - ) + ], + ); } #[test] fn test_lexing_delegation() {} + + #[test] + fn generators() { + let mut gen = || { + yield Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()); + yield Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()); + yield Tokens::Association(AssociationToken::from_unprefixed()); + return; + }; + let first = unsafe { gen.resume() }; + } } diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs new file mode 100644 index 00000000..6c968d6a --- /dev/null +++ b/src/mango/lexing/gen_code_lexer.rs @@ -0,0 +1,34 @@ +use mango::io::typ::Reader; +use mango::token::tokens::LiteralToken; +use mango::token::Tokens; +use std::cell::RefCell; +use std::ops::Generator; +use std::rc::Rc; + +/// This generator does the real lexing work, but is wrapped in a normal +/// class to satisfy an interface that doesn't expose nightly or unsafe features. +//struct GenCodeLexer> { +// generator: G +//} +// +//impl> GenCodeLexer { +// pub fn new() -> Self { +// let mut reader: Rc>; +// GenCodeLexer{ generator: 0 } +// } +//} + +struct Container> { + generator: G, +} + +impl> Container { + pub fn new() -> Self { + let mut reader: Rc>; + Container { + generator: || { + yield 0; + }, + } + } +} diff --git a/src/mango/lexing/mod.rs b/src/mango/lexing/mod.rs index 254d9a1c..9fe3a49a 100644 --- a/src/mango/lexing/mod.rs +++ b/src/mango/lexing/mod.rs @@ -1,6 +1,7 @@ pub mod typ; pub mod code_lexer; +mod gen_code_lexer; pub mod comment_lexer; diff --git a/src/mango/token/collect/all.rs b/src/mango/token/collect/all.rs index 17571b09..0dd44600 100644 --- a/src/mango/token/collect/all.rs +++ b/src/mango/token/collect/all.rs @@ -47,3 +47,14 @@ impl ToText for Tokens { } } } + +#[cfg(test)] +mod tests { + use mango::token::Tokens; + use std::mem::size_of; + + #[test] + fn test_tokens_size() { + assert!(size_of::() < 32); + } +} From 5683e29751150f1adc380184fbd95ed8ae192b07 Mon Sep 17 00:00:00 2001 From: Mark Date: Sat, 9 Jun 2018 13:00:05 +0200 Subject: [PATCH 27/49] Fixed the MWE for generator #52 --- src/mango/lexing/gen_code_lexer.rs | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs index 6c968d6a..2804a522 100644 --- a/src/mango/lexing/gen_code_lexer.rs +++ b/src/mango/lexing/gen_code_lexer.rs @@ -2,7 +2,7 @@ use mango::io::typ::Reader; use mango::token::tokens::LiteralToken; use mango::token::Tokens; use std::cell::RefCell; -use std::ops::Generator; +use std::ops::{Generator, GeneratorState}; use std::rc::Rc; /// This generator does the real lexing work, but is wrapped in a normal @@ -22,13 +22,23 @@ struct Container> { generator: G, } -impl> Container { - pub fn new() -> Self { - let mut reader: Rc>; - Container { - generator: || { - yield 0; - }, +impl Container>> { + pub fn new() -> Box { + let q = 42; + Box::new(Container { + generator: Box::new(move || { + yield 1i32 * q; + yield 2i32 * q; + yield 3i32 * q; + }), + }) + } + + pub fn next(&mut self) -> Option { + // Hide the unsafe part. + match unsafe { self.generator.resume() } { + GeneratorState::Yielded(nr) => Option::Some(nr), + GeneratorState::Complete(_) => Option::None, } } } From 3c854720aa58240c72618213c3b18e6db40f9e0e Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 12 Jun 2018 07:04:03 +0200 Subject: [PATCH 28/49] -m --- src/mango/lexing/gen_code_lexer.rs | 154 ++++++++++++++++++++++++++--- 1 file changed, 140 insertions(+), 14 deletions(-) diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs index 2804a522..ebf4b8e5 100644 --- a/src/mango/lexing/gen_code_lexer.rs +++ b/src/mango/lexing/gen_code_lexer.rs @@ -1,6 +1,21 @@ use mango::io::typ::Reader; -use mango::token::tokens::LiteralToken; +use mango::io::typ::ReaderResult::*; +use mango::lexing::string_lexer::StringLexer; +use mango::lexing::typ::Lexer; +use mango::lexing::typ::MaybeToken; +use mango::token::special::UnlexableToken; use mango::token::Tokens; +use mango::token::tokens::AssociationToken; +use mango::token::tokens::EndBlockToken; +use mango::token::tokens::EndStatementToken; +use mango::token::tokens::IdentifierToken; +use mango::token::tokens::KeywordToken; +use mango::token::tokens::LiteralToken; +use mango::token::tokens::OperatorToken; +use mango::token::tokens::ParenthesisCloseToken; +use mango::token::tokens::ParenthesisOpenToken; +use mango::token::tokens::StartBlockToken; +use mango::util::collection::Queue; use std::cell::RefCell; use std::ops::{Generator, GeneratorState}; use std::rc::Rc; @@ -18,27 +33,138 @@ use std::rc::Rc; // } //} -struct Container> { +struct Container> { generator: G, } -impl Container>> { - pub fn new() -> Box { +impl Container>> { + pub fn new(reader: Box) -> Box { let q = 42; Box::new(Container { generator: Box::new(move || { - yield 1i32 * q; - yield 2i32 * q; - yield 3i32 * q; + + // If there is a buffer due to indentation or continuations, return from that. + if let Some(token) = self.buffer.pop() { + yield token; + } + // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. + let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); + if let Match(_) = continue_match_res { + // Line continuation has no token, it just continues on the next line. + let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = newline_match_res { + // There should always be a newline after continuations, so that they can be ignored together. + } else { + let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); + if let Match(word) = newline_match_res { + self.buffer + .push(Tokens::Unlexable(UnlexableToken::new(word))); + // This is a new line, so there may be indents. + self.lex_indents(); + yield self.lex(); + } else { + // TODO: I don't know yet how to deal with '...' followed by end-of-file + panic!() + } + } + } + let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = newline_match_res { + // Newline WITHOUT line continuation. + // This is a new line, so there may be indents. + self.buffer + .push(Tokens::EndStatement(EndStatementToken::new_end_line())); + self.lex_indents(); + yield self.lex(); + } + let end_statement_match_res = self.reader.borrow_mut().matches(";"); + if let Match(_) = end_statement_match_res { + // Semicolon, which ends a statement. + // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. + self.buffer + .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); + let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = end_line_match_res { + // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). + // This will return the queue of tokens, including the semicolon. + yield self.lex_indents(); + } + // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). + yield self.buffer.pop().unwrap(); + } + // + // Indentation done; do the rest of lexing. + // + // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. + if let Match(word) = self + .reader + .borrow_mut() + .matches(IdentifierToken::subpattern()) + { + // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + yield Tokens::Keyword(keyword); + } + yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap()); + } + // Literal + let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); + if let Match(_) = string_match_res { + let sublexer: Box = + Box::new(StringLexer::new_double_quoted(self.reader.clone())); + self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); + yield self.lex(); + } + // Association (before operator) + let association_match_res = self + .reader + .borrow_mut() + .matches(&AssociationToken::subpattern()); + if let Match(token) = association_match_res { + if token.chars().last().unwrap() == '=' { + // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); + yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO + } else { + yield Tokens::Association(AssociationToken::from_unprefixed()); + } + } + // Operator + let operator_match_res = self + .reader + .borrow_mut() + .matches(OperatorToken::subpattern()); + if let Match(token) = operator_match_res { + yield Tokens::Operator(OperatorToken::from_str(&token).unwrap()); + } + // Grouping symbols + if let Match(_) = self.reader.borrow_mut().matches(r"\(") { + yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new()); + } + if let Match(_) = self.reader.borrow_mut().matches(r"\)") { + yield Tokens::ParenthesisClose(ParenthesisCloseToken::new()); + } + + let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); + match unknown_word { + Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)), + NoMatch() => { + panic!("Do not know how to proceed with parsing") + } + EOF() => { + // TODO: also dedent and end statement here + return + } + } + }), }) } - pub fn next(&mut self) -> Option { - // Hide the unsafe part. - match unsafe { self.generator.resume() } { - GeneratorState::Yielded(nr) => Option::Some(nr), - GeneratorState::Complete(_) => Option::None, - } - } +// pub fn next(&mut self) -> Option { +// // Hide the unsafe part. +// match unsafe { self.generator.resume() } { +// GeneratorState::Yielded(nr) => Option::Some(nr), +// GeneratorState::Complete(_) => Option::None, +// } +// } } From e5ce31cfb99b6aa78f0e160563edd02a396d998b Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 12 Jun 2018 22:00:10 +0200 Subject: [PATCH 29/49] Restructured lexing using generator, now just borrow/type problem #52 --- src/mango/lexing/gen_code_lexer.rs | 251 +++++++++++++++++------------ src/mango/util/strslice/slice.rs | 7 +- 2 files changed, 155 insertions(+), 103 deletions(-) diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs index ebf4b8e5..a045b377 100644 --- a/src/mango/lexing/gen_code_lexer.rs +++ b/src/mango/lexing/gen_code_lexer.rs @@ -19,6 +19,9 @@ use mango::util::collection::Queue; use std::cell::RefCell; use std::ops::{Generator, GeneratorState}; use std::rc::Rc; +use std::borrow::BorrowMut; +use mango::util::strslice::charsliceto; +use mango::util::strslice::slice::glyphat; /// This generator does the real lexing work, but is wrapped in a normal /// class to satisfy an interface that doesn't expose nightly or unsafe features. @@ -34,126 +37,170 @@ use std::rc::Rc; //} struct Container> { + delegate: Option>, + reader: Rc>, generator: G, } impl Container>> { - pub fn new(reader: Box) -> Box { + + fn lex_indents(&mut self) -> Vec { + let mut line_indent = 0; + while let Match(_) = self.reader.borrow_mut().matches("\\t") { + line_indent += 1; + } + for _ in line_indent..self.indent { + // This line is dedented, make end tokens. + // TODO: turn this "new" into a constant + if let Match(_) = self.reader.borrow_mut().matches("end") { + // If this is followed by an 'end' keyword, then that 'end' is redundant. + self.buffer + .push(Tokens::EndBlock(EndBlockToken::new(true, true))); + } else { + self.buffer + .push(Tokens::EndBlock(EndBlockToken::new(true, false))); + } + } + for _ in self.indent..line_indent { + // This line is indented, make start tokens. + self.buffer.push(Tokens::StartBlock(StartBlockToken::new())); + } + self.indent = line_indent; + self.lex() + } + + pub fn new(&mut self, reader: Rc>) -> Box { let q = 42; Box::new(Container { + reader: reader, + delegate: Option::None, generator: Box::new(move || { - // If there is a buffer due to indentation or continuations, return from that. - if let Some(token) = self.buffer.pop() { - yield token; - } - // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. - let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); - if let Match(_) = continue_match_res { - // Line continuation has no token, it just continues on the next line. + loop { + + // Delegate to another lexer if one is set. + if let Option::Some(delegate) = self.delegate { + match delegate.lex() { + MaybeToken::Token(token) => { + yield token; + continue; + } + MaybeToken::End => { + self.delegate = Option::None; + } + } + } + + // TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes) + let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); + if let Match(_) = continue_match_res { + // Line continuation has no token, it just continues on the next line. + let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = newline_match_res { + // There should always be a newline after continuations, so that they can be ignored together. + } else { + // All the text between ... and the end of the line is unlexable. + let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); + if let Match(word) = newline_match_res { + yield Tokens::Unlexable(UnlexableToken::new(word)); + // This is a new line, so there may be indents. + // TODO: is there any yield-from like Python? + for res in self.lex_indents() { + yield res; + } + } else { + // TODO: I don't know yet how to deal with '...' followed by end-of-file + panic!() + } + } + // TODO: are continues necessary? it seems more state-independent to restart for each token + continue; + } let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); if let Match(_) = newline_match_res { - // There should always be a newline after continuations, so that they can be ignored together. - } else { - let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); - if let Match(word) = newline_match_res { - self.buffer - .push(Tokens::Unlexable(UnlexableToken::new(word))); - // This is a new line, so there may be indents. - self.lex_indents(); - yield self.lex(); + // Newline WITHOUT line continuation. + // This is a new line, so there may be indents. + yield Tokens::EndStatement(EndStatementToken::new_end_line()); + for res in self.lex_indents() { + yield res; + } + continue; + } + let end_statement_match_res = self.reader.borrow_mut().matches(";"); + if let Match(_) = end_statement_match_res { + // Semicolon, which ends a statement. + // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. + yield Tokens::EndStatement(EndStatementToken::new_semicolon()); + let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = end_line_match_res { + // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). + // This will return the queue of tokens, including the semicolon. + for res in self.lex_indents() { + yield res; + } + } + // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). + continue; + } + + // + // Indentation done; do the rest of lexing. + // + // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. + let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern()); + if let Match(word) = word_match_res { + // Check if it is a keyword. + // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + yield Tokens::Keyword(keyword); + } + yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap()); + continue; + } + // String literal (delegated). + let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); + if let Match(_) = string_match_res { + let sublexer: Box = Box::new(StringLexer::new_double_quoted(self.reader.clone())); + self.delegate = Option::Some(sublexer); + continue; + } + // Association (before operator). + let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern()); + if let Match(token) = association_match_res { + if glyphat(token, -1) == "=" { + yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO } else { - // TODO: I don't know yet how to deal with '...' followed by end-of-file - panic!() + yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap()); } + continue; } - } - let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = newline_match_res { - // Newline WITHOUT line continuation. - // This is a new line, so there may be indents. - self.buffer - .push(Tokens::EndStatement(EndStatementToken::new_end_line())); - self.lex_indents(); - yield self.lex(); - } - let end_statement_match_res = self.reader.borrow_mut().matches(";"); - if let Match(_) = end_statement_match_res { - // Semicolon, which ends a statement. - // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. - self.buffer - .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); - let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = end_line_match_res { - // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). - // This will return the queue of tokens, including the semicolon. - yield self.lex_indents(); + // Operator. + let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern()); + if let Match(token) = operator_match_res { + yield Tokens::Operator(OperatorToken::from_str(&token).unwrap()); + continue; } - // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). - yield self.buffer.pop().unwrap(); - } - // - // Indentation done; do the rest of lexing. - // - // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. - if let Match(word) = self - .reader - .borrow_mut() - .matches(IdentifierToken::subpattern()) - { - // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... - if let Ok(keyword) = KeywordToken::from_str(word.clone()) { - yield Tokens::Keyword(keyword); + // Grouping symbols + if let Match(_) = self.reader.borrow_mut().matches(r"\(") { + yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new()); + continue; } - yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap()); - } - // Literal - let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); - if let Match(_) = string_match_res { - let sublexer: Box = - Box::new(StringLexer::new_double_quoted(self.reader.clone())); - self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); - yield self.lex(); - } - // Association (before operator) - let association_match_res = self - .reader - .borrow_mut() - .matches(&AssociationToken::subpattern()); - if let Match(token) = association_match_res { - if token.chars().last().unwrap() == '=' { - // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); - yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO - } else { - yield Tokens::Association(AssociationToken::from_unprefixed()); + if let Match(_) = self.reader.borrow_mut().matches(r"\)") { + yield Tokens::ParenthesisClose(ParenthesisCloseToken::new()); + continue; } - } - // Operator - let operator_match_res = self - .reader - .borrow_mut() - .matches(OperatorToken::subpattern()); - if let Match(token) = operator_match_res { - yield Tokens::Operator(OperatorToken::from_str(&token).unwrap()); - } - // Grouping symbols - if let Match(_) = self.reader.borrow_mut().matches(r"\(") { - yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new()); - } - if let Match(_) = self.reader.borrow_mut().matches(r"\)") { - yield Tokens::ParenthesisClose(ParenthesisCloseToken::new()); - } - let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); - match unknown_word { - Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)), - NoMatch() => { - panic!("Do not know how to proceed with parsing") - } - EOF() => { - // TODO: also dedent and end statement here - return + + let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); + match unknown_word { + Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)), + NoMatch() => panic!("Do not know how to proceed with parsing"), + EOF() => { + // TODO: also dedent and end statement here + return + } } + continue; } }), diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs index acf32ef1..95055439 100644 --- a/src/mango/util/strslice/slice.rs +++ b/src/mango/util/strslice/slice.rs @@ -48,6 +48,10 @@ pub fn charsliceto>(text: S, end: isize) -> String { charslice(text, 0, end) } +pub fn glyphat>(text: S, pos: isize) -> String { + charslice(text, pos, pos+1) +} + #[cfg(test)] mod tests { use super::*; @@ -58,9 +62,10 @@ mod tests { assert_eq!("你好", charslice("你好!", 0, 2)); assert_eq!("!", charslicefrom("你好!", 2)); assert_eq!("你好", charsliceto("你好!", 2)); + assert_eq!("好", glyphat("你好!", 1)); // Negative indices should match Python 3 behaviour: assert_eq!("你好", charslice("你好!", -3, -1)); assert_eq!("!", charslicefrom("你好!", -1)); - assert_eq!("你好", charsliceto("你好!", -1)); + assert_eq!("好", glyphat("你好!", -2)); } } From f95c8927e8dd5536c86973a2cad85c45ba741908 Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 14 Jun 2018 15:16:24 +0200 Subject: [PATCH 30/49] Progress on rewriting lexer #52 --- src/mango/lexing/gen_code_lexer.rs | 230 +++++++++++++++-------------- 1 file changed, 116 insertions(+), 114 deletions(-) diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs index a045b377..e8a86956 100644 --- a/src/mango/lexing/gen_code_lexer.rs +++ b/src/mango/lexing/gen_code_lexer.rs @@ -37,6 +37,7 @@ use mango::util::strslice::slice::glyphat; //} struct Container> { + indent: i32, delegate: Option>, reader: Rc>, generator: G, @@ -46,6 +47,7 @@ impl Container>> { fn lex_indents(&mut self) -> Vec { let mut line_indent = 0; + let mut res = Vec::with_capacity(12); while let Match(_) = self.reader.borrow_mut().matches("\\t") { line_indent += 1; } @@ -54,11 +56,9 @@ impl Container>> { // TODO: turn this "new" into a constant if let Match(_) = self.reader.borrow_mut().matches("end") { // If this is followed by an 'end' keyword, then that 'end' is redundant. - self.buffer - .push(Tokens::EndBlock(EndBlockToken::new(true, true))); + yield Tokens::EndBlock(EndBlockToken::new(true, true)); } else { - self.buffer - .push(Tokens::EndBlock(EndBlockToken::new(true, false))); + yield Tokens::EndBlock(EndBlockToken::new(true, false)); } } for _ in self.indent..line_indent { @@ -72,6 +72,7 @@ impl Container>> { pub fn new(&mut self, reader: Rc>) -> Box { let q = 42; Box::new(Container { + indent: 0, reader: reader, delegate: Option::None, generator: Box::new(move || { @@ -91,116 +92,117 @@ impl Container>> { } } - // TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes) - let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); - if let Match(_) = continue_match_res { - // Line continuation has no token, it just continues on the next line. - let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = newline_match_res { - // There should always be a newline after continuations, so that they can be ignored together. - } else { - // All the text between ... and the end of the line is unlexable. - let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); - if let Match(word) = newline_match_res { - yield Tokens::Unlexable(UnlexableToken::new(word)); - // This is a new line, so there may be indents. - // TODO: is there any yield-from like Python? - for res in self.lex_indents() { - yield res; - } - } else { - // TODO: I don't know yet how to deal with '...' followed by end-of-file - panic!() - } - } - // TODO: are continues necessary? it seems more state-independent to restart for each token - continue; - } - let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = newline_match_res { - // Newline WITHOUT line continuation. - // This is a new line, so there may be indents. - yield Tokens::EndStatement(EndStatementToken::new_end_line()); - for res in self.lex_indents() { - yield res; - } - continue; - } - let end_statement_match_res = self.reader.borrow_mut().matches(";"); - if let Match(_) = end_statement_match_res { - // Semicolon, which ends a statement. - // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. - yield Tokens::EndStatement(EndStatementToken::new_semicolon()); - let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = end_line_match_res { - // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). - // This will return the queue of tokens, including the semicolon. - for res in self.lex_indents() { - yield res; - } - } - // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). - continue; - } - - // - // Indentation done; do the rest of lexing. - // - // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. - let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern()); - if let Match(word) = word_match_res { - // Check if it is a keyword. - // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... - if let Ok(keyword) = KeywordToken::from_str(word.clone()) { - yield Tokens::Keyword(keyword); - } - yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap()); - continue; - } - // String literal (delegated). - let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); - if let Match(_) = string_match_res { - let sublexer: Box = Box::new(StringLexer::new_double_quoted(self.reader.clone())); - self.delegate = Option::Some(sublexer); - continue; - } - // Association (before operator). - let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern()); - if let Match(token) = association_match_res { - if glyphat(token, -1) == "=" { - yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO - } else { - yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap()); - } - continue; - } - // Operator. - let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern()); - if let Match(token) = operator_match_res { - yield Tokens::Operator(OperatorToken::from_str(&token).unwrap()); - continue; - } - // Grouping symbols - if let Match(_) = self.reader.borrow_mut().matches(r"\(") { - yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new()); - continue; - } - if let Match(_) = self.reader.borrow_mut().matches(r"\)") { - yield Tokens::ParenthesisClose(ParenthesisCloseToken::new()); - continue; - } - - - let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); - match unknown_word { - Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)), - NoMatch() => panic!("Do not know how to proceed with parsing"), - EOF() => { - // TODO: also dedent and end statement here - return - } - } - continue; +// // TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes) +// let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); +// if let Match(_) = continue_match_res { +// // Line continuation has no token, it just continues on the next line. +// let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); +// if let Match(_) = newline_match_res { +// // There should always be a newline after continuations, so that they can be ignored together. +// } else { +// // All the text between ... and the end of the line is unlexable. +// let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); +// if let Match(word) = newline_match_res { +// yield Tokens::Unlexable(UnlexableToken::new(word)); +// // This is a new line, so there may be indents. +// // TODO: is there any yield-from like Python? +// for res in self.lex_indents() { +// yield res; +// } +// } else { +// // TODO: I don't know yet how to deal with '...' followed by end-of-file +// panic!() +// } +// } +// // TODO: are continues necessary? it seems more state-independent to restart for each token +// continue; +// } +// let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); +// if let Match(_) = newline_match_res { +// // Newline WITHOUT line continuation. +// // This is a new line, so there may be indents. +// yield Tokens::EndStatement(EndStatementToken::new_end_line()); +// for res in self.lex_indents() { +// yield res; +// } +// continue; +// } +// let end_statement_match_res = self.reader.borrow_mut().matches(";"); +// if let Match(_) = end_statement_match_res { +// // Semicolon, which ends a statement. +// // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. +// yield Tokens::EndStatement(EndStatementToken::new_semicolon()); +// let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); +// if let Match(_) = end_line_match_res { +// // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). +// // This will return the queue of tokens, including the semicolon. +// for res in self.lex_indents() { +// yield res; +// } +// } +// // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). +// continue; +// } +// +// // +// // Indentation done; do the rest of lexing. +// // +// // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. +// let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern()); +// if let Match(word) = word_match_res { +// // Check if it is a keyword. +// // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... +// if word == "end" { +// yield Tokens::EndBlock(EndBlockToken::new(false, true)); +// } else if let Ok(keyword) = KeywordToken::from_str(word.clone()) { +// yield Tokens::Keyword(keyword); +// } +// yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap()); +// continue; +// } +// // String literal (delegated). +// let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); +// if let Match(_) = string_match_res { +// let sublexer: Box = Box::new(StringLexer::new_double_quoted(self.reader.clone())); +// self.delegate = Option::Some(sublexer); +// continue; +// } +// // Association (before operator). +// let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern()); +// if let Match(token) = association_match_res { +// if glyphat(token, -1) == "=" { +// yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO +// } else { +// yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap()); +// } +// continue; +// } +// // Operator. +// let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern()); +// if let Match(token) = operator_match_res { +// yield Tokens::Operator(OperatorToken::from_str(&token).unwrap()); +// continue; +// } +// // Grouping symbols +// if let Match(_) = self.reader.borrow_mut().matches(r"\(") { +// yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new()); +// continue; +// } +// if let Match(_) = self.reader.borrow_mut().matches(r"\)") { +// yield Tokens::ParenthesisClose(ParenthesisCloseToken::new()); +// continue; +// } +// +// +// let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); +// match unknown_word { +// Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)), +// NoMatch() => panic!("Do not know how to proceed with parsing"), +// EOF() => { +// // TODO: also dedent and end statement here +// return +// } +// } } }), From ce4dc6210bdb488af99ced591b5c6b6d70e3e452 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 11:56:57 +0200 Subject: [PATCH 31/49] Circular reference problem with generator and container #52 --- src/lib.rs | 2 +- src/mango/lexing/gen_code_lexer.rs | 63 ++++++++++++++++-------------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index eb74ae95..c9a815a3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -#![feature(nll)] +//#![feature(nll)] #![feature(generators, generator_trait)] extern crate core; diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs index e8a86956..a004e04a 100644 --- a/src/mango/lexing/gen_code_lexer.rs +++ b/src/mango/lexing/gen_code_lexer.rs @@ -36,61 +36,59 @@ use mango::util::strslice::slice::glyphat; // } //} -struct Container> { +// TODO: this is problematic because the generator wants references to the container, +// TODO: and the container obviously stores the generator + +struct CodeLexer> { indent: i32, delegate: Option>, reader: Rc>, + // TODO: https://stackoverflow.com/questions/50895121/rust-expects-two-levels-of-boxing-for-generator-while-i-only-specified-one generator: G, } -impl Container>> { +impl CodeLexer>> { fn lex_indents(&mut self) -> Vec { let mut line_indent = 0; let mut res = Vec::with_capacity(12); - while let Match(_) = self.reader.borrow_mut().matches("\\t") { + // TODO: I don't need * in MWE but I do here (and other places), can I get rid of it? + while let Match(_) = (*self.reader).borrow_mut().matches("\\t") { line_indent += 1; } for _ in line_indent..self.indent { // This line is dedented, make end tokens. // TODO: turn this "new" into a constant - if let Match(_) = self.reader.borrow_mut().matches("end") { + if let Match(_) = (*self.reader).borrow_mut().matches("end") { // If this is followed by an 'end' keyword, then that 'end' is redundant. - yield Tokens::EndBlock(EndBlockToken::new(true, true)); + res.push(Tokens::EndBlock(EndBlockToken::new(true, true))); } else { - yield Tokens::EndBlock(EndBlockToken::new(true, false)); + res.push(Tokens::EndBlock(EndBlockToken::new(true, false))); } } for _ in self.indent..line_indent { // This line is indented, make start tokens. - self.buffer.push(Tokens::StartBlock(StartBlockToken::new())); + res.push(Tokens::StartBlock(StartBlockToken::new())); } self.indent = line_indent; - self.lex() + res } - pub fn new(&mut self, reader: Rc>) -> Box { - let q = 42; - Box::new(Container { - indent: 0, - reader: reader, - delegate: Option::None, - generator: Box::new(move || { - - loop { - - // Delegate to another lexer if one is set. - if let Option::Some(delegate) = self.delegate { - match delegate.lex() { - MaybeToken::Token(token) => { - yield token; - continue; - } - MaybeToken::End => { - self.delegate = Option::None; - } + pub fn new(reader: Rc>) -> Box { + let generator: Box + 'static> = Box::new(|| { + loop { + // Delegate to another lexer if one is set. + if let Option::Some(ref mut delegate) = self.delegate { + match delegate.lex() { + MaybeToken::Token(token) => { + yield token; + continue; + } + MaybeToken::End => { + self.delegate = Option::None; } } + } // // TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes) // let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); @@ -203,9 +201,14 @@ impl Container>> { // return // } // } - } + } - }), + }); + Box::new(CodeLexer { + indent: 0, + reader: reader, + delegate: Option::None, + generator: generator, }) } From e66e934340aa8b213d74a225999bdc42715b4f1f Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 12:03:59 +0200 Subject: [PATCH 32/49] Deprecate two lexer implementations #52 --- src/lib.rs | 4 +- src/mango/lexing/code_lexer_prev.rs | 304 ++++++++++++++++++++++++++++ src/mango/lexing/gen_code_lexer.rs | 3 + src/mango/lexing/mod.rs | 1 - 4 files changed, 309 insertions(+), 3 deletions(-) create mode 100644 src/mango/lexing/code_lexer_prev.rs diff --git a/src/lib.rs b/src/lib.rs index c9a815a3..c76db182 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ -//#![feature(nll)] -#![feature(generators, generator_trait)] +#![feature(nll)] +//#![feature(generators, generator_trait)] extern crate core; #[macro_use] diff --git a/src/mango/lexing/code_lexer_prev.rs b/src/mango/lexing/code_lexer_prev.rs new file mode 100644 index 00000000..7877a7e6 --- /dev/null +++ b/src/mango/lexing/code_lexer_prev.rs @@ -0,0 +1,304 @@ + +// TODO: dead code, no longer used + +use mango::io::typ::Reader; +use mango::io::typ::ReaderResult::*; +use mango::lexing::string_lexer::StringLexer; +use mango::lexing::typ::Lexer; +use mango::lexing::typ::MaybeToken; +use mango::token::special::UnlexableToken; +use mango::token::tokens::AssociationToken; +use mango::token::tokens::EndBlockToken; +use mango::token::tokens::EndStatementToken; +use mango::token::tokens::IdentifierToken; +use mango::token::tokens::KeywordToken; +use mango::token::tokens::OperatorToken; +use mango::token::tokens::ParenthesisCloseToken; +use mango::token::tokens::ParenthesisOpenToken; +use mango::token::tokens::StartBlockToken; +use mango::token::Tokens; +use mango::util::collection::Queue; +use std::cell::RefCell; +use std::rc::Rc; + +// TODO: Preferably there'd be only one Lexer at a time which has a Reader, but I did not get that to work, +// TODO: see this SO question: https://stackoverflow.com/questions/50535022/borrow-checker-problems-for-parser-that-can-delegate + +enum ReaderOrDelegate { + Reader(), + Delegate(Box), +} + +pub struct CodeLexer { + // reader: Rc>, + indent: i32, + + reader: Rc>, + // This delegate deals with nested structures, like string literals and comments. + reader_or_delegate: ReaderOrDelegate, + // This is unfortunate, would not be needed with 'yield' but is now for indents. + buffer: Queue, +} + +impl CodeLexer { + pub fn new(reader: Rc>) -> Self { + CodeLexer { + reader: reader, + reader_or_delegate: ReaderOrDelegate::Reader(), + indent: 0, + buffer: Queue::new(), + } + } + + fn lex_indents(&mut self) -> MaybeToken { + let mut line_indent = 0; + while let Match(_) = self.reader.borrow_mut().matches("\\t") { + line_indent += 1; + } + for _ in line_indent..self.indent { + // This line is dedented, make end tokens. + // TODO: turn this "new" into a constant + if let Match(_) = self.reader.borrow_mut().matches("end") { + // If this is followed by an 'end' keyword, then that 'end' is redundant. + self.buffer + .push(Tokens::EndBlock(EndBlockToken::new(true, true))); + } else { + self.buffer + .push(Tokens::EndBlock(EndBlockToken::new(true, false))); + } + } + for _ in self.indent..line_indent { + // This line is indented, make start tokens. + self.buffer.push(Tokens::StartBlock(StartBlockToken::new())); + } + self.indent = line_indent; + self.lex() + } +} + +impl Lexer for CodeLexer { + // TODO: TURN THIS AROUND: MAKE A FUNCTION THAT RETURNS FROM A QUEUE, AND CALLS ANOTHER TO FILL THE QUEUE IF NO RETURN + + fn lex(&mut self) -> MaybeToken { + use self::MaybeToken::*; + + // If currently delegating to a sub-lexer, return from that. + match self.reader_or_delegate { + ReaderOrDelegate::Delegate(ref mut delegate) => { + let delegated_token = delegate.lex(); + match delegated_token { + End => { + // Swap back from delegation to direct mode. + // let reader = delegate.get_reader().clone(); + self.reader_or_delegate = ReaderOrDelegate::Reader(); + self.lex() + } + Token(token) => Token(token), + } + // Code to stop delegation cannot be here, because `self` is still mutably borrowed through `delegate` + } + ReaderOrDelegate::Reader() => { + // todo: maybe this branch could be a separate function? + + // If there is a buffer due to indentation or continuations, return from that. + if let Some(token) = self.buffer.pop() { + return Token(token); + } + // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. + let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); + if let Match(_) = continue_match_res { + // Line continuation has no token, it just continues on the next line. + let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = newline_match_res { + // There should always be a newline after continuations, so that they can be ignored together. + } else { + let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); + if let Match(word) = newline_match_res { + self.buffer + .push(Tokens::Unlexable(UnlexableToken::new(word))); + // This is a new line, so there may be indents. + self.lex_indents(); + return self.lex(); + } else { + // TODO: I don't know yet how to deal with '...' followed by end-of-file + panic!() + } + } + } + let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = newline_match_res { + // Newline WITHOUT line continuation. + // This is a new line, so there may be indents. + self.buffer + .push(Tokens::EndStatement(EndStatementToken::new_end_line())); + self.lex_indents(); + return self.lex(); + } + let end_statement_match_res = self.reader.borrow_mut().matches(";"); + if let Match(_) = end_statement_match_res { + // Semicolon, which ends a statement. + // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. + self.buffer + .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); + let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = end_line_match_res { + // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). + // This will return the queue of tokens, including the semicolon. + return self.lex_indents(); + } + // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). + return Token(self.buffer.pop().unwrap()); + } + // + // Indentation done; do the rest of lexing. + // + // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. + if let Match(word) = self + .reader + .borrow_mut() + .matches(IdentifierToken::subpattern()) + { + // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + return Token(Tokens::Keyword(keyword)); + } + return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); + } + // Literal + let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); + if let Match(_) = string_match_res { + let sublexer: Box = + Box::new(StringLexer::new_double_quoted(self.reader.clone())); + self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); + return self.lex(); + } + // Association (before operator) + let association_match_res = self + .reader + .borrow_mut() + .matches(&AssociationToken::subpattern()); + if let Match(token) = association_match_res { + if token.chars().last().unwrap() == '=' { + // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); + return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO + } else { + return Token(Tokens::Association(AssociationToken::from_unprefixed())); + } + } + // Operator + let operator_match_res = self + .reader + .borrow_mut() + .matches(OperatorToken::subpattern()); + if let Match(token) = operator_match_res { + return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); + } + // Grouping symbols + if let Match(_) = self.reader.borrow_mut().matches(r"\(") { + return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + } + if let Match(_) = self.reader.borrow_mut().matches(r"\)") { + return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); + } + + let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); + match unknown_word { + Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), + NoMatch() => { + println!("END {:?}", self.reader.borrow()); // TODO + panic!("Do not know how to proceed with parsing") + } + EOF() => { + // TODO: also dedent and end statement here + End + } + } + } + } + } + + fn get_reader(&self) -> Rc> { + match self.reader_or_delegate { + ReaderOrDelegate::Reader() => self.reader.clone(), + ReaderOrDelegate::Delegate(ref delegate) => delegate.get_reader(), + } + } +} + +#[cfg(test)] +mod tests { + use super::CodeLexer; + use mango::io::fortest::StringReader; + use mango::lexing::util::lex_all::{lex_all, LexList}; + use mango::token::tokens::AssociationToken; + use mango::token::tokens::EndBlockToken; + use mango::token::tokens::EndStatementToken; + use mango::token::tokens::IdentifierToken; + use mango::token::tokens::KeywordToken; + use mango::token::tokens::LiteralToken; + use mango::token::tokens::OperatorToken; + use mango::token::tokens::ParenthesisCloseToken; + use mango::token::tokens::ParenthesisOpenToken; + use mango::token::tokens::StartBlockToken; + use mango::token::Tokens; + use std::cell::RefCell; + use std::ops::Generator; + use std::rc::Rc; + + fn assert_text_to_tokens(text: &str, tokens: Vec) { + assert_eq!( + LexList::from_tokens(tokens), + lex_all(&mut CodeLexer::new(Rc::new(RefCell::new( + StringReader::new(text.to_owned()) + )))) + ) + } + + #[test] + fn test_lexing_individual() { + assert_text_to_tokens( + "if", + vec![Tokens::Keyword( + KeywordToken::from_str("if".to_owned()).unwrap(), + )], + ); + // todo: more + } + + #[test] + fn test_lexing_combined() { + assert_text_to_tokens( + "let x = 0\nfor x < 128\n\tx += 1", + vec![ + Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_unprefixed()), + Tokens::Literal(LiteralToken::Int(0)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::Keyword(KeywordToken::from_str("for".to_owned()).unwrap()), + Tokens::Operator(OperatorToken::from_str("<").unwrap()), + Tokens::Literal(LiteralToken::Int(128)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::StartBlock(StartBlockToken::new()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_str("+".to_owned()).unwrap()), + Tokens::Literal(LiteralToken::Int(1)), + Tokens::EndBlock(EndBlockToken::new(true, false)), + ], + ); + } + + #[test] + fn test_lexing_delegation() {} + + #[test] + fn generators() { + let mut gen = || { + yield Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()); + yield Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()); + yield Tokens::Association(AssociationToken::from_unprefixed()); + return; + }; + let first = unsafe { gen.resume() }; + } +} diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs index a004e04a..655ad282 100644 --- a/src/mango/lexing/gen_code_lexer.rs +++ b/src/mango/lexing/gen_code_lexer.rs @@ -1,3 +1,6 @@ + +// TODO: dead code, no longer used + use mango::io::typ::Reader; use mango::io::typ::ReaderResult::*; use mango::lexing::string_lexer::StringLexer; diff --git a/src/mango/lexing/mod.rs b/src/mango/lexing/mod.rs index 9fe3a49a..254d9a1c 100644 --- a/src/mango/lexing/mod.rs +++ b/src/mango/lexing/mod.rs @@ -1,7 +1,6 @@ pub mod typ; pub mod code_lexer; -mod gen_code_lexer; pub mod comment_lexer; From 71e65185a665389739ee40025e1f38e59087cd38 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 15:43:01 +0200 Subject: [PATCH 33/49] Some utils for lexing #52 --- src/mango/io/fortest/stringreader.rs | 4 ++++ src/mango/io/typ.rs | 4 ++++ src/mango/util/collection/mod.rs | 3 +++ src/mango/util/collection/queue.rs | 18 +++++++++++------- src/mango/util/collection/stack.rs | 27 +++++++++++++++++++++++++++ 5 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 src/mango/util/collection/stack.rs diff --git a/src/mango/io/fortest/stringreader.rs b/src/mango/io/fortest/stringreader.rs index fd5a1d4c..9419fb5a 100644 --- a/src/mango/io/fortest/stringreader.rs +++ b/src/mango/io/fortest/stringreader.rs @@ -52,6 +52,10 @@ impl Reader for StringReader { }; }) } + + fn get_progress(&self) -> usize { + self.index + } } // TODO: tests (spaces, end) diff --git a/src/mango/io/typ.rs b/src/mango/io/typ.rs index 5f8fd9f0..6aedf0d5 100644 --- a/src/mango/io/typ.rs +++ b/src/mango/io/typ.rs @@ -17,6 +17,10 @@ pub trait Reader: Debug { /// /// This has to eventually return EOF, and keep returning EOF forever after that. fn matches(&mut self, subpattern: &str) -> ReaderResult; + + /// Return a number that can be used to check whether the state has changed. + /// This need not correspond to a specific position, but should be unique for the progress. + fn get_progress(&self) -> usize; } pub trait Writer { diff --git a/src/mango/util/collection/mod.rs b/src/mango/util/collection/mod.rs index 5f327a57..f31304ed 100644 --- a/src/mango/util/collection/mod.rs +++ b/src/mango/util/collection/mod.rs @@ -1,2 +1,5 @@ pub mod queue; pub use self::queue::Queue; + +pub mod stack; +pub use self::stack::Stack; diff --git a/src/mango/util/collection/queue.rs b/src/mango/util/collection/queue.rs index 02996c44..8eb43ec1 100644 --- a/src/mango/util/collection/queue.rs +++ b/src/mango/util/collection/queue.rs @@ -1,23 +1,27 @@ -use std::collections::VecDeque; -/// A one-ended queue. -/// This is just a wrapper around deque so nobody pushes or pops the wrong end. +/// A one-ended queue. See also [Stack]. +/// This is just a wrapper around vec so nobody pushes or pops the wrong end. pub struct Queue { - deque: VecDeque, + items: Vec, } impl Queue { pub fn new() -> Self { Queue { - deque: VecDeque::with_capacity(16), + items: Vec::with_capacity(16), } } pub fn push(&mut self, value: T) { - self.deque.push_back(value) + self.items.push(value) } pub fn pop(&mut self) -> Option { - self.deque.pop_front() + self.items.pop() + } + + /// Moves all the elements from a vector into the queue. + pub fn append(&mut self, mut other: Vec) { + self.items.append(&mut other); } } diff --git a/src/mango/util/collection/stack.rs b/src/mango/util/collection/stack.rs new file mode 100644 index 00000000..055c0a18 --- /dev/null +++ b/src/mango/util/collection/stack.rs @@ -0,0 +1,27 @@ +use std::collections::VecDeque; + +/// A one-ended stack. See also [Queue]. +/// This is just a wrapper around deque so nobody pushes or pops the wrong end. +pub struct Stack { + items: VecDeque, +} + +impl Stack { + pub fn new() -> Self { + Stack { + items: VecDeque::with_capacity(16), + } + } + + pub fn push(&mut self, value: T) { + self.items.push_back(value) + } + + pub fn pop(&mut self) -> Option { + self.items.pop_back() + } + + pub fn borrow_mut(&mut self) -> Option<&mut T> { + self.items.back_mut() + } +} From e6610704d1ad181d5f6800cd7a98dcf35b065a21 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 15:43:19 +0200 Subject: [PATCH 34/49] Implement the combi-lexer #52 --- src/mango/lexing/code_lexer.rs | 36 +++---- src/mango/lexing/combi_lexer.rs | 149 +++++++++++++++++++++++++++++ src/mango/lexing/gen_code_lexer.rs | 1 + src/mango/lexing/mod.rs | 12 ++- src/mango/lexing/string_lexer.rs | 6 +- src/mango/lexing/typ.rs | 18 +++- 6 files changed, 187 insertions(+), 35 deletions(-) create mode 100644 src/mango/lexing/combi_lexer.rs diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index ddc06f3f..06fda0f7 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -17,6 +17,8 @@ use mango::token::Tokens; use mango::util::collection::Queue; use std::cell::RefCell; use std::rc::Rc; +use mango::lexing::typ::SubLexer; +use mango::lexing::typ::SubLexerResult; // TODO: Preferably there'd be only one Lexer at a time which has a Reader, but I did not get that to work, // TODO: see this SO question: https://stackoverflow.com/questions/50535022/borrow-checker-problems-for-parser-that-can-delegate @@ -30,7 +32,6 @@ pub struct CodeLexer { // reader: Rc>, indent: i32, - reader: Rc>, // This delegate deals with nested structures, like string literals and comments. reader_or_delegate: ReaderOrDelegate, // This is unfortunate, would not be needed with 'yield' but is now for indents. @@ -38,16 +39,15 @@ pub struct CodeLexer { } impl CodeLexer { - pub fn new(reader: Rc>) -> Self { + pub fn new() -> Self { CodeLexer { - reader: reader, reader_or_delegate: ReaderOrDelegate::Reader(), indent: 0, buffer: Queue::new(), } } - fn lex_indents(&mut self) -> MaybeToken { + fn lex_indents(&mut self, reader: Box) -> MaybeToken { let mut line_indent = 0; while let Match(_) = self.reader.borrow_mut().matches("\\t") { line_indent += 1; @@ -73,10 +73,10 @@ impl CodeLexer { } } -impl Lexer for CodeLexer { +impl SubLexer for CodeLexer { // TODO: TURN THIS AROUND: MAKE A FUNCTION THAT RETURNS FROM A QUEUE, AND CALLS ANOTHER TO FILL THE QUEUE IF NO RETURN - fn lex(&mut self) -> MaybeToken { + fn lex_pass(&mut self, reader: Box) -> SubLexerResult { use self::MaybeToken::*; // If currently delegating to a sub-lexer, return from that. @@ -214,12 +214,12 @@ impl Lexer for CodeLexer { } } - fn get_reader(&self) -> Rc> { - match self.reader_or_delegate { - ReaderOrDelegate::Reader() => self.reader.clone(), - ReaderOrDelegate::Delegate(ref delegate) => delegate.get_reader(), - } - } +// fn get_reader(&self) -> Rc> { +// match self.reader_or_delegate { +// ReaderOrDelegate::Reader() => self.reader.clone(), +// ReaderOrDelegate::Delegate(ref delegate) => delegate.get_reader(), +// } +// } } #[cfg(test)] @@ -239,7 +239,6 @@ mod tests { use mango::token::tokens::StartBlockToken; use mango::token::Tokens; use std::cell::RefCell; - use std::ops::Generator; use std::rc::Rc; fn assert_text_to_tokens(text: &str, tokens: Vec) { @@ -287,15 +286,4 @@ mod tests { #[test] fn test_lexing_delegation() {} - - #[test] - fn generators() { - let mut gen = || { - yield Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()); - yield Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()); - yield Tokens::Association(AssociationToken::from_unprefixed()); - return; - }; - let first = unsafe { gen.resume() }; - } } diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs new file mode 100644 index 00000000..9d85aa36 --- /dev/null +++ b/src/mango/lexing/combi_lexer.rs @@ -0,0 +1,149 @@ +use mango::io::typ::Reader; +use mango::io::typ::ReaderResult::*; +use mango::lexing::code_lexer::CodeLexer; +use mango::lexing::string_lexer::StringLexer; +use mango::lexing::typ::Lexer; +use mango::lexing::typ::SubLexer; +use mango::lexing::typ::MaybeToken; +use mango::lexing::typ::SubLexerResult; +use mango::token::special::UnlexableToken; +use mango::token::Tokens; +use mango::token::tokens::AssociationToken; +use mango::token::tokens::EndBlockToken; +use mango::token::tokens::EndStatementToken; +use mango::token::tokens::IdentifierToken; +use mango::token::tokens::KeywordToken; +use mango::token::tokens::OperatorToken; +use mango::token::tokens::ParenthesisCloseToken; +use mango::token::tokens::ParenthesisOpenToken; +use mango::token::tokens::StartBlockToken; +use mango::util::collection::Queue; +use mango::util::collection::Stack; +use std::cell::RefCell; +use std::rc::Rc; + + +pub struct CombiLexer { + reader: Box, + lexers: Stack>, + buffer: Queue, +} + +impl CombiLexer { + pub fn new(reader: Box) -> Self { + let mut lexers: Stack> = Stack::new(); + lexers.push(Box::new(CodeLexer::new())); + CombiLexer { + reader: reader, + lexers: lexers, + buffer: Queue::new(), + } + } +} + +impl Lexer for CombiLexer { + fn lex(&mut self) -> MaybeToken { + + // If there are tokens in the buffer, return from there; + if let Option::Some(token) = self.buffer.pop() { + return MaybeToken::Token(token); + } + + match self.lexers.borrow_mut() { + // No more lexers to delegate to; lexing is finished. + Option::None => MaybeToken::End, + Option::Some(ref mut lexer) => { + match lexer.lex_pass(self.reader) { + SubLexerResult::Tokens(tokens) => { + if tokens.len() > 0 { + // The sublexer produced tokens, queue them. + self.buffer.append(tokens); + self.lex() // TODO: if every branch does this, move it down + } else { + // No tokens were produced; make sure the reader has advanced to prevent infinite loops. + // TODO: check reader state + self.lex() + } + }, + SubLexerResult::Delegate(lexer) => { + // Switch to a different delegate lexer. + self.lexers.push(lexer); + self.lex() + }, + SubLexerResult::End => { + // The sublexer is done, remove it from the stack and continue with the next. + self.lexers.pop(); // This needs non-lexical lifetimes + self.lex() + }, + } + } + } + } + +} + +#[cfg(test)] +mod tests { + use mango::io::fortest::StringReader; + use mango::lexing::util::lex_all::{lex_all, LexList}; + use mango::token::Tokens; + use mango::token::tokens::AssociationToken; + use mango::token::tokens::EndBlockToken; + use mango::token::tokens::EndStatementToken; + use mango::token::tokens::IdentifierToken; + use mango::token::tokens::KeywordToken; + use mango::token::tokens::LiteralToken; + use mango::token::tokens::OperatorToken; + use mango::token::tokens::ParenthesisCloseToken; + use mango::token::tokens::ParenthesisOpenToken; + use mango::token::tokens::StartBlockToken; + use std::cell::RefCell; + use std::rc::Rc; + use super::CombiLexer; + + fn assert_text_to_tokens(text: &str, tokens: Vec) { + assert_eq!( + LexList::from_tokens(tokens), + lex_all(&mut CombiLexer::new(Rc::new(RefCell::new( + StringReader::new(text.to_owned()) + )))) + ) + } + + #[test] + fn test_lexing_individual() { + assert_text_to_tokens( + "if", + vec![Tokens::Keyword( + KeywordToken::from_str("if".to_owned()).unwrap(), + )], + ); + // todo: more + } + + #[test] + fn test_lexing_combined() { + assert_text_to_tokens( + "let x = 0\nfor x < 128\n\tx += 1", + vec![ + Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_unprefixed()), + Tokens::Literal(LiteralToken::Int(0)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::Keyword(KeywordToken::from_str("for".to_owned()).unwrap()), + Tokens::Operator(OperatorToken::from_str("<").unwrap()), + Tokens::Literal(LiteralToken::Int(128)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::StartBlock(StartBlockToken::new()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_str("+".to_owned()).unwrap()), + Tokens::Literal(LiteralToken::Int(1)), + Tokens::EndBlock(EndBlockToken::new(true, false)), + ], + ); + } + + #[test] + fn test_lexing_delegation() {} +} diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs index 655ad282..35d9737c 100644 --- a/src/mango/lexing/gen_code_lexer.rs +++ b/src/mango/lexing/gen_code_lexer.rs @@ -42,6 +42,7 @@ use mango::util::strslice::slice::glyphat; // TODO: this is problematic because the generator wants references to the container, // TODO: and the container obviously stores the generator +// TODO: use generator: Box> directory struct CodeLexer> { indent: i32, delegate: Option>, diff --git a/src/mango/lexing/mod.rs b/src/mango/lexing/mod.rs index 254d9a1c..9d251b70 100644 --- a/src/mango/lexing/mod.rs +++ b/src/mango/lexing/mod.rs @@ -1,9 +1,11 @@ -pub mod typ; +mod typ; -pub mod code_lexer; +mod combi_lexer; -pub mod comment_lexer; +mod code_lexer; -pub mod string_lexer; +mod comment_lexer; -pub mod util; +mod string_lexer; + +mod util; diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs index 8e4adc83..18d313d7 100644 --- a/src/mango/lexing/string_lexer.rs +++ b/src/mango/lexing/string_lexer.rs @@ -42,7 +42,7 @@ impl Lexer for StringLexer { } } - fn get_reader(&self) -> Rc> { - self.reader.clone() - } +// fn get_reader(&self) -> Rc> { +// self.reader.clone() +// } } diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs index 8ea53ba5..ee98c1fc 100644 --- a/src/mango/lexing/typ.rs +++ b/src/mango/lexing/typ.rs @@ -1,7 +1,19 @@ use mango::io::typ::Reader; use mango::token::Tokens; -use std::cell::RefCell; -use std::rc::Rc; + +// TODO: I don't want this to be public outside the crate +pub enum SubLexerResult { + Tokens(Vec), + Delegate(Box), + End, +} + +// TODO: I don't want this to be public outside the crate +pub trait SubLexer { + /// Does one iteration of a sublexer, which should either delegate or return tokens. + /// If an empty vector of tokens is returned, the reader should have advanced (to prevent infinite loops). + fn lex_pass(&mut self, reader: Box) -> SubLexerResult; +} pub enum MaybeToken { Token(Tokens), @@ -17,5 +29,5 @@ pub trait Lexer { /// Every call to lex returns a token until the end of the input. fn lex(&mut self) -> MaybeToken; - fn get_reader(&self) -> Rc>; +// fn get_reader(&self) -> Rc>; } From 491e214219629a51eddd685d1f6666a7484ebbb3 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 17:06:35 +0200 Subject: [PATCH 35/49] Start restructuring code lexer #52 --- src/mango/lexing/code_lexer.rs | 361 +++++++++++--------------------- src/mango/lexing/combi_lexer.rs | 8 +- src/mango/lexing/typ.rs | 11 +- 3 files changed, 134 insertions(+), 246 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 06fda0f7..de2d3d09 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -3,7 +3,10 @@ use mango::io::typ::ReaderResult::*; use mango::lexing::string_lexer::StringLexer; use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; +use mango::lexing::typ::SubLexer; +use mango::lexing::typ::SubLexerResult; use mango::token::special::UnlexableToken; +use mango::token::Tokens; use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; @@ -13,277 +16,169 @@ use mango::token::tokens::OperatorToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; -use mango::token::Tokens; use mango::util::collection::Queue; use std::cell::RefCell; use std::rc::Rc; -use mango::lexing::typ::SubLexer; -use mango::lexing::typ::SubLexerResult; - -// TODO: Preferably there'd be only one Lexer at a time which has a Reader, but I did not get that to work, -// TODO: see this SO question: https://stackoverflow.com/questions/50535022/borrow-checker-problems-for-parser-that-can-delegate - -enum ReaderOrDelegate { - Reader(), - Delegate(Box), -} pub struct CodeLexer { - // reader: Rc>, indent: i32, - - // This delegate deals with nested structures, like string literals and comments. - reader_or_delegate: ReaderOrDelegate, - // This is unfortunate, would not be needed with 'yield' but is now for indents. buffer: Queue, } +// TODO: keep the regexes in thread local global scope storage + impl CodeLexer { pub fn new() -> Self { CodeLexer { - reader_or_delegate: ReaderOrDelegate::Reader(), indent: 0, buffer: Queue::new(), } } - fn lex_indents(&mut self, reader: Box) -> MaybeToken { + fn lex_indents(&mut self, reader: &mut Box) -> Vec { let mut line_indent = 0; - while let Match(_) = self.reader.borrow_mut().matches("\\t") { + while let Match(_) = reader.matches("\\t") { line_indent += 1; } - for _ in line_indent..self.indent { - // This line is dedented, make end tokens. - // TODO: turn this "new" into a constant - if let Match(_) = self.reader.borrow_mut().matches("end") { + let mut tokens: Vec = Vec::with_capacity(8); + if line_indent < self.indent { + if let Match(_) = reader.matches(r"end\s") { // If this is followed by an 'end' keyword, then that 'end' is redundant. - self.buffer - .push(Tokens::EndBlock(EndBlockToken::new(true, true))); + tokens.push(Tokens::EndBlock(EndBlockToken::new(true, true))); } else { - self.buffer - .push(Tokens::EndBlock(EndBlockToken::new(true, false))); + tokens.push(Tokens::EndBlock(EndBlockToken::new(true, false))); + } + for _ in line_indent..(self.indent - 1) { + // This line is dedented, make end tokens. + tokens.push(Tokens::EndBlock(EndBlockToken::new(true, false))); } } for _ in self.indent..line_indent { // This line is indented, make start tokens. + // TODO: increasing indent by more than one should be a warning self.buffer.push(Tokens::StartBlock(StartBlockToken::new())); } self.indent = line_indent; - self.lex() + tokens } } impl SubLexer for CodeLexer { - // TODO: TURN THIS AROUND: MAKE A FUNCTION THAT RETURNS FROM A QUEUE, AND CALLS ANOTHER TO FILL THE QUEUE IF NO RETURN - - fn lex_pass(&mut self, reader: Box) -> SubLexerResult { - use self::MaybeToken::*; - - // If currently delegating to a sub-lexer, return from that. - match self.reader_or_delegate { - ReaderOrDelegate::Delegate(ref mut delegate) => { - let delegated_token = delegate.lex(); - match delegated_token { - End => { - // Swap back from delegation to direct mode. - // let reader = delegate.get_reader().clone(); - self.reader_or_delegate = ReaderOrDelegate::Reader(); - self.lex() - } - Token(token) => Token(token), - } - // Code to stop delegation cannot be here, because `self` is still mutably borrowed through `delegate` - } - ReaderOrDelegate::Reader() => { - // todo: maybe this branch could be a separate function? - - // If there is a buffer due to indentation or continuations, return from that. - if let Some(token) = self.buffer.pop() { - return Token(token); - } - // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. - let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); - if let Match(_) = continue_match_res { - // Line continuation has no token, it just continues on the next line. - let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = newline_match_res { - // There should always be a newline after continuations, so that they can be ignored together. - } else { - let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); - if let Match(word) = newline_match_res { - self.buffer - .push(Tokens::Unlexable(UnlexableToken::new(word))); - // This is a new line, so there may be indents. - self.lex_indents(); - return self.lex(); - } else { - // TODO: I don't know yet how to deal with '...' followed by end-of-file - panic!() - } - } - } - let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = newline_match_res { - // Newline WITHOUT line continuation. + fn lex_pass(&mut self, reader: &mut Box) -> SubLexerResult { + use self::SubLexerResult::*; + + // TODO: put all these match results inline + + // End of line continuation + let continue_match_res = reader.matches(r"\.\.\."); + if let Match(_) = continue_match_res { + // Line continuation has no token, it just continues on the next line, ignoring indents (for now). + let newline_match_res = reader.matches(r"\n\r?\t*"); + if let Match(_) = newline_match_res { + // There should always be a newline after continuations, so that they can be ignored together. + } else { + // The rest of this line is unparsable. + let newline_match_res = reader.matches("[^\\n]*\\n\\r?"); + if let Match(word) = newline_match_res { + let mut res: Vec = vec![Tokens::Unlexable(UnlexableToken::new(word))]; // This is a new line, so there may be indents. - self.buffer - .push(Tokens::EndStatement(EndStatementToken::new_end_line())); - self.lex_indents(); - return self.lex(); - } - let end_statement_match_res = self.reader.borrow_mut().matches(";"); - if let Match(_) = end_statement_match_res { - // Semicolon, which ends a statement. - // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. - self.buffer - .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); - let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = end_line_match_res { - // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). - // This will return the queue of tokens, including the semicolon. - return self.lex_indents(); - } - // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). - return Token(self.buffer.pop().unwrap()); - } - // - // Indentation done; do the rest of lexing. - // - // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. - if let Match(word) = self - .reader - .borrow_mut() - .matches(IdentifierToken::subpattern()) - { - // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... - if let Ok(keyword) = KeywordToken::from_str(word.clone()) { - return Token(Tokens::Keyword(keyword)); - } - return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); - } - // Literal - let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); - if let Match(_) = string_match_res { - let sublexer: Box = - Box::new(StringLexer::new_double_quoted(self.reader.clone())); - self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); - return self.lex(); - } - // Association (before operator) - let association_match_res = self - .reader - .borrow_mut() - .matches(&AssociationToken::subpattern()); - if let Match(token) = association_match_res { - if token.chars().last().unwrap() == '=' { - // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); - return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO - } else { - return Token(Tokens::Association(AssociationToken::from_unprefixed())); - } - } - // Operator - let operator_match_res = self - .reader - .borrow_mut() - .matches(OperatorToken::subpattern()); - if let Match(token) = operator_match_res { - return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); - } - // Grouping symbols - if let Match(_) = self.reader.borrow_mut().matches(r"\(") { - return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); - } - if let Match(_) = self.reader.borrow_mut().matches(r"\)") { - return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); - } - - let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); - match unknown_word { - Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), - NoMatch() => { - println!("END {:?}", self.reader.borrow()); // TODO - panic!("Do not know how to proceed with parsing") - } - EOF() => { - // TODO: also dedent and end statement here - End - } + res.append(&mut self.lex_indents(reader)); + return Result(res); + } else { + // TODO: I don't know yet how to deal with '...' followed by end-of-file + panic!() } } } - } -// fn get_reader(&self) -> Rc> { -// match self.reader_or_delegate { -// ReaderOrDelegate::Reader() => self.reader.clone(), -// ReaderOrDelegate::Delegate(ref delegate) => delegate.get_reader(), + panic!(); +// let newline_match_res = reader.matches("\\n\\r?"); +// if let Match(_) = newline_match_res { +// // Newline WITHOUT line continuation. +// // This is a new line, so there may be indents. +// self.buffer +// .push(Tokens::EndStatement(EndStatementToken::new_end_line())); +// self.lex_indents(); +// return self.lex(); +// } +// let end_statement_match_res = reader.matches(";"); +// if let Match(_) = end_statement_match_res { +// // Semicolon, which ends a statement. +// // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. +// self.buffer +// .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); +// let end_line_match_res = reader.matches("\\n\\r?"); +// if let Match(_) = end_line_match_res { +// // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). +// // This will return the queue of tokens, including the semicolon. +// return self.lex_indents(); +// } +// // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). +// return Token(self.buffer.pop().unwrap()); +// } +// // +// // Indentation done; do the rest of lexing. +// // +// // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. +// if let Match(word) = self +// .reader +// .borrow_mut() +// .matches(IdentifierToken::subpattern()) +// { +// // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... +// if let Ok(keyword) = KeywordToken::from_str(word.clone()) { +// return Token(Tokens::Keyword(keyword)); +// } +// return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); +// } +// // Literal +// let string_match_res = reader.matches("[a-z]?\""); +// if let Match(_) = string_match_res { +// let sublexer: Box = +// Box::new(StringLexer::new_double_quoted(self.reader.clone())); +// self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); +// return self.lex(); +// } +// // Association (before operator) +// let association_match_res = self +// .reader +// .borrow_mut() +// .matches(&AssociationToken::subpattern()); +// if let Match(token) = association_match_res { +// if token.chars().last().unwrap() == '=' { +// // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); +// return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO +// } else { +// return Token(Tokens::Association(AssociationToken::from_unprefixed())); +// } +// } +// // Operator +// let operator_match_res = self +// .reader +// .borrow_mut() +// .matches(OperatorToken::subpattern()); +// if let Match(token) = operator_match_res { +// return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); +// } +// // Grouping symbols +// if let Match(_) = reader.matches(r"\(") { +// return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); +// } +// if let Match(_) = reader.matches(r"\)") { +// return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); +// } +// +// let unknown_word = reader.matches("[^\\s]+"); +// match unknown_word { +// Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), +// NoMatch() => { +// println!("END {:?}", self.reader.borrow()); // TODO +// panic!("Do not know how to proceed with parsing") +// } +// EOF() => { +// // TODO: also dedent and end statement here +// End +// } // } -// } -} - -#[cfg(test)] -mod tests { - use super::CodeLexer; - use mango::io::fortest::StringReader; - use mango::lexing::util::lex_all::{lex_all, LexList}; - use mango::token::tokens::AssociationToken; - use mango::token::tokens::EndBlockToken; - use mango::token::tokens::EndStatementToken; - use mango::token::tokens::IdentifierToken; - use mango::token::tokens::KeywordToken; - use mango::token::tokens::LiteralToken; - use mango::token::tokens::OperatorToken; - use mango::token::tokens::ParenthesisCloseToken; - use mango::token::tokens::ParenthesisOpenToken; - use mango::token::tokens::StartBlockToken; - use mango::token::Tokens; - use std::cell::RefCell; - use std::rc::Rc; - - fn assert_text_to_tokens(text: &str, tokens: Vec) { - assert_eq!( - LexList::from_tokens(tokens), - lex_all(&mut CodeLexer::new(Rc::new(RefCell::new( - StringReader::new(text.to_owned()) - )))) - ) - } - - #[test] - fn test_lexing_individual() { - assert_text_to_tokens( - "if", - vec![Tokens::Keyword( - KeywordToken::from_str("if".to_owned()).unwrap(), - )], - ); - // todo: more - } - - #[test] - fn test_lexing_combined() { - assert_text_to_tokens( - "let x = 0\nfor x < 128\n\tx += 1", - vec![ - Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()), - Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), - Tokens::Association(AssociationToken::from_unprefixed()), - Tokens::Literal(LiteralToken::Int(0)), - Tokens::EndStatement(EndStatementToken::new_end_line()), - Tokens::Keyword(KeywordToken::from_str("for".to_owned()).unwrap()), - Tokens::Operator(OperatorToken::from_str("<").unwrap()), - Tokens::Literal(LiteralToken::Int(128)), - Tokens::EndStatement(EndStatementToken::new_end_line()), - Tokens::StartBlock(StartBlockToken::new()), - Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), - Tokens::Association(AssociationToken::from_str("+".to_owned()).unwrap()), - Tokens::Literal(LiteralToken::Int(1)), - Tokens::EndBlock(EndBlockToken::new(true, false)), - ], - ); } - - #[test] - fn test_lexing_delegation() {} } diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs index 9d85aa36..b0fc9b84 100644 --- a/src/mango/lexing/combi_lexer.rs +++ b/src/mango/lexing/combi_lexer.rs @@ -53,8 +53,8 @@ impl Lexer for CombiLexer { // No more lexers to delegate to; lexing is finished. Option::None => MaybeToken::End, Option::Some(ref mut lexer) => { - match lexer.lex_pass(self.reader) { - SubLexerResult::Tokens(tokens) => { + match lexer.lex_pass(&mut self.reader) { + SubLexerResult::Result(tokens) => { if tokens.len() > 0 { // The sublexer produced tokens, queue them. self.buffer.append(tokens); @@ -104,9 +104,9 @@ mod tests { fn assert_text_to_tokens(text: &str, tokens: Vec) { assert_eq!( LexList::from_tokens(tokens), - lex_all(&mut CombiLexer::new(Rc::new(RefCell::new( + lex_all(&mut CombiLexer::new(Box::new( StringReader::new(text.to_owned()) - )))) + ))) ) } diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs index ee98c1fc..0aba1981 100644 --- a/src/mango/lexing/typ.rs +++ b/src/mango/lexing/typ.rs @@ -3,7 +3,7 @@ use mango::token::Tokens; // TODO: I don't want this to be public outside the crate pub enum SubLexerResult { - Tokens(Vec), + Result(Vec), Delegate(Box), End, } @@ -12,7 +12,7 @@ pub enum SubLexerResult { pub trait SubLexer { /// Does one iteration of a sublexer, which should either delegate or return tokens. /// If an empty vector of tokens is returned, the reader should have advanced (to prevent infinite loops). - fn lex_pass(&mut self, reader: Box) -> SubLexerResult; + fn lex_pass(&mut self, reader: &mut Box) -> SubLexerResult; } pub enum MaybeToken { @@ -21,13 +21,6 @@ pub enum MaybeToken { } pub trait Lexer { - // /// Create a new lexer from a reader instance. - // fn new(reader: &'r mut Reader) -> Self; - - // fn new(reader: Rc>); - /// Every call to lex returns a token until the end of the input. fn lex(&mut self) -> MaybeToken; - -// fn get_reader(&self) -> Rc>; } From 10650d5fb0b53ba433dca7ac76f1512963e6eb96 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 17:20:04 +0200 Subject: [PATCH 36/49] More code lexing functionality reactivated #52 --- src/mango/lexing/code_lexer.rs | 199 +++++++++++++++-------------- src/mango/lexing/combi_lexer.rs | 25 ++-- src/mango/lexing/string_lexer.rs | 6 +- src/mango/lexing/typ.rs | 6 + src/mango/util/collection/queue.rs | 1 - src/mango/util/strslice/slice.rs | 2 +- 6 files changed, 121 insertions(+), 118 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index de2d3d09..c92c2026 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -6,7 +6,6 @@ use mango::lexing::typ::MaybeToken; use mango::lexing::typ::SubLexer; use mango::lexing::typ::SubLexerResult; use mango::token::special::UnlexableToken; -use mango::token::Tokens; use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; @@ -16,6 +15,7 @@ use mango::token::tokens::OperatorToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; +use mango::token::Tokens; use mango::util::collection::Queue; use std::cell::RefCell; use std::rc::Rc; @@ -61,6 +61,13 @@ impl CodeLexer { self.indent = line_indent; tokens } + + fn token_and_indents(&mut self, reader: &mut Box, token: Tokens) -> SubLexerResult { + let mut tokens: Vec = vec![token]; + // This is a new line, so there may be indents. + tokens.append(&mut self.lex_indents(reader)); + return SubLexerResult::Result(tokens); + } } impl SubLexer for CodeLexer { @@ -70,115 +77,109 @@ impl SubLexer for CodeLexer { // TODO: put all these match results inline // End of line continuation - let continue_match_res = reader.matches(r"\.\.\."); - if let Match(_) = continue_match_res { + if let Match(_) = reader.matches(r"\.\.\.") { // Line continuation has no token, it just continues on the next line, ignoring indents (for now). - let newline_match_res = reader.matches(r"\n\r?\t*"); - if let Match(_) = newline_match_res { + if let Match(_) = reader.matches(r"\n\r?\t*") { // There should always be a newline after continuations, so that they can be ignored together. } else { // The rest of this line is unparsable. - let newline_match_res = reader.matches("[^\\n]*\\n\\r?"); - if let Match(word) = newline_match_res { - let mut res: Vec = vec![Tokens::Unlexable(UnlexableToken::new(word))]; + if let Match(word) = reader.matches("[^\\n]*\\n\\r?") { // This is a new line, so there may be indents. - res.append(&mut self.lex_indents(reader)); - return Result(res); + return self + .token_and_indents(reader, Tokens::Unlexable(UnlexableToken::new(word))); } else { // TODO: I don't know yet how to deal with '...' followed by end-of-file panic!() } } } + // Newlines + if let Match(_) = reader.matches("\\n\\r?") { + // Newline WITHOUT line continuation. + // This is a new line, so there may be indents. + return self.token_and_indents( + reader, + Tokens::EndStatement(EndStatementToken::new_end_line()), + ); + } + // End of statement + if let Match(_) = reader.matches(";") { + // Semicolon, which ends a statement. + if let Match(_) = reader.matches("\\n\\r?") { + // If semicolon is followed by a newline, it is redundant. Deal with indents (but ignore the newline itself). + return self.token_and_indents( + reader, + Tokens::EndStatement(EndStatementToken::new_semicolon()), + ); + } else { + return SubLexerResult::single(Tokens::EndStatement( + EndStatementToken::new_semicolon(), + )); + } + } + // + // Indentation done; do the rest of lexing. + // + // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. + if let Match(word) = reader.matches(IdentifierToken::subpattern()) { + // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + return SubLexerResult::single(Tokens::Keyword(keyword)); + } + return SubLexerResult::single(Tokens::Identifier( + IdentifierToken::from_str(word).unwrap(), + )); + } + // // Literal + // let string_match_res = reader.matches("[a-z]?\""); + // if let Match(_) = string_match_res { + // let sublexer: Box = + // Box::new(StringLexer::new_double_quoted(self.reader.clone())); + // self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); + // return self.lex(); + // } + // // Association (before operator) + // let association_match_res = self + // .reader + // .borrow_mut() + // .matches(&AssociationToken::subpattern()); + // if let Match(token) = association_match_res { + // if token.chars().last().unwrap() == '=' { + // // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); + // return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO + // } else { + // return Token(Tokens::Association(AssociationToken::from_unprefixed())); + // } + // } + // // Operator + // let operator_match_res = self + // .reader + // .borrow_mut() + // .matches(OperatorToken::subpattern()); + // if let Match(token) = operator_match_res { + // return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); + // } + // // Grouping symbols + // if let Match(_) = reader.matches(r"\(") { + // return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + // } + // if let Match(_) = reader.matches(r"\)") { + // return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); + // } + // + // let unknown_word = reader.matches("[^\\s]+"); + // match unknown_word { + // Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), + // NoMatch() => { + // println!("END {:?}", self.reader.borrow()); // TODO + // panic!("Do not know how to proceed with parsing") + // } + // EOF() => { + // // TODO: also dedent and end statement here + // End + // } + // } - panic!(); -// let newline_match_res = reader.matches("\\n\\r?"); -// if let Match(_) = newline_match_res { -// // Newline WITHOUT line continuation. -// // This is a new line, so there may be indents. -// self.buffer -// .push(Tokens::EndStatement(EndStatementToken::new_end_line())); -// self.lex_indents(); -// return self.lex(); -// } -// let end_statement_match_res = reader.matches(";"); -// if let Match(_) = end_statement_match_res { -// // Semicolon, which ends a statement. -// // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. -// self.buffer -// .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); -// let end_line_match_res = reader.matches("\\n\\r?"); -// if let Match(_) = end_line_match_res { -// // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). -// // This will return the queue of tokens, including the semicolon. -// return self.lex_indents(); -// } -// // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). -// return Token(self.buffer.pop().unwrap()); -// } -// // -// // Indentation done; do the rest of lexing. -// // -// // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. -// if let Match(word) = self -// .reader -// .borrow_mut() -// .matches(IdentifierToken::subpattern()) -// { -// // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... -// if let Ok(keyword) = KeywordToken::from_str(word.clone()) { -// return Token(Tokens::Keyword(keyword)); -// } -// return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); -// } -// // Literal -// let string_match_res = reader.matches("[a-z]?\""); -// if let Match(_) = string_match_res { -// let sublexer: Box = -// Box::new(StringLexer::new_double_quoted(self.reader.clone())); -// self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); -// return self.lex(); -// } -// // Association (before operator) -// let association_match_res = self -// .reader -// .borrow_mut() -// .matches(&AssociationToken::subpattern()); -// if let Match(token) = association_match_res { -// if token.chars().last().unwrap() == '=' { -// // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); -// return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO -// } else { -// return Token(Tokens::Association(AssociationToken::from_unprefixed())); -// } -// } -// // Operator -// let operator_match_res = self -// .reader -// .borrow_mut() -// .matches(OperatorToken::subpattern()); -// if let Match(token) = operator_match_res { -// return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); -// } -// // Grouping symbols -// if let Match(_) = reader.matches(r"\(") { -// return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); -// } -// if let Match(_) = reader.matches(r"\)") { -// return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); -// } -// -// let unknown_word = reader.matches("[^\\s]+"); -// match unknown_word { -// Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), -// NoMatch() => { -// println!("END {:?}", self.reader.borrow()); // TODO -// panic!("Do not know how to proceed with parsing") -// } -// EOF() => { -// // TODO: also dedent and end statement here -// End -// } -// } + panic!() // TODO TMP } } diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs index b0fc9b84..7c353ffb 100644 --- a/src/mango/lexing/combi_lexer.rs +++ b/src/mango/lexing/combi_lexer.rs @@ -3,11 +3,10 @@ use mango::io::typ::ReaderResult::*; use mango::lexing::code_lexer::CodeLexer; use mango::lexing::string_lexer::StringLexer; use mango::lexing::typ::Lexer; -use mango::lexing::typ::SubLexer; use mango::lexing::typ::MaybeToken; +use mango::lexing::typ::SubLexer; use mango::lexing::typ::SubLexerResult; use mango::token::special::UnlexableToken; -use mango::token::Tokens; use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; @@ -17,12 +16,12 @@ use mango::token::tokens::OperatorToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; +use mango::token::Tokens; use mango::util::collection::Queue; use mango::util::collection::Stack; use std::cell::RefCell; use std::rc::Rc; - pub struct CombiLexer { reader: Box, lexers: Stack>, @@ -43,7 +42,6 @@ impl CombiLexer { impl Lexer for CombiLexer { fn lex(&mut self) -> MaybeToken { - // If there are tokens in the buffer, return from there; if let Option::Some(token) = self.buffer.pop() { return MaybeToken::Token(token); @@ -64,29 +62,28 @@ impl Lexer for CombiLexer { // TODO: check reader state self.lex() } - }, + } SubLexerResult::Delegate(lexer) => { // Switch to a different delegate lexer. self.lexers.push(lexer); self.lex() - }, + } SubLexerResult::End => { // The sublexer is done, remove it from the stack and continue with the next. - self.lexers.pop(); // This needs non-lexical lifetimes + self.lexers.pop(); // This needs non-lexical lifetimes self.lex() - }, + } } } } } - } #[cfg(test)] mod tests { + use super::CombiLexer; use mango::io::fortest::StringReader; use mango::lexing::util::lex_all::{lex_all, LexList}; - use mango::token::Tokens; use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; @@ -97,16 +94,16 @@ mod tests { use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; + use mango::token::Tokens; use std::cell::RefCell; use std::rc::Rc; - use super::CombiLexer; fn assert_text_to_tokens(text: &str, tokens: Vec) { assert_eq!( LexList::from_tokens(tokens), - lex_all(&mut CombiLexer::new(Box::new( - StringReader::new(text.to_owned()) - ))) + lex_all(&mut CombiLexer::new(Box::new(StringReader::new( + text.to_owned() + )))) ) } diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs index 18d313d7..4a433a00 100644 --- a/src/mango/lexing/string_lexer.rs +++ b/src/mango/lexing/string_lexer.rs @@ -42,7 +42,7 @@ impl Lexer for StringLexer { } } -// fn get_reader(&self) -> Rc> { -// self.reader.clone() -// } + // fn get_reader(&self) -> Rc> { + // self.reader.clone() + // } } diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs index 0aba1981..6911c479 100644 --- a/src/mango/lexing/typ.rs +++ b/src/mango/lexing/typ.rs @@ -8,6 +8,12 @@ pub enum SubLexerResult { End, } +impl SubLexerResult { + pub fn single(token: Tokens) -> Self { + SubLexerResult::Result(vec![token]) + } +} + // TODO: I don't want this to be public outside the crate pub trait SubLexer { /// Does one iteration of a sublexer, which should either delegate or return tokens. diff --git a/src/mango/util/collection/queue.rs b/src/mango/util/collection/queue.rs index 8eb43ec1..bd239bef 100644 --- a/src/mango/util/collection/queue.rs +++ b/src/mango/util/collection/queue.rs @@ -1,4 +1,3 @@ - /// A one-ended queue. See also [Stack]. /// This is just a wrapper around vec so nobody pushes or pops the wrong end. pub struct Queue { diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs index 95055439..60ab23a8 100644 --- a/src/mango/util/strslice/slice.rs +++ b/src/mango/util/strslice/slice.rs @@ -49,7 +49,7 @@ pub fn charsliceto>(text: S, end: isize) -> String { } pub fn glyphat>(text: S, pos: isize) -> String { - charslice(text, pos, pos+1) + charslice(text, pos, pos + 1) } #[cfg(test)] From 4346a4f63998e5f1f6f324ce12cd2ddce1f45c2c Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 18:48:08 +0200 Subject: [PATCH 37/49] Fairly complex regular expressions for ints and floats #52 --- src/mango/token/tokens/literal.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/mango/token/tokens/literal.rs b/src/mango/token/tokens/literal.rs index 2065dcda..4836de63 100644 --- a/src/mango/token/tokens/literal.rs +++ b/src/mango/token/tokens/literal.rs @@ -26,8 +26,21 @@ impl LiteralToken { LiteralToken::Real(f64eq::new(value)) } + /// This matches integer literals, either just numbers in base 10, or base 2-36 with prefix. + /// The syntax for -37 in base 16 is -16b25 and 2748 is 16bABC. + /// Incorrect values like 4b7 or 0b0 are not handled at the lexing stage. pub fn subpattern_int() -> &'static str { - "[a-z]?\"" + r"(?:\+|-*)(?:[1-9][0-9]*b(?:_?[0-9a-zA-Z])+|[0-9](?:_?[0-9])*)" + } + + /// This matches real literals (base 10), which look like this: + /// sign / int1 / period / int2 / e / sign / int + /// Here int is a series of 0-9 digits separated by at most one underscore. + /// Signs are optional, everything from 'e' is optional, and int1 OR int2 is optional. + pub fn subpattern_real() -> &'static str { + // TODO: do I want to allow numbers to start with a period? + // TODO: for now, only base10 for reals (would 8b11e2 be 9*8^2 or 9*10^2?) + r"(?:\+|-*)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*)(?:e(?:\+|-|)\d(?:_?\d)*)?" } } From 8b4a125b6006094f818d17ac8a4c7d2ace1c88d1 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 19:16:07 +0200 Subject: [PATCH 38/49] Start infrastructure for parsing numbers in strings #52 --- src/mango/util/mod.rs | 2 ++ src/mango/util/parsetxt/int.rs | 32 +++++++++++++++++++++++++++++ src/mango/util/parsetxt/mod.rs | 3 +++ src/mango/util/parsetxt/real.rs | 36 +++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+) create mode 100644 src/mango/util/parsetxt/int.rs create mode 100644 src/mango/util/parsetxt/mod.rs create mode 100644 src/mango/util/parsetxt/real.rs diff --git a/src/mango/util/mod.rs b/src/mango/util/mod.rs index 8d0156e9..a6371011 100644 --- a/src/mango/util/mod.rs +++ b/src/mango/util/mod.rs @@ -15,3 +15,5 @@ pub mod encdec; pub mod errors; pub mod codeparts; + +pub mod parsetxt; diff --git a/src/mango/util/parsetxt/int.rs b/src/mango/util/parsetxt/int.rs new file mode 100644 index 00000000..d2d22b55 --- /dev/null +++ b/src/mango/util/parsetxt/int.rs @@ -0,0 +1,32 @@ +use regex::Regex; + +/// This matches integer literals, either just numbers in base 10, or base 2-36 with prefix. +/// The syntax for -37 in base 16 is -16b25 and 2748 is 16bABC. +/// Incorrect values like 4b7 or 0b0 are not handled at the lexing stage. +pub fn int_pattern() -> &'static str { + r"(?:\+|-*)(?:[1-9][0-9]*b(?:_?[0-9a-zA-Z])+|[0-9](?:_?[0-9])*)" +} + +/// Convert a String that matches [int_pattern] to an i64 integer. Overflow is possible. +pub fn parse_int>(text: S) -> Option { + let text = text.into(); + debug_assert!( + Regex::new(&format!("^{}$", int_pattern())) + .unwrap() + .is_match(&text) + ); + Some(0i64) +} + +// TODO: possibly add a i32 version? +// TODO: Option to deal with overflow? + +#[cfg(test)] +mod tests { + use super::parse_int; + + #[test] + fn test_parse_int() { + assert_eq!(42, parse_int("42").unwrap()) + } +} diff --git a/src/mango/util/parsetxt/mod.rs b/src/mango/util/parsetxt/mod.rs new file mode 100644 index 00000000..04611574 --- /dev/null +++ b/src/mango/util/parsetxt/mod.rs @@ -0,0 +1,3 @@ +pub mod int; + +pub mod real; diff --git a/src/mango/util/parsetxt/real.rs b/src/mango/util/parsetxt/real.rs new file mode 100644 index 00000000..9d858a55 --- /dev/null +++ b/src/mango/util/parsetxt/real.rs @@ -0,0 +1,36 @@ +use regex::Regex; + +/// This matches real literals (base 10), which look like this: +/// sign / int1 / period / int2 / e / sign / int +/// Here int is a series of 0-9 digits separated by at most one underscore. +/// Signs are optional, everything from 'e' is optional, and int1 OR int2 is optional. +pub fn real_pattern() -> &'static str { + // TODO: do I want to allow numbers to start with a period? + // TODO: for now, only base10 for reals (would 8b11e2 be 9*8^2 or 9*10^2?) + // TODO: does not deal with NaN of infinity + r"(?:\+|-*)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*)(?:e(?:\+|-|)\d(?:_?\d)*)?" +} + +/// Convert a String that matches [real_pattern] to an f64 real. Overflow and loss of precision is possible. +pub fn parse_real>(text: S) -> Option { + let text = text.into(); + debug_assert!( + Regex::new(&format!("^{}$", real_pattern())) + .unwrap() + .is_match(&text) + ); + Some(0.0f64) +} + +// TODO: possibly add a i32 version? +// TODO: Option to deal with overflow? + +#[cfg(test)] +mod tests { + use super::parse_real; + + #[test] + fn test_parse_int() { + assert_eq!(42., parse_real("42.").unwrap()) + } +} From 28bc91ab220d0b9bb7600528cd8f8a5f419809c7 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 19:16:44 +0200 Subject: [PATCH 39/49] Some progress on lexing literals #52 --- src/mango/lexing/code_lexer.rs | 24 ++++++++++++++---------- src/mango/lexing/combi_lexer.rs | 14 -------------- src/mango/lexing/string_lexer.rs | 26 ++++++++++++-------------- src/mango/token/tokens/literal.rs | 13 +++++++++++++ 4 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index c92c2026..64cbf815 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -6,6 +6,7 @@ use mango::lexing::typ::MaybeToken; use mango::lexing::typ::SubLexer; use mango::lexing::typ::SubLexerResult; use mango::token::special::UnlexableToken; +use mango::token::tokens::literal::LiteralToken; use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; @@ -17,8 +18,6 @@ use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; use mango::token::Tokens; use mango::util::collection::Queue; -use std::cell::RefCell; -use std::rc::Rc; pub struct CodeLexer { indent: i32, @@ -130,14 +129,19 @@ impl SubLexer for CodeLexer { IdentifierToken::from_str(word).unwrap(), )); } - // // Literal - // let string_match_res = reader.matches("[a-z]?\""); - // if let Match(_) = string_match_res { - // let sublexer: Box = - // Box::new(StringLexer::new_double_quoted(self.reader.clone())); - // self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); - // return self.lex(); - // } + // Literal + if let Match(_) = reader.matches("[a-z]?\"") { + return Delegate(Box::new(StringLexer::new_double_quoted())); + } + if let Match(nr) = reader.matches(LiteralToken::subpattern_int()) { + let value = LiteralToken::parse_int(nr); + return SubLexerResult::single(Tokens::Literal(LiteralToken::Int(value))); + } + if let Match(nr) = reader.matches(LiteralToken::subpattern_real()) { + let value = LiteralToken::parse_real(nr); + return SubLexerResult::single(Tokens::Literal(LiteralToken::Real(value))); + } + // // Association (before operator) // let association_match_res = self // .reader diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs index 7c353ffb..9a3965ec 100644 --- a/src/mango/lexing/combi_lexer.rs +++ b/src/mango/lexing/combi_lexer.rs @@ -1,26 +1,12 @@ use mango::io::typ::Reader; -use mango::io::typ::ReaderResult::*; use mango::lexing::code_lexer::CodeLexer; -use mango::lexing::string_lexer::StringLexer; use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; use mango::lexing::typ::SubLexer; use mango::lexing::typ::SubLexerResult; -use mango::token::special::UnlexableToken; -use mango::token::tokens::AssociationToken; -use mango::token::tokens::EndBlockToken; -use mango::token::tokens::EndStatementToken; -use mango::token::tokens::IdentifierToken; -use mango::token::tokens::KeywordToken; -use mango::token::tokens::OperatorToken; -use mango::token::tokens::ParenthesisCloseToken; -use mango::token::tokens::ParenthesisOpenToken; -use mango::token::tokens::StartBlockToken; use mango::token::Tokens; use mango::util::collection::Queue; use mango::util::collection::Stack; -use std::cell::RefCell; -use std::rc::Rc; pub struct CombiLexer { reader: Box, diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs index 4a433a00..ab303c7b 100644 --- a/src/mango/lexing/string_lexer.rs +++ b/src/mango/lexing/string_lexer.rs @@ -2,10 +2,10 @@ use mango::io::typ::Reader; use mango::io::typ::ReaderResult::*; use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; +use mango::lexing::typ::SubLexer; +use mango::lexing::typ::SubLexerResult; use mango::token::tokens::LiteralToken; use mango::token::Tokens; -use std::cell::RefCell; -use std::rc::Rc; pub enum StringType { SingleQuotedInline, @@ -16,33 +16,31 @@ pub enum StringType { /// Lexes a string literal token. // Starts after the opening quote and expected to consume until closing quote. pub struct StringLexer { - reader: Rc>, typ: StringType, } impl StringLexer { // TODO: support other types of strings - pub fn new_double_quoted(reader: Rc>) -> Self { + pub fn new_double_quoted() -> Self { StringLexer { - reader, typ: StringType::DoubleQuotedInline, } } } -impl Lexer for StringLexer { - fn lex(&mut self) -> MaybeToken { +impl SubLexer for StringLexer { + fn lex_pass(&mut self, reader: &mut Box) -> SubLexerResult { // TODO: perhaps there's a library that does parsing a string with escape characters // TODO: doesn't handle escaping etc at all now // TODO: this is going to have a problem if `matches` automatically eats whitespace - match self.reader.borrow_mut().matches("[^\"\\n]*") { - Match(value) => return MaybeToken::Token(Tokens::Literal(LiteralToken::string(value))), + match reader.matches("[^\"\\n]*") { + Match(value) => { + return SubLexerResult::single(Tokens::Literal(LiteralToken::string(value))) + } NoMatch() => panic!("failed to parse string"), // This can't really go wrong since empty pattern matches - EOF() => return MaybeToken::Token(Tokens::Literal(LiteralToken::string("".to_owned()))), // Unclosed string literal, let code parser deal with it + EOF() => { + return SubLexerResult::single(Tokens::Literal(LiteralToken::string("".to_owned()))) + } // Unclosed string literal, let code parser deal with it } } - - // fn get_reader(&self) -> Rc> { - // self.reader.clone() - // } } diff --git a/src/mango/token/tokens/literal.rs b/src/mango/token/tokens/literal.rs index 4836de63..814f9d9c 100644 --- a/src/mango/token/tokens/literal.rs +++ b/src/mango/token/tokens/literal.rs @@ -1,6 +1,8 @@ use mango::token::Token; use mango::util::encdec::ToText; use mango::util::numtype::f64eq; +use mango::util::parsetxt::int::parse_int; +use mango::util::parsetxt::real::parse_real; // LATER: it is likely that this will be refactored when the type system is in place. @@ -40,8 +42,19 @@ impl LiteralToken { pub fn subpattern_real() -> &'static str { // TODO: do I want to allow numbers to start with a period? // TODO: for now, only base10 for reals (would 8b11e2 be 9*8^2 or 9*10^2?) + // TODO: does not deal with NaN of infinity r"(?:\+|-*)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*)(?:e(?:\+|-|)\d(?:_?\d)*)?" } + + /// Parse a string matching [subpattern_int] to an i64 integer. Overflow is possible. + pub fn parse_int(text: String) -> i64 { + parse_int(text).unwrap() + } + + /// Parse a string matching [subpattern_real] to a f64 real. Loss of precision or overflow are possible. + pub fn parse_real(text: String) -> f64eq { + f64eq::new(parse_real(text).unwrap()) + } } impl ToText for LiteralToken { From fd1241213e275f6235e071bc85856f68da99230b Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 20:16:28 +0200 Subject: [PATCH 40/49] Parsing of base10 integers implemented #52 --- src/mango/util/parsetxt/int.rs | 62 ++++++++++++++++++++++++----- src/mango/util/parsetxt/real.rs | 16 ++++++-- src/mango/util/strslice/char_ops.rs | 6 +++ src/mango/util/strslice/mod.rs | 2 + 4 files changed, 73 insertions(+), 13 deletions(-) create mode 100644 src/mango/util/strslice/char_ops.rs diff --git a/src/mango/util/parsetxt/int.rs b/src/mango/util/parsetxt/int.rs index d2d22b55..0408acf0 100644 --- a/src/mango/util/parsetxt/int.rs +++ b/src/mango/util/parsetxt/int.rs @@ -1,21 +1,63 @@ +use mango::util::strslice::char_ops::char_drop; use regex::Regex; +#[derive(Debug)] +pub enum IntParseFailReason { + Invalid, + Overflow, + Underflow, +} + /// This matches integer literals, either just numbers in base 10, or base 2-36 with prefix. /// The syntax for -37 in base 16 is -16b25 and 2748 is 16bABC. /// Incorrect values like 4b7 or 0b0 are not handled at the lexing stage. pub fn int_pattern() -> &'static str { - r"(?:\+|-*)(?:[1-9][0-9]*b(?:_?[0-9a-zA-Z])+|[0-9](?:_?[0-9])*)" + r"(?P\+|-?)(?:(?P[1-9][0-9]*)b(?P(?:_?[0-9a-zA-Z])+)|(?P[0-9](?:_?[0-9])*))" } /// Convert a String that matches [int_pattern] to an i64 integer. Overflow is possible. -pub fn parse_int>(text: S) -> Option { +pub fn parse_int>(text: S) -> Result { let text = text.into(); - debug_assert!( - Regex::new(&format!("^{}$", int_pattern())) - .unwrap() - .is_match(&text) - ); - Some(0i64) + match Regex::new(&format!("^{}$", int_pattern())) + .unwrap() + .captures(&text) + { + None => return Err(IntParseFailReason::Invalid), + Some(captures) => { + // Sign + let sign_str = captures.name("sign").unwrap().as_str(); + let sign = if sign_str == "+" || sign_str == "" { + 1 // positive + } else { + -1 // negative + }; + // Check if base10 or special + match captures.name("b10_val") { + None => { + // There is a base provided. + if let Some(base) = captures.name("base") { + if let Some(value) = captures.name("reb_val") { + // TODO: implement + panic!(format!( + "Do not yet know how to deal with {} in base {}", + char_drop(value.as_str(), &'_'), + base.as_str() + )) + } else { + panic!("Expected 'reb_val' match in regex") + } + } else { + panic!("Expected 'base' match in regex") + } + } + Some(value) => { + // This is a 'normal' (base10) value. + // TODO: check for over/underflow + return Ok(char_drop(value.as_str(), &'_').parse::().unwrap()); + } + } + } + } } // TODO: possibly add a i32 version? @@ -27,6 +69,8 @@ mod tests { #[test] fn test_parse_int() { - assert_eq!(42, parse_int("42").unwrap()) + assert_eq!(42, parse_int("42").unwrap()); + assert_eq!(42, parse_int("4_2").unwrap()); + // assert_eq!(42, parse_int("10b4_2").unwrap()); } } diff --git a/src/mango/util/parsetxt/real.rs b/src/mango/util/parsetxt/real.rs index 9d858a55..1ba756c6 100644 --- a/src/mango/util/parsetxt/real.rs +++ b/src/mango/util/parsetxt/real.rs @@ -1,5 +1,13 @@ use regex::Regex; +#[derive(Debug)] +pub enum RealParseFailReason { + Invalid, + Overflow, + Underflow, + PrecisionLoss(f64), +} + /// This matches real literals (base 10), which look like this: /// sign / int1 / period / int2 / e / sign / int /// Here int is a series of 0-9 digits separated by at most one underscore. @@ -8,18 +16,18 @@ pub fn real_pattern() -> &'static str { // TODO: do I want to allow numbers to start with a period? // TODO: for now, only base10 for reals (would 8b11e2 be 9*8^2 or 9*10^2?) // TODO: does not deal with NaN of infinity - r"(?:\+|-*)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*)(?:e(?:\+|-|)\d(?:_?\d)*)?" + r"(?P\+|-?)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*)(?:e(?P\+|-?)\d(?:_?\d)*)?" } /// Convert a String that matches [real_pattern] to an f64 real. Overflow and loss of precision is possible. -pub fn parse_real>(text: S) -> Option { +pub fn parse_real>(text: S) -> Result { let text = text.into(); debug_assert!( Regex::new(&format!("^{}$", real_pattern())) .unwrap() .is_match(&text) ); - Some(0.0f64) + Ok(0.0f64) } // TODO: possibly add a i32 version? @@ -30,7 +38,7 @@ mod tests { use super::parse_real; #[test] - fn test_parse_int() { + fn test_parse_real() { assert_eq!(42., parse_real("42.").unwrap()) } } diff --git a/src/mango/util/strslice/char_ops.rs b/src/mango/util/strslice/char_ops.rs new file mode 100644 index 00000000..a2e351f4 --- /dev/null +++ b/src/mango/util/strslice/char_ops.rs @@ -0,0 +1,6 @@ +/// Remove all matching characters from the string. +// Signature may be changed to support a set of characters, if the need arises. +pub fn char_drop>(text: S, strip: &char) -> String { + let text = text.into(); + text.chars().filter(|chr| chr != strip).collect() +} diff --git a/src/mango/util/strslice/mod.rs b/src/mango/util/strslice/mod.rs index 5846d94a..08a0519c 100644 --- a/src/mango/util/strslice/mod.rs +++ b/src/mango/util/strslice/mod.rs @@ -2,3 +2,5 @@ pub mod slice; pub use self::slice::charslice; pub use self::slice::charslicefrom; pub use self::slice::charsliceto; + +pub mod char_ops; From b61d6f2d61f3f5281303d105b3f69e9af569a506 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 20:53:45 +0200 Subject: [PATCH 41/49] Parsing floats implemented #52 --- src/mango/util/parsetxt/int.rs | 38 ++++++++++----- src/mango/util/parsetxt/real.rs | 86 ++++++++++++++++++++++++++++----- 2 files changed, 101 insertions(+), 23 deletions(-) diff --git a/src/mango/util/parsetxt/int.rs b/src/mango/util/parsetxt/int.rs index 0408acf0..744aaca8 100644 --- a/src/mango/util/parsetxt/int.rs +++ b/src/mango/util/parsetxt/int.rs @@ -12,7 +12,7 @@ pub enum IntParseFailReason { /// The syntax for -37 in base 16 is -16b25 and 2748 is 16bABC. /// Incorrect values like 4b7 or 0b0 are not handled at the lexing stage. pub fn int_pattern() -> &'static str { - r"(?P\+|-?)(?:(?P[1-9][0-9]*)b(?P(?:_?[0-9a-zA-Z])+)|(?P[0-9](?:_?[0-9])*))" + r"(?:(?P(?:\+|-?)[1-9][0-9]*)b(?P(?:_?[0-9a-zA-Z])+)|(?P(?:\+|-?)[0-9](?:_?[0-9])*))" } /// Convert a String that matches [int_pattern] to an i64 integer. Overflow is possible. @@ -24,13 +24,13 @@ pub fn parse_int>(text: S) -> Result { { None => return Err(IntParseFailReason::Invalid), Some(captures) => { - // Sign - let sign_str = captures.name("sign").unwrap().as_str(); - let sign = if sign_str == "+" || sign_str == "" { - 1 // positive - } else { - -1 // negative - }; + // // Sign + // let sign_str = captures.name("sign").unwrap().as_str(); + // let sign = if sign_str == "+" || sign_str == "" { + // 1 // positive + // } else { + // -1 // negative + // }; // Check if base10 or special match captures.name("b10_val") { None => { @@ -61,16 +61,32 @@ pub fn parse_int>(text: S) -> Result { } // TODO: possibly add a i32 version? -// TODO: Option to deal with overflow? #[cfg(test)] mod tests { use super::parse_int; #[test] - fn test_parse_int() { + fn test_parse_b10_int() { assert_eq!(42, parse_int("42").unwrap()); assert_eq!(42, parse_int("4_2").unwrap()); - // assert_eq!(42, parse_int("10b4_2").unwrap()); + assert_eq!(123456789, parse_int("+1_2_3_4_5_6_7_8_9").unwrap()); + assert_eq!(-123456789, parse_int("-123456789").unwrap()); + assert_eq!(0, parse_int("-0").unwrap()); + assert_eq!(-1, parse_int("-1").unwrap()); + // Weird bases with 0 prefix are not supported. + assert_eq!(9, parse_int("09").unwrap()); + } + + fn test_invalid_b10_ints() { + assert!(parse_int("0x9").is_err()); + assert!(parse_int("A").is_err()); + assert!(parse_int("_0").is_err()); + assert!(parse_int("0_").is_err()); + // TODO: over/underflow + } + + fn test_parse_based_ints() { + // TODO: not implemented yet } } diff --git a/src/mango/util/parsetxt/real.rs b/src/mango/util/parsetxt/real.rs index 1ba756c6..a5de9948 100644 --- a/src/mango/util/parsetxt/real.rs +++ b/src/mango/util/parsetxt/real.rs @@ -1,3 +1,4 @@ +use mango::util::strslice::char_ops::char_drop; use regex::Regex; #[derive(Debug)] @@ -13,32 +14,93 @@ pub enum RealParseFailReason { /// Here int is a series of 0-9 digits separated by at most one underscore. /// Signs are optional, everything from 'e' is optional, and int1 OR int2 is optional. pub fn real_pattern() -> &'static str { - // TODO: do I want to allow numbers to start with a period? + // TODO: do I really want to allow numbers to start with a period? // TODO: for now, only base10 for reals (would 8b11e2 be 9*8^2 or 9*10^2?) // TODO: does not deal with NaN of infinity - r"(?P\+|-?)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*)(?:e(?P\+|-?)\d(?:_?\d)*)?" + r"(?P(?:\+|-?)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*))(?:e(?P(?:\+|-?)\d(?:_?\d)*))?" } /// Convert a String that matches [real_pattern] to an f64 real. Overflow and loss of precision is possible. pub fn parse_real>(text: S) -> Result { let text = text.into(); - debug_assert!( - Regex::new(&format!("^{}$", real_pattern())) - .unwrap() - .is_match(&text) - ); - Ok(0.0f64) + match Regex::new(&format!("^{}$", real_pattern())) + .unwrap() + .captures(&text) + { + None => return Err(RealParseFailReason::Invalid), + Some(captures) => { + let multiplier = char_drop(captures.name("multiplier").unwrap().as_str(), &'_') + .parse::() + .unwrap(); + match captures.name("exponent") { + None => { + // This is a 'normal' real, no exponential notation + return Ok(multiplier); + } + Some(exponent_match) => { + // This real is in exponential notation + let exponent = char_drop(exponent_match.as_str(), &'_') + .parse::() + .unwrap(); + // TODO: is there a numerically smarter way to do this? + return Ok(10f64.powf(exponent) * multiplier); + } + } + } + } } -// TODO: possibly add a i32 version? -// TODO: Option to deal with overflow? +// TODO: possibly add a f32 version? #[cfg(test)] mod tests { use super::parse_real; + fn close(x: f64, y: f64) -> bool { + (x - y).abs() < 1e-8 + } + + #[test] + fn test_parse_nonexp_real() { + assert!(close(42., parse_real("42.0").unwrap())); + assert!(close(-0.1, parse_real("-.1").unwrap())); + assert!(close(-1., parse_real("-1.").unwrap())); + assert!(close(12345.6789, parse_real("1_2_3_4_5.6_7_8_9").unwrap())); + } + #[test] - fn test_parse_real() { - assert_eq!(42., parse_real("42.").unwrap()) + fn test_parse_exp_real() { + assert!(close(42., parse_real("42.0e0").unwrap())); + assert!(close(-0.1, parse_real("-.1e0").unwrap())); + assert!(close(-1., parse_real("-1.e0").unwrap())); + assert!(close(42., parse_real("42.0e+0").unwrap())); + assert!(close( + 12345.6789, + parse_real("1_2_3_4_5.6_7_8_9e0").unwrap() + )); + assert!(close(0.42, parse_real("42.0e-2").unwrap())); + assert!(close(-0.001, parse_real("-.1e-2").unwrap())); + assert!(close(-0.01, parse_real("-1.e-2").unwrap())); + assert!(close( + 123.456789, + parse_real("1_2_3_4_5.6_7_8_9e-2").unwrap() + )); + assert!(close(42.0, parse_real("42.0e-0_0_0").unwrap())); } + + #[test] + fn test_invalid_real() { + assert!(parse_real("+_42.0").is_err()); + assert!(parse_real("-_42.0").is_err()); + assert!(parse_real("_42.0").is_err()); + assert!(parse_real("42_.0").is_err()); + assert!(parse_real("42._0").is_err()); + assert!(parse_real("42.0_").is_err()); + assert!(parse_real("42.0e_0").is_err()); + assert!(parse_real("42.0e0_").is_err()); + assert!(parse_real("42.0e0b0").is_err()); + } + + // TODO: over/underflow + // TODO: loss of precision } From 946f417fb23ecc9390e722ae1af55aac48d9e264 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 21:52:29 +0200 Subject: [PATCH 42/49] Expand lexing and improvements to string utils #52 --- src/mango/io/util.rs | 3 ++- src/mango/lexing/code_lexer.rs | 23 +++++++++------------ src/mango/lexing/string_lexer.rs | 2 -- src/mango/token/tokens/literal.rs | 2 +- src/mango/util/parsetxt/int.rs | 6 +++--- src/mango/util/parsetxt/real.rs | 6 +++--- src/mango/util/strslice/char_ops.rs | 32 ++++++++++++++++++++++++----- 7 files changed, 46 insertions(+), 28 deletions(-) diff --git a/src/mango/io/util.rs b/src/mango/io/util.rs index 9d8710ef..b096b333 100644 --- a/src/mango/io/util.rs +++ b/src/mango/io/util.rs @@ -18,7 +18,8 @@ impl RegexCache { if !self.cache.contains_key(subpattern) { match Regex::new(&format!("^ *{}", subpattern)) { Err(err) => panic!(format!( - "Invalid regular expression while adding to library; this is a bug:\n{:?}", + "Invalid regular expression '{}' while adding to library; this is a bug:\n{:?}", + subpattern, err )), Ok(regex) => { diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 64cbf815..58c149ab 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -142,19 +142,16 @@ impl SubLexer for CodeLexer { return SubLexerResult::single(Tokens::Literal(LiteralToken::Real(value))); } - // // Association (before operator) - // let association_match_res = self - // .reader - // .borrow_mut() - // .matches(&AssociationToken::subpattern()); - // if let Match(token) = association_match_res { - // if token.chars().last().unwrap() == '=' { - // // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); - // return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO - // } else { - // return Token(Tokens::Association(AssociationToken::from_unprefixed())); - // } - // } + // Association (before operator) + if let Match(token) = reader.matches(&AssociationToken::subpattern()) { + debug_assert!(token.chars().last().unwrap() == '='); + if token.chars().count() > 1 { + panic!(); // TODO + return SubLexerResult::single((Tokens::Association(AssociationToken::from_unprefixed()))); + } else { + return SubLexerResult::single((Tokens::Association(AssociationToken::from_unprefixed()))); + } + } // // Operator // let operator_match_res = self // .reader diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs index ab303c7b..dbc68034 100644 --- a/src/mango/lexing/string_lexer.rs +++ b/src/mango/lexing/string_lexer.rs @@ -1,7 +1,5 @@ use mango::io::typ::Reader; use mango::io::typ::ReaderResult::*; -use mango::lexing::typ::Lexer; -use mango::lexing::typ::MaybeToken; use mango::lexing::typ::SubLexer; use mango::lexing::typ::SubLexerResult; use mango::token::tokens::LiteralToken; diff --git a/src/mango/token/tokens/literal.rs b/src/mango/token/tokens/literal.rs index 814f9d9c..ba1259b3 100644 --- a/src/mango/token/tokens/literal.rs +++ b/src/mango/token/tokens/literal.rs @@ -43,7 +43,7 @@ impl LiteralToken { // TODO: do I want to allow numbers to start with a period? // TODO: for now, only base10 for reals (would 8b11e2 be 9*8^2 or 9*10^2?) // TODO: does not deal with NaN of infinity - r"(?:\+|-*)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*)(?:e(?:\+|-|)\d(?:_?\d)*)?" + r"(?:\+|-*)(?:\d(?:_?\d)*\.\d(?:_?\d)*|\d(?:_?\d)*\.|\.\d(?:_?\d)*)(?:e(?:\+|-?)\d(?:_?\d)*)?" } /// Parse a string matching [subpattern_int] to an i64 integer. Overflow is possible. diff --git a/src/mango/util/parsetxt/int.rs b/src/mango/util/parsetxt/int.rs index 744aaca8..38bdb248 100644 --- a/src/mango/util/parsetxt/int.rs +++ b/src/mango/util/parsetxt/int.rs @@ -1,4 +1,4 @@ -use mango::util::strslice::char_ops::char_drop; +use mango::util::strslice::char_ops::CharOps; use regex::Regex; #[derive(Debug)] @@ -40,7 +40,7 @@ pub fn parse_int>(text: S) -> Result { // TODO: implement panic!(format!( "Do not yet know how to deal with {} in base {}", - char_drop(value.as_str(), &'_'), + value.as_str().without_char(&'_'), base.as_str() )) } else { @@ -53,7 +53,7 @@ pub fn parse_int>(text: S) -> Result { Some(value) => { // This is a 'normal' (base10) value. // TODO: check for over/underflow - return Ok(char_drop(value.as_str(), &'_').parse::().unwrap()); + return Ok(value.as_str().without_char(&'_').parse::().unwrap()); } } } diff --git a/src/mango/util/parsetxt/real.rs b/src/mango/util/parsetxt/real.rs index a5de9948..50c8fc8d 100644 --- a/src/mango/util/parsetxt/real.rs +++ b/src/mango/util/parsetxt/real.rs @@ -1,4 +1,4 @@ -use mango::util::strslice::char_ops::char_drop; +use mango::util::strslice::char_ops::CharOps; use regex::Regex; #[derive(Debug)] @@ -29,7 +29,7 @@ pub fn parse_real>(text: S) -> Result { None => return Err(RealParseFailReason::Invalid), Some(captures) => { - let multiplier = char_drop(captures.name("multiplier").unwrap().as_str(), &'_') + let multiplier = captures.name("multiplier").unwrap().as_str().without_char(&'_') .parse::() .unwrap(); match captures.name("exponent") { @@ -39,7 +39,7 @@ pub fn parse_real>(text: S) -> Result } Some(exponent_match) => { // This real is in exponential notation - let exponent = char_drop(exponent_match.as_str(), &'_') + let exponent = exponent_match.as_str().without_char(&'_') .parse::() .unwrap(); // TODO: is there a numerically smarter way to do this? diff --git a/src/mango/util/strslice/char_ops.rs b/src/mango/util/strslice/char_ops.rs index a2e351f4..73026377 100644 --- a/src/mango/util/strslice/char_ops.rs +++ b/src/mango/util/strslice/char_ops.rs @@ -1,6 +1,28 @@ -/// Remove all matching characters from the string. -// Signature may be changed to support a set of characters, if the need arises. -pub fn char_drop>(text: S, strip: &char) -> String { - let text = text.into(); - text.chars().filter(|chr| chr != strip).collect() + +pub trait CharOps { + /// Remove all matching characters from the string. + // Signature may be changed to support a set of characters, if the need arises. + fn without_char(&self, strip: &char) -> String; + + fn char_len(&self) -> usize; +} + +impl<'a> CharOps for &'a str { + fn without_char(&self, strip: &char) -> String { + self.chars().filter(|chr| chr != strip).collect() + } + + fn char_len(&self) -> usize { + self.chars().count() + } +} + +impl CharOps for String { + fn without_char(&self, strip: &char) -> String { + (&self).without_char(strip) + } + + fn char_len(&self) -> usize { + (&self).char_len() + } } From ba3acda81238f9ab0212962a590eb83b0ad1d2d7 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 22:05:35 +0200 Subject: [PATCH 43/49] Lexer has an infinite loop somewhere #52 --- src/mango/io/util.rs | 3 +- src/mango/lexing/code_lexer.rs | 67 ++++++++++++++++------------- src/mango/util/parsetxt/real.rs | 10 ++++- src/mango/util/strslice/char_ops.rs | 3 +- 4 files changed, 47 insertions(+), 36 deletions(-) diff --git a/src/mango/io/util.rs b/src/mango/io/util.rs index b096b333..a8cbb96d 100644 --- a/src/mango/io/util.rs +++ b/src/mango/io/util.rs @@ -19,8 +19,7 @@ impl RegexCache { match Regex::new(&format!("^ *{}", subpattern)) { Err(err) => panic!(format!( "Invalid regular expression '{}' while adding to library; this is a bug:\n{:?}", - subpattern, - err + subpattern, err )), Ok(regex) => { self.cache.insert(subpattern.to_owned(), regex); diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 58c149ab..05fd61ea 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -147,40 +147,45 @@ impl SubLexer for CodeLexer { debug_assert!(token.chars().last().unwrap() == '='); if token.chars().count() > 1 { panic!(); // TODO - return SubLexerResult::single((Tokens::Association(AssociationToken::from_unprefixed()))); + return SubLexerResult::single( + (Tokens::Association(AssociationToken::from_unprefixed())), + ); } else { - return SubLexerResult::single((Tokens::Association(AssociationToken::from_unprefixed()))); + return SubLexerResult::single( + (Tokens::Association(AssociationToken::from_unprefixed())), + ); } } - // // Operator - // let operator_match_res = self - // .reader - // .borrow_mut() - // .matches(OperatorToken::subpattern()); - // if let Match(token) = operator_match_res { - // return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); - // } - // // Grouping symbols - // if let Match(_) = reader.matches(r"\(") { - // return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); - // } - // if let Match(_) = reader.matches(r"\)") { - // return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); - // } - // - // let unknown_word = reader.matches("[^\\s]+"); - // match unknown_word { - // Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), - // NoMatch() => { - // println!("END {:?}", self.reader.borrow()); // TODO - // panic!("Do not know how to proceed with parsing") - // } - // EOF() => { - // // TODO: also dedent and end statement here - // End - // } - // } + // Operator + if let Match(token) = reader.matches(OperatorToken::subpattern()) { + return SubLexerResult::single(Tokens::Operator( + OperatorToken::from_str(&token).unwrap(), + )); + } + // Grouping symbols + if let Match(_) = reader.matches(r"\(") { + return SubLexerResult::single(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + } + if let Match(_) = reader.matches(r"\)") { + return SubLexerResult::single(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); + } - panic!() // TODO TMP + // If the code gets here, it did not recognize the text as any token + return match reader.matches(r"[^\s]+") { + Match(word) => SubLexerResult::single(Tokens::Unlexable(UnlexableToken::new(word))), + NoMatch() => { + println!("END {:?}", reader); // todo: tmp + panic!("Do not know how to proceed with parsing") + } + EOF() => { + // TODO: also dedent and end statement here + let mut tokens = vec![Tokens::EndStatement(EndStatementToken::new_end_line())]; + for _ in 0..self.indent { + // This line is dedented, make end tokens. + tokens.push(Tokens::EndBlock(EndBlockToken::new(true, false))); + } + SubLexerResult::Result(tokens) + } + }; } } diff --git a/src/mango/util/parsetxt/real.rs b/src/mango/util/parsetxt/real.rs index 50c8fc8d..b99925ff 100644 --- a/src/mango/util/parsetxt/real.rs +++ b/src/mango/util/parsetxt/real.rs @@ -29,7 +29,11 @@ pub fn parse_real>(text: S) -> Result { None => return Err(RealParseFailReason::Invalid), Some(captures) => { - let multiplier = captures.name("multiplier").unwrap().as_str().without_char(&'_') + let multiplier = captures + .name("multiplier") + .unwrap() + .as_str() + .without_char(&'_') .parse::() .unwrap(); match captures.name("exponent") { @@ -39,7 +43,9 @@ pub fn parse_real>(text: S) -> Result } Some(exponent_match) => { // This real is in exponential notation - let exponent = exponent_match.as_str().without_char(&'_') + let exponent = exponent_match + .as_str() + .without_char(&'_') .parse::() .unwrap(); // TODO: is there a numerically smarter way to do this? diff --git a/src/mango/util/strslice/char_ops.rs b/src/mango/util/strslice/char_ops.rs index 73026377..054ba7f3 100644 --- a/src/mango/util/strslice/char_ops.rs +++ b/src/mango/util/strslice/char_ops.rs @@ -1,4 +1,3 @@ - pub trait CharOps { /// Remove all matching characters from the string. // Signature may be changed to support a set of characters, if the need arises. @@ -19,10 +18,12 @@ impl<'a> CharOps for &'a str { impl CharOps for String { fn without_char(&self, strip: &char) -> String { + println!("String.without_char"); (&self).without_char(strip) } fn char_len(&self) -> usize { + println!("String.char_len"); (&self).char_len() } } From 0195c7af712efb1098d5fa25679f80ac8171f255 Mon Sep 17 00:00:00 2001 From: Mark Date: Mon, 18 Jun 2018 08:28:45 +0200 Subject: [PATCH 44/49] Fix the infinite loop, lexing back to prev state #52 --- src/mango/lexing/code_lexer.rs | 6 +++++- src/mango/token/collect/all.rs | 2 +- src/mango/util/strslice/char_ops.rs | 6 ++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 05fd61ea..59134bda 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -178,12 +178,16 @@ impl SubLexer for CodeLexer { panic!("Do not know how to proceed with parsing") } EOF() => { - // TODO: also dedent and end statement here + if self.indent <= 0 { + return SubLexerResult::End; + } + // TODO: currently the EndStatement is only made if the file stops on an indented line let mut tokens = vec![Tokens::EndStatement(EndStatementToken::new_end_line())]; for _ in 0..self.indent { // This line is dedented, make end tokens. tokens.push(Tokens::EndBlock(EndBlockToken::new(true, false))); } + self.indent = 0; SubLexerResult::Result(tokens) } }; diff --git a/src/mango/token/collect/all.rs b/src/mango/token/collect/all.rs index 0dd44600..618cc741 100644 --- a/src/mango/token/collect/all.rs +++ b/src/mango/token/collect/all.rs @@ -55,6 +55,6 @@ mod tests { #[test] fn test_tokens_size() { - assert!(size_of::() < 32); + assert!(size_of::() < 32, size_of::()); } } diff --git a/src/mango/util/strslice/char_ops.rs b/src/mango/util/strslice/char_ops.rs index 054ba7f3..894994aa 100644 --- a/src/mango/util/strslice/char_ops.rs +++ b/src/mango/util/strslice/char_ops.rs @@ -18,12 +18,10 @@ impl<'a> CharOps for &'a str { impl CharOps for String { fn without_char(&self, strip: &char) -> String { - println!("String.without_char"); - (&self).without_char(strip) + self.chars().filter(|chr| chr != strip).collect() } fn char_len(&self) -> usize { - println!("String.char_len"); - (&self).char_len() + self.chars().count() } } From 089c17e1da7f88d1583516cde956f857499a62f8 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 19 Jun 2018 07:57:10 +0200 Subject: [PATCH 45/49] Resolved technical errors, only lexing problems remain #52 --- src/mango/lexing/code_lexer.rs | 19 ++++++++++--------- src/mango/lexing/combi_lexer.rs | 17 ++++++++++++----- src/mango/lexing/util/lex_all.rs | 11 +++++++++++ src/mango/token/collect/all.rs | 2 +- src/mango/token/tokens/association.rs | 2 +- src/mango/util/codeparts/operator.rs | 2 +- 6 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 59134bda..64a90955 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -18,6 +18,8 @@ use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; use mango::token::Tokens; use mango::util::collection::Queue; +use mango::util::strslice::char_ops::CharOps; +use mango::util::strslice::charsliceto; pub struct CodeLexer { indent: i32, @@ -145,11 +147,13 @@ impl SubLexer for CodeLexer { // Association (before operator) if let Match(token) = reader.matches(&AssociationToken::subpattern()) { debug_assert!(token.chars().last().unwrap() == '='); - if token.chars().count() > 1 { - panic!(); // TODO - return SubLexerResult::single( - (Tokens::Association(AssociationToken::from_unprefixed())), - ); + if token.char_len() > 1 { + match AssociationToken::from_str(charsliceto(token, -1)) { + Ok(association) => { + return SubLexerResult::single((Tokens::Association(association))) + } + Err(msg) => panic!(format!("Invalid association prefix: {}", msg)), + } } else { return SubLexerResult::single( (Tokens::Association(AssociationToken::from_unprefixed())), @@ -173,10 +177,7 @@ impl SubLexer for CodeLexer { // If the code gets here, it did not recognize the text as any token return match reader.matches(r"[^\s]+") { Match(word) => SubLexerResult::single(Tokens::Unlexable(UnlexableToken::new(word))), - NoMatch() => { - println!("END {:?}", reader); // todo: tmp - panic!("Do not know how to proceed with parsing") - } + NoMatch() => panic!("Do not know how to proceed with parsing"), EOF() => { if self.indent <= 0 { return SubLexerResult::End; diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs index 9a3965ec..a31a02db 100644 --- a/src/mango/lexing/combi_lexer.rs +++ b/src/mango/lexing/combi_lexer.rs @@ -81,16 +81,22 @@ mod tests { use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; use mango::token::Tokens; + use mango::util::encdec::to_text::ToText; use std::cell::RefCell; use std::rc::Rc; fn assert_text_to_tokens(text: &str, tokens: Vec) { + let expected = LexList::from_tokens(tokens); + let actual = lex_all(&mut CombiLexer::new(Box::new(StringReader::new( + text.to_owned(), + )))); assert_eq!( - LexList::from_tokens(tokens), - lex_all(&mut CombiLexer::new(Box::new(StringReader::new( - text.to_owned() - )))) - ) + expected, + actual, + "expected: {}\nactual: {}", + expected.to_text(), + actual.to_text(), + ); } #[test] @@ -115,6 +121,7 @@ mod tests { Tokens::Literal(LiteralToken::Int(0)), Tokens::EndStatement(EndStatementToken::new_end_line()), Tokens::Keyword(KeywordToken::from_str("for".to_owned()).unwrap()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), Tokens::Operator(OperatorToken::from_str("<").unwrap()), Tokens::Literal(LiteralToken::Int(128)), Tokens::EndStatement(EndStatementToken::new_end_line()), diff --git a/src/mango/lexing/util/lex_all.rs b/src/mango/lexing/util/lex_all.rs index 82ee0c1d..e10557a8 100644 --- a/src/mango/lexing/util/lex_all.rs +++ b/src/mango/lexing/util/lex_all.rs @@ -1,6 +1,7 @@ use mango::lexing::typ::Lexer; use mango::lexing::typ::MaybeToken; use mango::token::Tokens; +use mango::util::encdec::ToText; /// Represents all the lex tokens in a source. #[derive(PartialEq, Eq, Debug)] @@ -18,6 +19,16 @@ impl LexList { } } +impl ToText for LexList { + fn to_text(&self) -> String { + self.tokens + .iter() + .map(|token| token.to_text()) + .collect::>() + .join(" ") + } +} + pub fn lex_all(lexer: &mut Lexer) -> LexList { let mut list = Vec::with_capacity(512); while let MaybeToken::Token(token) = lexer.lex() { diff --git a/src/mango/token/collect/all.rs b/src/mango/token/collect/all.rs index 618cc741..70576f40 100644 --- a/src/mango/token/collect/all.rs +++ b/src/mango/token/collect/all.rs @@ -55,6 +55,6 @@ mod tests { #[test] fn test_tokens_size() { - assert!(size_of::() < 32, size_of::()); + assert!(size_of::() <= 40, size_of::()); } } diff --git a/src/mango/token/tokens/association.rs b/src/mango/token/tokens/association.rs index 1c857ee2..a5be9028 100644 --- a/src/mango/token/tokens/association.rs +++ b/src/mango/token/tokens/association.rs @@ -28,7 +28,7 @@ impl AssociationToken { } pub fn subpattern() -> String { - format!("{}=", Symbol::subpattern()) + format!(r"(?:{})?=", Symbol::subpattern()) } } diff --git a/src/mango/util/codeparts/operator.rs b/src/mango/util/codeparts/operator.rs index 22d39375..31dfb043 100644 --- a/src/mango/util/codeparts/operator.rs +++ b/src/mango/util/codeparts/operator.rs @@ -35,7 +35,7 @@ impl Symbol { /// Generate an eager subpattern to match tokens, that can be composed in a regular expression. pub fn subpattern() -> &'static str { - r"(\+|\-|\*|/)" + r"[\-+*/]" } } From 1d0c8d5838749a7f14a4aeb633b5ef89df85c7a3 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 19 Jun 2018 22:49:03 +0200 Subject: [PATCH 46/49] Fix various problems incl stack/queue bug #52 --- src/mango/io/util.rs | 2 +- src/mango/lexing/code_lexer.rs | 31 +++++++++--------- src/mango/lexing/combi_lexer.rs | 6 +++- src/mango/util/codeparts/operator.rs | 47 +++++++++++++++++++--------- src/mango/util/collection/queue.rs | 31 ++++++++++++++---- src/mango/util/collection/stack.rs | 15 +++++++++ 6 files changed, 93 insertions(+), 39 deletions(-) diff --git a/src/mango/io/util.rs b/src/mango/io/util.rs index a8cbb96d..258d0e8c 100644 --- a/src/mango/io/util.rs +++ b/src/mango/io/util.rs @@ -16,7 +16,7 @@ impl RegexCache { pub fn make_or_get(&mut self, subpattern: &str) -> &Regex { if !self.cache.contains_key(subpattern) { - match Regex::new(&format!("^ *{}", subpattern)) { + match Regex::new(&format!(r"^ *{}", subpattern)) { Err(err) => panic!(format!( "Invalid regular expression '{}' while adding to library; this is a bug:\n{:?}", subpattern, err diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 64a90955..0dfbda60 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -23,17 +23,13 @@ use mango::util::strslice::charsliceto; pub struct CodeLexer { indent: i32, - buffer: Queue, } // TODO: keep the regexes in thread local global scope storage impl CodeLexer { pub fn new() -> Self { - CodeLexer { - indent: 0, - buffer: Queue::new(), - } + CodeLexer { indent: 0 } } fn lex_indents(&mut self, reader: &mut Box) -> Vec { @@ -43,7 +39,7 @@ impl CodeLexer { } let mut tokens: Vec = Vec::with_capacity(8); if line_indent < self.indent { - if let Match(_) = reader.matches(r"end\s") { + if let Match(_) = reader.matches(r"end") { // If this is followed by an 'end' keyword, then that 'end' is redundant. tokens.push(Tokens::EndBlock(EndBlockToken::new(true, true))); } else { @@ -53,11 +49,12 @@ impl CodeLexer { // This line is dedented, make end tokens. tokens.push(Tokens::EndBlock(EndBlockToken::new(true, false))); } - } - for _ in self.indent..line_indent { - // This line is indented, make start tokens. - // TODO: increasing indent by more than one should be a warning - self.buffer.push(Tokens::StartBlock(StartBlockToken::new())); + } else { + for _ in self.indent..line_indent { + // This line is indented, make start tokens. + // TODO: increasing indent by more than one should be a warning + tokens.push(Tokens::StartBlock(StartBlockToken::new())); + } } self.indent = line_indent; tokens @@ -144,6 +141,12 @@ impl SubLexer for CodeLexer { return SubLexerResult::single(Tokens::Literal(LiteralToken::Real(value))); } + // Operator (before association) + if let Match(token) = reader.matches(OperatorToken::subpattern()) { + return SubLexerResult::single(Tokens::Operator( + OperatorToken::from_str(&token).unwrap(), + )); + } // Association (before operator) if let Match(token) = reader.matches(&AssociationToken::subpattern()) { debug_assert!(token.chars().last().unwrap() == '='); @@ -160,12 +163,6 @@ impl SubLexer for CodeLexer { ); } } - // Operator - if let Match(token) = reader.matches(OperatorToken::subpattern()) { - return SubLexerResult::single(Tokens::Operator( - OperatorToken::from_str(&token).unwrap(), - )); - } // Grouping symbols if let Match(_) = reader.matches(r"\(") { return SubLexerResult::single(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs index a31a02db..34b6cbe9 100644 --- a/src/mango/lexing/combi_lexer.rs +++ b/src/mango/lexing/combi_lexer.rs @@ -40,6 +40,10 @@ impl Lexer for CombiLexer { match lexer.lex_pass(&mut self.reader) { SubLexerResult::Result(tokens) => { if tokens.len() > 0 { + if tokens.len() > 1 { + // TODO + println!(">> GOING TO ADD: {:?}", tokens); + } // The sublexer produced tokens, queue them. self.buffer.append(tokens); self.lex() // TODO: if every branch does this, move it down @@ -93,7 +97,7 @@ mod tests { assert_eq!( expected, actual, - "expected: {}\nactual: {}", + "\nexpected:\n{}\nactual:\n{}", expected.to_text(), actual.to_text(), ); diff --git a/src/mango/util/codeparts/operator.rs b/src/mango/util/codeparts/operator.rs index 31dfb043..b001bfda 100644 --- a/src/mango/util/codeparts/operator.rs +++ b/src/mango/util/codeparts/operator.rs @@ -11,21 +11,32 @@ pub enum Symbol { Dash, Asterisk, Slash, + LT, + GT, + Eq, + LE, + GE, + Exclamation, + Question, } impl Symbol { pub fn new>(symbol_txt: S) -> Result { + use self::Symbol::*; let ssymbol_txt = symbol_txt.into(); match &*ssymbol_txt { - "+" => Ok(Symbol::Plus), - "-" => Ok(Symbol::Dash), - "*" => Ok(Symbol::Asterisk), - "/" => Ok(Symbol::Slash), - "<" => Ok(Symbol::Slash), - ">" => Ok(Symbol::Slash), - "==" => Ok(Symbol::Slash), - ">=" => Ok(Symbol::Slash), - "<=" => Ok(Symbol::Slash), + "+" => Ok(Plus), + "-" => Ok(Dash), + "*" => Ok(Asterisk), + "/" => Ok(Slash), + // TODO: how do I know < is an operator, rather than e.g. a generic? + "<" => Ok(LT), + ">" => Ok(GT), + "==" => Ok(Eq), + "<=" => Ok(LE), + ">=" => Ok(GE), + "!" => Ok(Exclamation), + "?" => Ok(Question), _ => Err(Msg::from_valid(&format!( "Unknown symbol: '{}'", ssymbol_txt @@ -35,20 +46,28 @@ impl Symbol { /// Generate an eager subpattern to match tokens, that can be composed in a regular expression. pub fn subpattern() -> &'static str { - r"[\-+*/]" + r"(?:\+|-|\*|/|<=|>=|==|>|<)" } } impl Display for Symbol { fn fmt(&self, f: &mut Formatter) -> fResult { + use self::Symbol::*; write!( f, "{}", match *self { - Symbol::Plus => "+", - Symbol::Dash => "-", - Symbol::Asterisk => "*", - Symbol::Slash => "/", + Plus => "+", + Dash => "-", + Asterisk => "*", + Slash => "/", + LT => "<", + GT => ">", + Eq => "==", + LE => "<=", + GE => ">=", + Exclamation => "!", + Question => "?", } ) } diff --git a/src/mango/util/collection/queue.rs b/src/mango/util/collection/queue.rs index bd239bef..3f18fad4 100644 --- a/src/mango/util/collection/queue.rs +++ b/src/mango/util/collection/queue.rs @@ -1,26 +1,45 @@ +use std::collections::VecDeque; + /// A one-ended queue. See also [Stack]. /// This is just a wrapper around vec so nobody pushes or pops the wrong end. pub struct Queue { - items: Vec, + items: VecDeque, } impl Queue { pub fn new() -> Self { Queue { - items: Vec::with_capacity(16), + items: VecDeque::with_capacity(16), } } pub fn push(&mut self, value: T) { - self.items.push(value) + self.items.push_back(value) } pub fn pop(&mut self) -> Option { - self.items.pop() + self.items.pop_front() } /// Moves all the elements from a vector into the queue. - pub fn append(&mut self, mut other: Vec) { - self.items.append(&mut other); + pub fn append(&mut self, other: Vec) { + for item in other.into_iter() { + self.items.push_back(item); + } + } +} + +#[cfg(test)] +mod tests { + use super::Queue; + + #[test] + fn test_queue() { + let mut queue: Queue = Queue::new(); + queue.push(1); + queue.push(2); + assert_eq!(1, queue.pop().unwrap()); + assert_eq!(2, queue.pop().unwrap()); + assert!(queue.pop().is_none()); } } diff --git a/src/mango/util/collection/stack.rs b/src/mango/util/collection/stack.rs index 055c0a18..942e43cd 100644 --- a/src/mango/util/collection/stack.rs +++ b/src/mango/util/collection/stack.rs @@ -25,3 +25,18 @@ impl Stack { self.items.back_mut() } } + +#[cfg(test)] +mod tests { + use super::Stack; + + #[test] + fn test_stack() { + let mut stack: Stack = Stack::new(); + stack.push(1); + stack.push(2); + assert_eq!(2, stack.pop().unwrap()); + assert_eq!(1, stack.pop().unwrap()); + assert!(stack.pop().is_none()); + } +} From f79c75bd01facb17032fffbec5b6c0ce94b33db1 Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 20 Jun 2018 08:21:48 +0200 Subject: [PATCH 47/49] MWE of the lexer works! #52 --- src/mango/lexing/code_lexer.rs | 19 +++++++++---------- src/mango/lexing/combi_lexer.rs | 12 +++++------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 0dfbda60..18adb6ee 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -22,7 +22,7 @@ use mango::util::strslice::char_ops::CharOps; use mango::util::strslice::charsliceto; pub struct CodeLexer { - indent: i32, + indent: i32, // -1: finished } // TODO: keep the regexes in thread local global scope storage @@ -141,12 +141,6 @@ impl SubLexer for CodeLexer { return SubLexerResult::single(Tokens::Literal(LiteralToken::Real(value))); } - // Operator (before association) - if let Match(token) = reader.matches(OperatorToken::subpattern()) { - return SubLexerResult::single(Tokens::Operator( - OperatorToken::from_str(&token).unwrap(), - )); - } // Association (before operator) if let Match(token) = reader.matches(&AssociationToken::subpattern()) { debug_assert!(token.chars().last().unwrap() == '='); @@ -163,6 +157,12 @@ impl SubLexer for CodeLexer { ); } } + // Operator (after association) + if let Match(token) = reader.matches(OperatorToken::subpattern()) { + return SubLexerResult::single(Tokens::Operator( + OperatorToken::from_str(&token).unwrap(), + )); + } // Grouping symbols if let Match(_) = reader.matches(r"\(") { return SubLexerResult::single(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); @@ -176,16 +176,15 @@ impl SubLexer for CodeLexer { Match(word) => SubLexerResult::single(Tokens::Unlexable(UnlexableToken::new(word))), NoMatch() => panic!("Do not know how to proceed with parsing"), EOF() => { - if self.indent <= 0 { + if self.indent < 0 { return SubLexerResult::End; } - // TODO: currently the EndStatement is only made if the file stops on an indented line let mut tokens = vec![Tokens::EndStatement(EndStatementToken::new_end_line())]; for _ in 0..self.indent { // This line is dedented, make end tokens. tokens.push(Tokens::EndBlock(EndBlockToken::new(true, false))); } - self.indent = 0; + self.indent = -1; SubLexerResult::Result(tokens) } }; diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs index 34b6cbe9..f7fcb8f7 100644 --- a/src/mango/lexing/combi_lexer.rs +++ b/src/mango/lexing/combi_lexer.rs @@ -40,10 +40,6 @@ impl Lexer for CombiLexer { match lexer.lex_pass(&mut self.reader) { SubLexerResult::Result(tokens) => { if tokens.len() > 0 { - if tokens.len() > 1 { - // TODO - println!(">> GOING TO ADD: {:?}", tokens); - } // The sublexer produced tokens, queue them. self.buffer.append(tokens); self.lex() // TODO: if every branch does this, move it down @@ -107,9 +103,10 @@ mod tests { fn test_lexing_individual() { assert_text_to_tokens( "if", - vec![Tokens::Keyword( - KeywordToken::from_str("if".to_owned()).unwrap(), - )], + vec![ + Tokens::Keyword(KeywordToken::from_str("if".to_owned()).unwrap()), + Tokens::EndStatement(EndStatementToken::new_end_line()), + ], ); // todo: more } @@ -133,6 +130,7 @@ mod tests { Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), Tokens::Association(AssociationToken::from_str("+".to_owned()).unwrap()), Tokens::Literal(LiteralToken::Int(1)), + Tokens::EndStatement(EndStatementToken::new_end_line()), Tokens::EndBlock(EndBlockToken::new(true, false)), ], ); From cdda4d413e80ffc129f2109580dfde82d22035b5 Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 20 Jun 2018 20:35:19 +0200 Subject: [PATCH 48/49] Solve most compiler warnings #52 --- rustfmt.toml | 1 + src/mango/ast_full/node/assignment.rs | 6 +-- src/mango/ast_full/node/unary_operation.rs | 6 +-- src/mango/ast_full/terminal/literal.rs | 4 +- src/mango/io/util.rs | 4 +- src/mango/lexing/code_lexer.rs | 58 +++++++++++----------- src/mango/lexing/combi_lexer.rs | 20 +------- src/mango/lexing/string_lexer.rs | 10 ++-- src/mango/lexing/util/lex_all.rs | 9 ++-- src/mango/lexing/util/mod.rs | 2 + src/mango/lexing/util/test_util.rs | 19 +++++++ src/mango/token/tokens/association.rs | 4 +- src/mango/token/tokens/block.rs | 5 +- src/mango/towasm/control/block.rs | 6 +-- src/mango/towasm/control/repeat.rs | 6 +-- src/mango/towasm/numeric/arithmetic.rs | 24 ++------- src/mango/towasm/numeric/logic.rs | 24 ++------- src/mango/towasm/scope/function.rs | 25 ++-------- src/mango/towasm/scope/module.rs | 6 +-- src/mango/towasm/tests.rs | 34 ++++--------- src/mango/towasm/values/assign.rs | 6 +-- src/mango/towasm/values/localvar.rs | 10 +--- src/mango/util/codeparts/keyword.rs | 5 +- src/mango/util/codeparts/operator.rs | 5 +- src/mango/util/errors/code_problem.rs | 15 ++---- src/mango/util/errors/collector.rs | 26 ++-------- src/mango/util/format/strings.rs | 5 +- src/mango/util/numtype/eqfloat.rs | 10 +--- src/mango/util/parsetxt/int.rs | 7 ++- src/mango/util/parsetxt/real.rs | 21 ++------ src/mango/util/strslice/slice.rs | 5 +- src/mango/util/strtype/msg.rs | 4 +- src/mango/util/strtype/name.rs | 30 +++-------- 33 files changed, 120 insertions(+), 302 deletions(-) create mode 100644 src/mango/lexing/util/test_util.rs diff --git a/rustfmt.toml b/rustfmt.toml index 44148a2d..eaf1d122 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1 +1,2 @@ reorder_imports = true +max_width = 140 diff --git a/src/mango/ast_full/node/assignment.rs b/src/mango/ast_full/node/assignment.rs index 558d6d3a..4b060303 100644 --- a/src/mango/ast_full/node/assignment.rs +++ b/src/mango/ast_full/node/assignment.rs @@ -21,11 +21,7 @@ impl AssignmentAST { impl ToText for AssignmentAST { fn to_text(&self) -> String { - return format!( - "{0:} = ({1:})", - self.assignee.to_text(), - self.value.to_text() - ); + return format!("{0:} = ({1:})", self.assignee.to_text(), self.value.to_text()); } } diff --git a/src/mango/ast_full/node/unary_operation.rs b/src/mango/ast_full/node/unary_operation.rs index 751105a3..36fc77b1 100644 --- a/src/mango/ast_full/node/unary_operation.rs +++ b/src/mango/ast_full/node/unary_operation.rs @@ -22,11 +22,7 @@ impl UnaryOperationAST { impl ToText for UnaryOperationAST { fn to_text(&self) -> String { - return format!( - "({0:} {1:})", - self.operator.to_text(), - self.subject.to_text() - ); + return format!("({0:} {1:})", self.operator.to_text(), self.subject.to_text()); } } diff --git a/src/mango/ast_full/terminal/literal.rs b/src/mango/ast_full/terminal/literal.rs index b585461d..74d907e8 100644 --- a/src/mango/ast_full/terminal/literal.rs +++ b/src/mango/ast_full/terminal/literal.rs @@ -31,9 +31,7 @@ pub struct StringLiteralAST { impl FloatLiteralAST { pub fn new(value: f64) -> Self { - FloatLiteralAST { - value: f64eq::new(value), - } + FloatLiteralAST { value: f64eq::new(value) } } } diff --git a/src/mango/io/util.rs b/src/mango/io/util.rs index 258d0e8c..e70c250e 100644 --- a/src/mango/io/util.rs +++ b/src/mango/io/util.rs @@ -9,9 +9,7 @@ pub struct RegexCache { impl RegexCache { // Not public to prevent having more than one instance. fn new() -> Self { - RegexCache { - cache: HashMap::new(), - } + RegexCache { cache: HashMap::new() } } pub fn make_or_get(&mut self, subpattern: &str) -> &Regex { diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 18adb6ee..8134247f 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -1,8 +1,6 @@ use mango::io::typ::Reader; use mango::io::typ::ReaderResult::*; use mango::lexing::string_lexer::StringLexer; -use mango::lexing::typ::Lexer; -use mango::lexing::typ::MaybeToken; use mango::lexing::typ::SubLexer; use mango::lexing::typ::SubLexerResult; use mango::token::special::UnlexableToken; @@ -17,7 +15,6 @@ use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; use mango::token::Tokens; -use mango::util::collection::Queue; use mango::util::strslice::char_ops::CharOps; use mango::util::strslice::charsliceto; @@ -72,8 +69,6 @@ impl SubLexer for CodeLexer { fn lex_pass(&mut self, reader: &mut Box) -> SubLexerResult { use self::SubLexerResult::*; - // TODO: put all these match results inline - // End of line continuation if let Match(_) = reader.matches(r"\.\.\.") { // Line continuation has no token, it just continues on the next line, ignoring indents (for now). @@ -83,8 +78,7 @@ impl SubLexer for CodeLexer { // The rest of this line is unparsable. if let Match(word) = reader.matches("[^\\n]*\\n\\r?") { // This is a new line, so there may be indents. - return self - .token_and_indents(reader, Tokens::Unlexable(UnlexableToken::new(word))); + return self.token_and_indents(reader, Tokens::Unlexable(UnlexableToken::new(word))); } else { // TODO: I don't know yet how to deal with '...' followed by end-of-file panic!() @@ -95,24 +89,16 @@ impl SubLexer for CodeLexer { if let Match(_) = reader.matches("\\n\\r?") { // Newline WITHOUT line continuation. // This is a new line, so there may be indents. - return self.token_and_indents( - reader, - Tokens::EndStatement(EndStatementToken::new_end_line()), - ); + return self.token_and_indents(reader, Tokens::EndStatement(EndStatementToken::new_end_line())); } // End of statement if let Match(_) = reader.matches(";") { // Semicolon, which ends a statement. if let Match(_) = reader.matches("\\n\\r?") { // If semicolon is followed by a newline, it is redundant. Deal with indents (but ignore the newline itself). - return self.token_and_indents( - reader, - Tokens::EndStatement(EndStatementToken::new_semicolon()), - ); + return self.token_and_indents(reader, Tokens::EndStatement(EndStatementToken::new_semicolon())); } else { - return SubLexerResult::single(Tokens::EndStatement( - EndStatementToken::new_semicolon(), - )); + return SubLexerResult::single(Tokens::EndStatement(EndStatementToken::new_semicolon())); } } // @@ -124,9 +110,7 @@ impl SubLexer for CodeLexer { if let Ok(keyword) = KeywordToken::from_str(word.clone()) { return SubLexerResult::single(Tokens::Keyword(keyword)); } - return SubLexerResult::single(Tokens::Identifier( - IdentifierToken::from_str(word).unwrap(), - )); + return SubLexerResult::single(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); } // Literal if let Match(_) = reader.matches("[a-z]?\"") { @@ -146,22 +130,16 @@ impl SubLexer for CodeLexer { debug_assert!(token.chars().last().unwrap() == '='); if token.char_len() > 1 { match AssociationToken::from_str(charsliceto(token, -1)) { - Ok(association) => { - return SubLexerResult::single((Tokens::Association(association))) - } + Ok(association) => return SubLexerResult::single(Tokens::Association(association)), Err(msg) => panic!(format!("Invalid association prefix: {}", msg)), } } else { - return SubLexerResult::single( - (Tokens::Association(AssociationToken::from_unprefixed())), - ); + return SubLexerResult::single(Tokens::Association(AssociationToken::from_unprefixed())); } } // Operator (after association) if let Match(token) = reader.matches(OperatorToken::subpattern()) { - return SubLexerResult::single(Tokens::Operator( - OperatorToken::from_str(&token).unwrap(), - )); + return SubLexerResult::single(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); } // Grouping symbols if let Match(_) = reader.matches(r"\(") { @@ -190,3 +168,23 @@ impl SubLexer for CodeLexer { }; } } + +#[cfg(test)] +mod tests { + use mango::lexing::util::test_util::assert_text_to_tokens; + use mango::token::tokens::EndStatementToken; + use mango::token::tokens::KeywordToken; + use mango::token::Tokens; + + #[test] + fn test_lexing_individual() { + assert_text_to_tokens( + "if", + vec![ + Tokens::Keyword(KeywordToken::from_str("if".to_owned()).unwrap()), + Tokens::EndStatement(EndStatementToken::new_end_line()), + ], + ); + // todo: more + } +} diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs index f7fcb8f7..09d6a520 100644 --- a/src/mango/lexing/combi_lexer.rs +++ b/src/mango/lexing/combi_lexer.rs @@ -77,19 +77,13 @@ mod tests { use mango::token::tokens::KeywordToken; use mango::token::tokens::LiteralToken; use mango::token::tokens::OperatorToken; - use mango::token::tokens::ParenthesisCloseToken; - use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; use mango::token::Tokens; use mango::util::encdec::to_text::ToText; - use std::cell::RefCell; - use std::rc::Rc; fn assert_text_to_tokens(text: &str, tokens: Vec) { let expected = LexList::from_tokens(tokens); - let actual = lex_all(&mut CombiLexer::new(Box::new(StringReader::new( - text.to_owned(), - )))); + let actual = lex_all(&mut CombiLexer::new(Box::new(StringReader::new(text.to_owned())))); assert_eq!( expected, actual, @@ -99,18 +93,6 @@ mod tests { ); } - #[test] - fn test_lexing_individual() { - assert_text_to_tokens( - "if", - vec![ - Tokens::Keyword(KeywordToken::from_str("if".to_owned()).unwrap()), - Tokens::EndStatement(EndStatementToken::new_end_line()), - ], - ); - // todo: more - } - #[test] fn test_lexing_combined() { assert_text_to_tokens( diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs index dbc68034..270bced4 100644 --- a/src/mango/lexing/string_lexer.rs +++ b/src/mango/lexing/string_lexer.rs @@ -5,6 +5,7 @@ use mango::lexing::typ::SubLexerResult; use mango::token::tokens::LiteralToken; use mango::token::Tokens; +#[allow(dead_code)] // TODO: TMP pub enum StringType { SingleQuotedInline, DoubleQuotedInline, @@ -13,6 +14,7 @@ pub enum StringType { /// Lexes a string literal token. // Starts after the opening quote and expected to consume until closing quote. +#[allow(dead_code)] // TODO: TMP pub struct StringLexer { typ: StringType, } @@ -32,13 +34,9 @@ impl SubLexer for StringLexer { // TODO: doesn't handle escaping etc at all now // TODO: this is going to have a problem if `matches` automatically eats whitespace match reader.matches("[^\"\\n]*") { - Match(value) => { - return SubLexerResult::single(Tokens::Literal(LiteralToken::string(value))) - } + Match(value) => return SubLexerResult::single(Tokens::Literal(LiteralToken::string(value))), NoMatch() => panic!("failed to parse string"), // This can't really go wrong since empty pattern matches - EOF() => { - return SubLexerResult::single(Tokens::Literal(LiteralToken::string("".to_owned()))) - } // Unclosed string literal, let code parser deal with it + EOF() => return SubLexerResult::single(Tokens::Literal(LiteralToken::string("".to_owned()))), // Unclosed string literal, let code parser deal with it } } } diff --git a/src/mango/lexing/util/lex_all.rs b/src/mango/lexing/util/lex_all.rs index e10557a8..a6aee685 100644 --- a/src/mango/lexing/util/lex_all.rs +++ b/src/mango/lexing/util/lex_all.rs @@ -14,6 +14,7 @@ impl LexList { LexList { tokens } } + #[allow(unused)] pub fn from_reader(lexer: &mut Lexer) -> Self { lex_all(lexer) } @@ -21,11 +22,7 @@ impl LexList { impl ToText for LexList { fn to_text(&self) -> String { - self.tokens - .iter() - .map(|token| token.to_text()) - .collect::>() - .join(" ") + self.tokens.iter().map(|token| token.to_text()).collect::>().join(" ") } } @@ -35,5 +32,5 @@ pub fn lex_all(lexer: &mut Lexer) -> LexList { list.push(token) } list.shrink_to_fit(); - LexList { tokens: list } + LexList::from_tokens(list) } diff --git a/src/mango/lexing/util/mod.rs b/src/mango/lexing/util/mod.rs index 52be7fa1..37351b8a 100644 --- a/src/mango/lexing/util/mod.rs +++ b/src/mango/lexing/util/mod.rs @@ -1 +1,3 @@ pub mod lex_all; + +pub mod test_util; diff --git a/src/mango/lexing/util/test_util.rs b/src/mango/lexing/util/test_util.rs new file mode 100644 index 00000000..aa5c0ece --- /dev/null +++ b/src/mango/lexing/util/test_util.rs @@ -0,0 +1,19 @@ +use mango::io::fortest::stringreader::StringReader; +use mango::lexing::combi_lexer::CombiLexer; +use mango::lexing::util::lex_all::lex_all; +use mango::lexing::util::lex_all::LexList; +use mango::token::Tokens; +use mango::util::encdec::to_text::ToText; + +#[allow(dead_code)] +pub fn assert_text_to_tokens(text: &str, tokens: Vec) { + let expected = LexList::from_tokens(tokens); + let actual = lex_all(&mut CombiLexer::new(Box::new(StringReader::new(text.to_owned())))); + assert_eq!( + expected, + actual, + "\nexpected:\n{}\nactual:\n{}", + expected.to_text(), + actual.to_text(), + ); +} diff --git a/src/mango/token/tokens/association.rs b/src/mango/token/tokens/association.rs index a5be9028..4f086166 100644 --- a/src/mango/token/tokens/association.rs +++ b/src/mango/token/tokens/association.rs @@ -12,9 +12,7 @@ pub struct AssociationToken { impl AssociationToken { pub fn from_unprefixed() -> Self { - AssociationToken { - symbol: Option::None, - } + AssociationToken { symbol: Option::None } } pub fn from_str>(symbol_txt: S) -> Result { diff --git a/src/mango/token/tokens/block.rs b/src/mango/token/tokens/block.rs index 64a3041f..c538cf2a 100644 --- a/src/mango/token/tokens/block.rs +++ b/src/mango/token/tokens/block.rs @@ -20,10 +20,7 @@ impl StartBlockToken { impl EndBlockToken { pub fn new(is_dedent: bool, is_end_keyword: bool) -> Self { assert!(is_dedent || is_end_keyword); - EndBlockToken { - is_dedent, - is_end_keyword, - } + EndBlockToken { is_dedent, is_end_keyword } } } diff --git a/src/mango/towasm/control/block.rs b/src/mango/towasm/control/block.rs index 63ac5a06..770555aa 100644 --- a/src/mango/towasm/control/block.rs +++ b/src/mango/towasm/control/block.rs @@ -64,11 +64,7 @@ impl Block { impl Wasm for Block { fn as_wat(&self) -> String { - format!( - "(block {0:}\n{1:}\n) ;; block {0:}", - self.name.as_wat(), - self.group.as_wat() - ) + format!("(block {0:}\n{1:}\n) ;; block {0:}", self.name.as_wat(), self.group.as_wat()) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { diff --git a/src/mango/towasm/control/repeat.rs b/src/mango/towasm/control/repeat.rs index 3672a981..594d974c 100644 --- a/src/mango/towasm/control/repeat.rs +++ b/src/mango/towasm/control/repeat.rs @@ -34,11 +34,7 @@ impl Loop { impl Wasm for Loop { fn as_wat(&self) -> String { - format!( - "loop {0:}\n{1:}\nend ;; loop {0:}", - self.name.as_wat(), - self.group.as_wat() - ) + format!("loop {0:}\n{1:}\nend ;; loop {0:}", self.name.as_wat(), self.group.as_wat()) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { diff --git a/src/mango/towasm/numeric/arithmetic.rs b/src/mango/towasm/numeric/arithmetic.rs index 3b2fcede..7faac2bd 100644 --- a/src/mango/towasm/numeric/arithmetic.rs +++ b/src/mango/towasm/numeric/arithmetic.rs @@ -13,10 +13,7 @@ pub struct Add { impl Add { pub fn new(left: Box, right: Box) -> Box { assert!(left.typ() == right.typ()); - Box::new(Add { - left: left, - right: right, - }) + Box::new(Add { left: left, right: right }) } pub fn typ(&self) -> &Type { @@ -26,12 +23,7 @@ impl Add { impl Wasm for Add { fn as_wat(&self) -> String { - format!( - "{}\n{}\n{}.add", - self.left.as_wat(), - self.right.as_wat(), - self.typ().as_wat(), - ) + format!("{}\n{}\n{}.add", self.left.as_wat(), self.right.as_wat(), self.typ().as_wat(),) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { @@ -54,10 +46,7 @@ pub struct Mul { impl Mul { pub fn new(left: Box, right: Box) -> Box { assert!(left.typ() == right.typ()); - Box::new(Mul { - left: left, - right: right, - }) + Box::new(Mul { left: left, right: right }) } pub fn typ(&self) -> &Type { @@ -67,12 +56,7 @@ impl Mul { impl Wasm for Mul { fn as_wat(&self) -> String { - format!( - "{}\n{}\n{}.mul", - self.left.as_wat(), - self.right.as_wat(), - self.typ().as_wat(), - ) + format!("{}\n{}\n{}.mul", self.left.as_wat(), self.right.as_wat(), self.typ().as_wat(),) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { diff --git a/src/mango/towasm/numeric/logic.rs b/src/mango/towasm/numeric/logic.rs index eeb5bd51..92fa2d99 100644 --- a/src/mango/towasm/numeric/logic.rs +++ b/src/mango/towasm/numeric/logic.rs @@ -13,21 +13,13 @@ pub struct Gt { impl Gt { pub fn new(left: Box, right: Box) -> Box { assert!(left.typ() == right.typ()); - Box::new(Gt { - left: left, - right: right, - }) + Box::new(Gt { left: left, right: right }) } } impl Wasm for Gt { fn as_wat(&self) -> String { - format!( - "{}\n{}\n{}.gt_s", - self.left.as_wat(), - self.right.as_wat(), - self.typ().as_wat(), - ) + format!("{}\n{}\n{}.gt_s", self.left.as_wat(), self.right.as_wat(), self.typ().as_wat(),) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { @@ -50,10 +42,7 @@ pub struct Lt { impl Lt { pub fn new(left: Box, right: Box) -> Box { assert!(left.typ() == right.typ()); - Box::new(Lt { - left: left, - right: right, - }) + Box::new(Lt { left: left, right: right }) } pub fn typ(&self) -> &Type { @@ -63,12 +52,7 @@ impl Lt { impl Wasm for Lt { fn as_wat(&self) -> String { - format!( - "{}\n{}\n{}.lt_s", - self.left.as_wat(), - self.right.as_wat(), - self.typ().as_wat(), - ) + format!("{}\n{}\n{}.lt_s", self.left.as_wat(), self.right.as_wat(), self.typ().as_wat(),) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { diff --git a/src/mango/towasm/scope/function.rs b/src/mango/towasm/scope/function.rs index 3847c4f9..61c7690c 100644 --- a/src/mango/towasm/scope/function.rs +++ b/src/mango/towasm/scope/function.rs @@ -75,11 +75,7 @@ pub struct FunctionSignature { impl FunctionSignature { pub fn new(name: Rc, parameters: Vec>, results: Vec>) -> Self { assert!(results.len() <= 1); // - FunctionSignature { - name, - parameters, - results, - } + FunctionSignature { name, parameters, results } } } @@ -89,16 +85,8 @@ impl Wasm for FunctionSignature { "func {} (export \"{}\") {} {}", self.name.as_wat(), self.name.pure_name(), - self.parameters - .iter() - .map(|func| func.as_wat()) - .collect::>() - .join("\n"), - self.results - .iter() - .map(|func| func.as_wat()) - .collect::>() - .join("\n") + self.parameters.iter().map(|func| func.as_wat()).collect::>().join("\n"), + self.results.iter().map(|func| func.as_wat()).collect::>().join("\n") ) } @@ -114,12 +102,7 @@ pub struct Function { impl Function { // This uses group, so it has a label, but this isn't final... It might be useless. - pub fn new( - name: Rc, - parameters: Vec>, - results: Vec>, - statements_gen: F, - ) -> Box + pub fn new(name: Rc, parameters: Vec>, results: Vec>, statements_gen: F) -> Box where F: FnOnce(Label) -> Vec>, { diff --git a/src/mango/towasm/scope/module.rs b/src/mango/towasm/scope/module.rs index 95bcc155..c46cbe50 100644 --- a/src/mango/towasm/scope/module.rs +++ b/src/mango/towasm/scope/module.rs @@ -18,11 +18,7 @@ impl Wasm for Module { fn as_wat(&self) -> String { format!( "(module\n{}\n) ;; module", - self.functions - .iter() - .map(|func| func.as_wat()) - .collect::>() - .join("\n") + self.functions.iter().map(|func| func.as_wat()).collect::>().join("\n") ) } diff --git a/src/mango/towasm/tests.rs b/src/mango/towasm/tests.rs index 0bf72f76..f5941bde 100644 --- a/src/mango/towasm/tests.rs +++ b/src/mango/towasm/tests.rs @@ -26,37 +26,23 @@ fn test_example_1() { vec![param_n], vec![Output::new(Type::Int32)], |func_label: Label| { - let fac_result_decl = - DeclareLocal::new(Name::new("fac_result".to_owned()).unwrap(), Type::Int32); + let fac_result_decl = DeclareLocal::new(Name::new("fac_result".to_owned()).unwrap(), Type::Int32); let fac_result = fac_result_decl.local(); - let loop_condition_decl = - DeclareLocal::new(Name::new("loop_condition".to_owned()).unwrap(), Type::Bool); + let loop_condition_decl = DeclareLocal::new(Name::new("loop_condition".to_owned()).unwrap(), Type::Bool); let loop_condition = loop_condition_decl.local(); vec![ // Function body fac_result_decl, loop_condition_decl, Assign::new(fac_result.clone(), Const::new(Type::Int32, Value::Int(1))), - Loop::new_named( - Name::new("fac_loop".to_owned()).unwrap(), - |loop_label: Label| { - vec![ - Assign::new( - fac_result.clone(), - Mul::new(fac_result.get(), var_n.get()), - ), - Assign::new( - loop_condition.clone(), - Gt::new(var_n.get(), Const::new(Type::Int32, Value::Int(2))), - ), - Assign::new( - var_n.clone(), - Add::new(var_n.get(), Const::new(Type::Int32, Value::Int(-1))), - ), - BranchIf::new(loop_condition.get(), loop_label), - ] - }, - ), + Loop::new_named(Name::new("fac_loop".to_owned()).unwrap(), |loop_label: Label| { + vec![ + Assign::new(fac_result.clone(), Mul::new(fac_result.get(), var_n.get())), + Assign::new(loop_condition.clone(), Gt::new(var_n.get(), Const::new(Type::Int32, Value::Int(2)))), + Assign::new(var_n.clone(), Add::new(var_n.get(), Const::new(Type::Int32, Value::Int(-1)))), + BranchIf::new(loop_condition.get(), loop_label), + ] + }), Return::new(func_label, fac_result.get()), ] }, diff --git a/src/mango/towasm/values/assign.rs b/src/mango/towasm/values/assign.rs index 2044fefc..b4fd64bb 100644 --- a/src/mango/towasm/values/assign.rs +++ b/src/mango/towasm/values/assign.rs @@ -18,11 +18,7 @@ impl Assign { impl Wasm for Assign { fn as_wat(&self) -> String { - format!( - "{}\nset_local {}", - self.value.as_wat(), - self.assignee.as_wat() - ) + format!("{}\nset_local {}", self.value.as_wat(), self.assignee.as_wat()) // set_local $fac_result } diff --git a/src/mango/towasm/values/localvar.rs b/src/mango/towasm/values/localvar.rs index 6af061bc..47536f1d 100644 --- a/src/mango/towasm/values/localvar.rs +++ b/src/mango/towasm/values/localvar.rs @@ -40,11 +40,7 @@ impl DeclareLocal { impl Wasm for DeclareLocal { fn as_wat(&self) -> String { - format!( - "(local {} {})", - self.local.name().as_wat(), - self.local.typ().as_wat() - ) + format!("(local {} {})", self.local.name().as_wat(), self.local.typ().as_wat()) } fn write_wasm(&self, file: &mut File) -> io::Result<()> { @@ -69,9 +65,7 @@ pub struct Local { impl Local { pub fn get(&self) -> Box { Box::new(GetLocal { - local: Local { - inner: self.inner.clone(), - }, + local: Local { inner: self.inner.clone() }, }) } diff --git a/src/mango/util/codeparts/keyword.rs b/src/mango/util/codeparts/keyword.rs index f3f176d4..92605849 100644 --- a/src/mango/util/codeparts/keyword.rs +++ b/src/mango/util/codeparts/keyword.rs @@ -166,10 +166,7 @@ impl Keyword { "xor" => Ok(Reserved("xor".to_owned())), "yield" => Ok(Reserved("yield".to_owned())), - _ => Err(Msg::from_valid(&format!( - "Unknown keywords: '{}'", - ssymbol_txt - ))), + _ => Err(Msg::from_valid(&format!("Unknown keywords: '{}'", ssymbol_txt))), } } } diff --git a/src/mango/util/codeparts/operator.rs b/src/mango/util/codeparts/operator.rs index b001bfda..625331ca 100644 --- a/src/mango/util/codeparts/operator.rs +++ b/src/mango/util/codeparts/operator.rs @@ -37,10 +37,7 @@ impl Symbol { ">=" => Ok(GE), "!" => Ok(Exclamation), "?" => Ok(Question), - _ => Err(Msg::from_valid(&format!( - "Unknown symbol: '{}'", - ssymbol_txt - ))), + _ => Err(Msg::from_valid(&format!("Unknown symbol: '{}'", ssymbol_txt))), } } diff --git a/src/mango/util/errors/code_problem.rs b/src/mango/util/errors/code_problem.rs index 6919824a..f28d20a5 100644 --- a/src/mango/util/errors/code_problem.rs +++ b/src/mango/util/errors/code_problem.rs @@ -109,17 +109,8 @@ mod tests { #[test] fn test_new_problem() { - CodeProblem::error( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); - CodeProblem::warning( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); - CodeProblem::debug( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); + CodeProblem::error(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); + CodeProblem::warning(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); + CodeProblem::debug(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); } } diff --git a/src/mango/util/errors/collector.rs b/src/mango/util/errors/collector.rs index 619ed139..86a18828 100644 --- a/src/mango/util/errors/collector.rs +++ b/src/mango/util/errors/collector.rs @@ -49,33 +49,17 @@ mod tests { #[test] fn test_iter_collector() { let mut collector = ProblemCollector::new(); - collector.error( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); + collector.error(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); let cnt = collector.into_iter().count(); assert_eq!(1, cnt, "No item in ProblemCollector"); - assert_eq!( - cnt, - collector.into_iter().count(), - "Failed to iterate over ProblemCollector twice" - ) + assert_eq!(cnt, collector.into_iter().count(), "Failed to iterate over ProblemCollector twice") } #[test] fn test_new_problem() { let mut collector = ProblemCollector::new(); - collector.error( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); - collector.warning( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); - collector.debug( - Msg::copy_new("test problem").unwrap(), - Context::new("test context".to_string()), - ); + collector.error(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); + collector.warning(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); + collector.debug(Msg::copy_new("test problem").unwrap(), Context::new("test context".to_string())); } } diff --git a/src/mango/util/format/strings.rs b/src/mango/util/format/strings.rs index 7161c5a9..bcacf3ee 100644 --- a/src/mango/util/format/strings.rs +++ b/src/mango/util/format/strings.rs @@ -26,10 +26,7 @@ mod tests { assert_eq!("\"hello\\nworld\"", to_double_quoted_str("hello\nworld")); assert_eq!("\"hello\\\\ world\"", to_double_quoted_str("hello\\ world")); assert_eq!("\"hello\\\"world\"", to_double_quoted_str("hello\"world")); - assert_eq!( - "\"\\\"\\\"\\\"\\n\\\\\"", - to_double_quoted_str("\"\"\"\n\\") - ); + assert_eq!("\"\\\"\\\"\\\"\\n\\\\\"", to_double_quoted_str("\"\"\"\n\\")); assert_eq!("\"\\\\n\"", to_double_quoted_str("\\n")); assert_eq!("\"\\\\\\n\"", to_double_quoted_str("\\\n")); } diff --git a/src/mango/util/numtype/eqfloat.rs b/src/mango/util/numtype/eqfloat.rs index df09318d..eb570d45 100644 --- a/src/mango/util/numtype/eqfloat.rs +++ b/src/mango/util/numtype/eqfloat.rs @@ -165,14 +165,8 @@ mod tests { assert_eq!(get_hash(f64eq::new(PI)), get_hash(f64eq::new(PI))); assert_ne!(get_hash(f64eq::new(42.)), get_hash(f64eq::new(-42.))); assert_eq!(get_hash(f64eq::new(0.)), get_hash(f64eq::new(-0.))); - assert_eq!( - get_hash(f64eq::new(INFINITY)), - get_hash(f64eq::new(INFINITY)) - ); - assert_ne!( - get_hash(f64eq::new(INFINITY)), - get_hash(f64eq::new(NEG_INFINITY)) - ); + assert_eq!(get_hash(f64eq::new(INFINITY)), get_hash(f64eq::new(INFINITY))); + assert_ne!(get_hash(f64eq::new(INFINITY)), get_hash(f64eq::new(NEG_INFINITY))); assert_ne!(get_hash(f64eq::new(42.)), get_hash(f64eq::new(NAN))); assert_ne!(get_hash(f64eq::new(NAN)), get_hash(f64eq::new(42.))); assert_eq!(get_hash(f64eq::new(NAN)), get_hash(f64eq::new(NAN))); diff --git a/src/mango/util/parsetxt/int.rs b/src/mango/util/parsetxt/int.rs index 38bdb248..89911ecf 100644 --- a/src/mango/util/parsetxt/int.rs +++ b/src/mango/util/parsetxt/int.rs @@ -18,10 +18,7 @@ pub fn int_pattern() -> &'static str { /// Convert a String that matches [int_pattern] to an i64 integer. Overflow is possible. pub fn parse_int>(text: S) -> Result { let text = text.into(); - match Regex::new(&format!("^{}$", int_pattern())) - .unwrap() - .captures(&text) - { + match Regex::new(&format!("^{}$", int_pattern())).unwrap().captures(&text) { None => return Err(IntParseFailReason::Invalid), Some(captures) => { // // Sign @@ -78,6 +75,7 @@ mod tests { assert_eq!(9, parse_int("09").unwrap()); } + #[test] fn test_invalid_b10_ints() { assert!(parse_int("0x9").is_err()); assert!(parse_int("A").is_err()); @@ -86,6 +84,7 @@ mod tests { // TODO: over/underflow } + #[test] fn test_parse_based_ints() { // TODO: not implemented yet } diff --git a/src/mango/util/parsetxt/real.rs b/src/mango/util/parsetxt/real.rs index b99925ff..20e805c7 100644 --- a/src/mango/util/parsetxt/real.rs +++ b/src/mango/util/parsetxt/real.rs @@ -23,10 +23,7 @@ pub fn real_pattern() -> &'static str { /// Convert a String that matches [real_pattern] to an f64 real. Overflow and loss of precision is possible. pub fn parse_real>(text: S) -> Result { let text = text.into(); - match Regex::new(&format!("^{}$", real_pattern())) - .unwrap() - .captures(&text) - { + match Regex::new(&format!("^{}$", real_pattern())).unwrap().captures(&text) { None => return Err(RealParseFailReason::Invalid), Some(captures) => { let multiplier = captures @@ -43,11 +40,7 @@ pub fn parse_real>(text: S) -> Result } Some(exponent_match) => { // This real is in exponential notation - let exponent = exponent_match - .as_str() - .without_char(&'_') - .parse::() - .unwrap(); + let exponent = exponent_match.as_str().without_char(&'_').parse::().unwrap(); // TODO: is there a numerically smarter way to do this? return Ok(10f64.powf(exponent) * multiplier); } @@ -80,17 +73,11 @@ mod tests { assert!(close(-0.1, parse_real("-.1e0").unwrap())); assert!(close(-1., parse_real("-1.e0").unwrap())); assert!(close(42., parse_real("42.0e+0").unwrap())); - assert!(close( - 12345.6789, - parse_real("1_2_3_4_5.6_7_8_9e0").unwrap() - )); + assert!(close(12345.6789, parse_real("1_2_3_4_5.6_7_8_9e0").unwrap())); assert!(close(0.42, parse_real("42.0e-2").unwrap())); assert!(close(-0.001, parse_real("-.1e-2").unwrap())); assert!(close(-0.01, parse_real("-1.e-2").unwrap())); - assert!(close( - 123.456789, - parse_real("1_2_3_4_5.6_7_8_9e-2").unwrap() - )); + assert!(close(123.456789, parse_real("1_2_3_4_5.6_7_8_9e-2").unwrap())); assert!(close(42.0, parse_real("42.0e-0_0_0").unwrap())); } diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs index 60ab23a8..35fb70da 100644 --- a/src/mango/util/strslice/slice.rs +++ b/src/mango/util/strslice/slice.rs @@ -23,10 +23,7 @@ pub fn charslice>(text: S, start: isize, end: isize) -> String { "charslice: if 'end' is negative, the magnitude may not exceed the length" ); let new_end = (charcount as isize + end) as usize; - assert!( - new_end >= from, - "charslice: 'start' may not be before 'end' (end was negative)" - ); + assert!(new_end >= from, "charslice: 'start' may not be before 'end' (end was negative)"); length = new_end - from; } else { assert!( diff --git a/src/mango/util/strtype/msg.rs b/src/mango/util/strtype/msg.rs index 4f20b0ae..d2e2f28b 100644 --- a/src/mango/util/strtype/msg.rs +++ b/src/mango/util/strtype/msg.rs @@ -34,9 +34,7 @@ impl StrType for Msg { fn validate(msg: &str) -> Result<(), Msg> { if !VALID_MESSAGE.is_match(&msg.to_string()) { // Make sure this is a valid string, otherwise it causes an infinite loop making error messages for it! - return Err(Msg::from_valid( - "Messages should consist of printable text.", - )); + return Err(Msg::from_valid("Messages should consist of printable text.")); } Ok(()) } diff --git a/src/mango/util/strtype/name.rs b/src/mango/util/strtype/name.rs index 5940158a..23533c4c 100644 --- a/src/mango/util/strtype/name.rs +++ b/src/mango/util/strtype/name.rs @@ -8,8 +8,7 @@ use string_interner::StringInterner; const VALID_IDENTIFIER_SUBPATTERN: &'static str = r"[a-zA-Z_][a-zA-Z0-9_]*"; lazy_static! { - static ref VALID_IDENTIFIER: Regex = - Regex::new(&format!("{}{}{}", r"^", VALID_IDENTIFIER_SUBPATTERN, r"$")).unwrap(); + static ref VALID_IDENTIFIER: Regex = Regex::new(&format!("{}{}{}", r"^", VALID_IDENTIFIER_SUBPATTERN, r"$")).unwrap(); } // TODO: this alias just for https://github.com/rust-lang-nursery/rustfmt/issues/2610 @@ -33,12 +32,7 @@ impl Name { pub fn value(&self) -> String { // Unwrap only fails if another thread panicked while locking, which shouldn't happen. // todo: I want this to return &str but that'd need the interner to be borrowed longer - INTERNER - .lock() - .unwrap() - .resolve(self.name_id) - .unwrap() - .to_string() + INTERNER.lock().unwrap().resolve(self.name_id).unwrap().to_string() } /// Generate an eager subpattern to match names, that can be composed in a regular expression. @@ -50,11 +44,7 @@ impl Name { impl fmt::Display for Name { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Use interner directly instead of .value(), because that creates a copy - write!( - f, - "{}", - INTERNER.lock().unwrap().resolve(self.name_id).unwrap() - ) + write!(f, "{}", INTERNER.lock().unwrap().resolve(self.name_id).unwrap()) } } @@ -73,9 +63,7 @@ impl StrType for Name { fn validate(name: &str) -> Result<(), Msg> { match name.chars().next() { Some(chr) => if chr.is_digit(10) { - return Err(Msg::from_valid( - "Identifier names may not start with a digit.", - )); + return Err(Msg::from_valid("Identifier names may not start with a digit.")); }, None => return Ok(()), // empty string } @@ -169,13 +157,7 @@ mod tests { #[test] fn test_name_interning() { - assert_eq!( - Name::copy_new("Hello").unwrap(), - Name::copy_new("Hello").unwrap() - ); - assert_ne!( - Name::copy_new("Hello").unwrap(), - Name::copy_new("Goodbye").unwrap() - ); + assert_eq!(Name::copy_new("Hello").unwrap(), Name::copy_new("Hello").unwrap()); + assert_ne!(Name::copy_new("Hello").unwrap(), Name::copy_new("Goodbye").unwrap()); } } From 6dd62f7d9e38f19d43a541a2d5b2f894ab8246eb Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 20 Jun 2018 21:10:48 +0200 Subject: [PATCH 49/49] Resolve merge problems #52 --- dev/playground/src/{hashin.rs => hashing.rs} | 49 -------------------- src/lib.rs | 3 -- 2 files changed, 52 deletions(-) rename dev/playground/src/{hashin.rs => hashing.rs} (53%) diff --git a/dev/playground/src/hashin.rs b/dev/playground/src/hashing.rs similarity index 53% rename from dev/playground/src/hashin.rs rename to dev/playground/src/hashing.rs index 4acbdbc9..566e70ef 100644 --- a/dev/playground/src/hashin.rs +++ b/dev/playground/src/hashing.rs @@ -29,7 +29,6 @@ impl AnyHasher for H { } } -// TODO: but now I want this not for everything impl MyTrait for T { fn as_any(&self) -> &Any { self as &Any @@ -41,9 +40,6 @@ impl MyTrait for T { } } -//impl MyTrait for A {} -//impl MyTrait for B {} - impl Hash for MyTrait { fn hash(&self, hasher: &mut H) { self.my_hash(hasher) @@ -57,48 +53,3 @@ fn main() { let x: &MyTrait = &A(1); x.hash(&mut hasher); } - - -//trait PreS: Debug {} -// -//trait HasherAsAny { -// fn as_any(&self) -> &Any; -//} -// -//trait PostS { -// fn as_any(&self) -> &Any; -// -// fn _hash(&self, hasher: H); -//} -// -//impl HasherAsAny for T { -// fn as_any(&self) -> &Any { -// self as &Any -// } -//} -// -//impl PostS for T { -// fn as_any(&self) -> &Any { -// self as &Any -// } -// -// fn _hash(&self, hasher: H) { -// self.as_any().downcast_ref::().hash(hasher) -// } -//} -// -//impl PreS for A {} -// -//impl PreS for B {} -// -//impl Hash for PostS { -// fn hash(&self, hasher: &mut HasherAsAny) { -// self._hash(hasher.as_any().downcast_ref::()) -// } -//} -// -//fn main() { -// let x: &PostS = &A(1); -// let m = HashMap::new(); -// m.insert(x, 0); -//} diff --git a/src/lib.rs b/src/lib.rs index b6a06653..1ff780b3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,5 @@ #![feature(nll)] //#![feature(generators, generator_trait)] - #![feature(proc_macro, wasm_custom_section, wasm_import_module)] extern crate core; extern crate wasm_bindgen; @@ -13,9 +12,7 @@ extern crate derive_new; pub mod mango { // Utilities - pub mod cli; pub mod io; - pub mod jit; pub mod ui; pub mod util;