From 03b152a090346de3faab1211f20d0c3498fdecf7 Mon Sep 17 00:00:00 2001
From: Mark
Date: Tue, 22 May 2018 21:44:11 +0200
Subject: [PATCH] Progress on sublexers and new tokens, fighting borrow rules #56

---
 src/mango/lexing/code_lexer.rs    | 184 ++++++++++++++++++------
 src/mango/lexing/mod.rs           |   2 +
 src/mango/lexing/string_lexer.rs  |  45 ++++++++
 src/mango/lexing/typ.rs           |   7 +-
 src/mango/token/tokens/literal.rs |   4 +
 5 files changed, 170 insertions(+), 72 deletions(-)
 create mode 100644 src/mango/lexing/string_lexer.rs

diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs
index 00eda5b9..1fc8bb29 100644
--- a/src/mango/lexing/code_lexer.rs
+++ b/src/mango/lexing/code_lexer.rs
@@ -1,5 +1,6 @@
 use mango::io::typ::Reader;
 use mango::io::typ::ReaderResult::*;
+use mango::lexing::string_lexer::StringLexer;
 use mango::lexing::typ::Lexer;
 use mango::lexing::typ::MaybeToken;
 use mango::token::special::UnlexableToken;
@@ -13,31 +14,49 @@ use mango::token::tokens::StartBlockToken;
 use mango::token::Tokens;
 use mango::util::collection::Queue;
 
-pub struct CodeLexer<'r> {
-    reader: &'r mut Reader,
+enum ReaderOrDelegate {
+    Reader(Box<Reader>),
+    Delegate(Box<Lexer>),
+}
+
+impl ReaderOrDelegate {
+    fn end_delegation(&mut self) {
+        *self = match self {
+            ReaderOrDelegate::Delegate(delegate) => ReaderOrDelegate::Reader(delegate.consume()),
+            ReaderOrDelegate::Reader(reader) => ReaderOrDelegate::Reader(*reader),
+        }
+    }
+}
+
+pub struct CodeLexer {
+    // reader: Option<&'r mut Reader>,
     indent: i32,
+    // TODO: both of the next two would be unnecessary with generators...
+    // This delegate deals with nested structures, like string literals and comments.
+    // delegate: Option<&'r mut Lexer<'r>>,
+    reader_or_delegate: ReaderOrDelegate,
     // This is unfortunate, would not be needed with 'yield' but is now for indents.
     buffer: Queue,
 }
 
-impl<'r> CodeLexer<'r> {
-    fn new(reader: &'r mut Reader) -> Self {
+impl CodeLexer {
+    fn new(reader: Box<Reader>) -> Self {
         CodeLexer {
-            reader,
+            reader_or_delegate: ReaderOrDelegate::Reader(reader),
             indent: 0,
             buffer: Queue::new(),
         }
     }
 
-    fn lex_indents(&mut self) -> MaybeToken {
+    fn lex_indents(&mut self, reader: &mut Box<Reader>) -> MaybeToken {
         let mut line_indent = 0;
-        while let Match(_) = self.reader.matches("\\t") {
+        while let Match(_) = reader.matches("\\t") {
             line_indent += 1;
         }
         for _ in line_indent..self.indent {
             // This line is dedented, make end tokens.
             // TODO: turn this "new" into a constant
-            if let Match(_) = self.reader.matches("end") {
+            if let Match(_) = reader.matches("end") {
                 // If this is followed by an 'end' keyword, then that 'end' is redundant.
                 self.buffer
                     .push(Tokens::EndBlock(EndBlockToken::new(true, true)));
@@ -55,74 +74,99 @@ impl<'r> CodeLexer<'r> {
     }
 }
 
-impl<'r> Lexer<'r> for CodeLexer<'r> {
+impl Lexer for CodeLexer {
     fn lex(&mut self) -> MaybeToken {
         use self::MaybeToken::*;
 
-        // If there is a buffer due to indentation or continuations, return from that.
-        if let Some(token) = self.buffer.pop() {
-            return Token(token);
-        }
-        // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon.
-        if let Match(_) = self.reader.matches("\\.\\.\\.") {
-            // Line continuation has no token, it just continues on the next line.
-            if let Match(_) = self.reader.matches("\\n\\r?") {
-                // There should always be a newline after continuations, so that they can be ignored together.
-            } else if let Match(word) = self.reader.matches("[^\\n]*\\n\\r?") {
-                return Token(Tokens::Unlexable(UnlexableToken::new(word)));
-            } else {
-                // TODO: I don't know yet how to deal with ... followed by end-of-file
-                panic!()
-            }
-            // This is a new line, so there may be indents.
-            return self.lex_indents();
-        }
-        if let Match(_) = self.reader.matches("\\n\\r?") {
-            // Newline WITHOUT line continuation.
-            return Token(Tokens::EndStatement(EndStatementToken::new_end_line()));
-        }
-        if let Match(_) = self.reader.matches(";") {
-            // Semicolon, which ends a statement.
-            // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
-            self.buffer
-                .push(Tokens::EndStatement(EndStatementToken::new_semicolon()));
-            if let Match(_) = self.reader.matches("\\n\\r?") {
-                // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
-                // This will return the queue of tokens, including the semicolon.
-                return self.lex_indents();
+        // If currently delegating to a sub-lexer, return from that.
+        match self.reader_or_delegate {
+            ReaderOrDelegate::Delegate(ref mut delegate) => {
+                match delegate.lex() {
+                    Token(token) => Token(token),
+                    End => self.lex(),
+                }
+                // Code to stop delegation cannot be here, because `self` is still mutably borrowed through `delegate`
             }
-            // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
-            return Token(self.buffer.pop().unwrap());
-        }
-        //
-        // Indentation done; do the rest of lexing.
-        //
-        // Parse identifers and keywords. This assumes that keywords are a subset of identifiers.
-        if let Match(word) = self.reader.matches(IdentifierToken::subpattern()) {
-            // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
-            if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
-                return Token(Tokens::Keyword(keyword));
+            ReaderOrDelegate::Reader(mut reader) => {
+                // todo: maybe this branch could be a separate function?
+
+                // If there is a buffer due to indentation or continuations, return from that.
+                if let Some(token) = self.buffer.pop() {
+                    return Token(token);
+                }
+                // Past this point, we assume that the buffer is empty. When adding stuff, pop it or re-enter lex() soon.
+                if let Match(_) = reader.matches("\\.\\.\\.") {
+                    // Line continuation has no token, it just continues on the next line.
+                    if let Match(_) = reader.matches("\\n\\r?") {
+                        // There should always be a newline after continuations, so that they can be ignored together.
+                    } else if let Match(word) = reader.matches("[^\\n]*\\n\\r?") {
+                        return Token(Tokens::Unlexable(UnlexableToken::new(word)));
+                    } else {
+                        // TODO: I don't know yet how to deal with ... followed by end-of-file
+                        panic!()
+                    }
+                    // This is a new line, so there may be indents.
+                    return self.lex_indents(&mut reader);
+                }
+                if let Match(_) = reader.matches("\\n\\r?") {
+                    // Newline WITHOUT line continuation.
+                    return Token(Tokens::EndStatement(EndStatementToken::new_end_line()));
+                }
+                if let Match(_) = reader.matches(";") {
+                    // Semicolon, which ends a statement.
+                    // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
+                    self.buffer
+                        .push(Tokens::EndStatement(EndStatementToken::new_semicolon()));
+                    if let Match(_) = reader.matches("\\n\\r?") {
+                        // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
+                        // This will return the queue of tokens, including the semicolon.
+                        return self.lex_indents(&mut reader);
+                    }
+                    // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
+                    return Token(self.buffer.pop().unwrap());
+                }
+                //
+                // Indentation done; do the rest of lexing.
+                //
+                // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
+                if let Match(word) = reader.matches(IdentifierToken::subpattern()) {
+                    // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
+                    if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
+                        return Token(Tokens::Keyword(keyword));
+                    }
+                    return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap()));
+                }
+                // Literal
+                if let Match(word) = reader.matches("[a-z]?\"") {
+                    // TODO: need to keep delegating to this until it exhausts, how to do that?
+                    self.reader_or_delegate = ReaderOrDelegate::Delegate(Box::new(
+                        StringLexer::new_double_quoted(reader),
+                    ));
+                    return self.lex();
+                }
+                // Operator
+                // todo
+                // Association
+                // todo
+                // Grouping symbols
+                if let Match(_) = reader.matches("(") {
+                    return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new()));
+                }
+                if let Match(_) = reader.matches(")") {
+                    return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new()));
+                }
+
+                // TODO: specify the unlexable word
+                return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned())));
            }
-            return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap()));
-        }
-        // Literal
-        // todo
-        // if let Match(word) = self.reader.matches(LiteralToken::subpattern()) {
-        //     return Token(LiteralToken::Literal(IdentifierToken::from_str(word).unwrap()));
-        // }
-        // Operator
-        // todo
-        // Association
-        // todo
-        // Grouping symbols
-        if let Match(_) = self.reader.matches("(") {
-            return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new()));
-        }
-        if let Match(_) = self.reader.matches(")") {
-            return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new()));
         }
+    }
 
-        // TODO: specify the unlexable word
-        return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned())));
+    fn consume(self) -> Box<Reader> {
+        assert!(false, "I do not think this is ever called, is it?");
+        match self.reader_or_delegate {
+            ReaderOrDelegate::Reader(reader) => reader,
+            ReaderOrDelegate::Delegate(delegate) => delegate.consume(),
+        }
     }
 }
diff --git a/src/mango/lexing/mod.rs b/src/mango/lexing/mod.rs
index 7becc2f3..bb88a815 100644
--- a/src/mango/lexing/mod.rs
+++ b/src/mango/lexing/mod.rs
@@ -3,3 +3,5 @@ pub mod typ;
 pub mod code_lexer;
 
 pub mod comment_lexer;
+
+pub mod string_lexer;
diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs
new file mode 100644
index 00000000..d3f0cf2b
--- /dev/null
+++ b/src/mango/lexing/string_lexer.rs
@@ -0,0 +1,45 @@
+use mango::io::typ::Reader;
+use mango::io::typ::ReaderResult::*;
+use mango::lexing::typ::Lexer;
+use mango::lexing::typ::MaybeToken;
+use mango::token::tokens::LiteralToken;
+use mango::token::Tokens;
+
+pub enum StringType {
+    SingleQuotedInline,
+    DoubleQuotedInline,
+    MultiLine,
+}
+
+/// Lexes a string literal token.
+// Starts after the opening quote and is expected to consume until the closing quote.
+pub struct StringLexer {
+    reader: Box<Reader>,
+    typ: StringType,
+}
+
+impl StringLexer {
+    // TODO: support other types of strings
+    pub fn new_double_quoted(reader: Box<Reader>) -> Self {
+        StringLexer {
+            reader,
+            typ: StringType::DoubleQuotedInline,
+        }
+    }
+}
+
+impl Lexer for StringLexer {
+    fn lex(&mut self) -> MaybeToken {
+        // TODO: doesn't handle escaping etc at all now
+        // TODO: this is going to have a problem if `matches` automatically eats whitespace
+        match self.reader.matches("[^\"\\n]*") {
+            Match(value) => return MaybeToken::Token(Tokens::Literal(LiteralToken::string(value))),
+            NoMatch() => panic!("failed to parse string"), // This can't really go wrong since empty pattern matches
+            EOF() => return MaybeToken::Token(Tokens::Literal(LiteralToken::string("".to_owned()))), // Unclosed string literal, let code parser deal with it
+        }
+    }
+
+    fn consume(self) -> Box<Reader> {
+        self.reader
+    }
+}
diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs
index 39295f76..063389e4 100644
--- a/src/mango/lexing/typ.rs
+++ b/src/mango/lexing/typ.rs
@@ -1,14 +1,17 @@
+use mango::io::typ::Reader;
 use mango::token::Tokens;
 
 pub enum MaybeToken {
     Token(Tokens),
-    End(),
+    End,
 }
 
-pub trait Lexer<'r> {
+pub trait Lexer {
     // /// Create a new lexer from a reader instance.
     // fn new(reader: &'r mut Reader) -> Self;
 
     /// Every call to lex returns a token until the end of the input.
     fn lex(&mut self) -> MaybeToken;
+
+    fn consume(self) -> Box<Reader>;
 }
diff --git a/src/mango/token/tokens/literal.rs b/src/mango/token/tokens/literal.rs
index a0953882..2065dcda 100644
--- a/src/mango/token/tokens/literal.rs
+++ b/src/mango/token/tokens/literal.rs
@@ -25,6 +25,10 @@ impl LiteralToken {
     pub fn real(value: f64) -> LiteralToken {
         LiteralToken::Real(f64eq::new(value))
     }
+
+    pub fn subpattern_int() -> &'static str {
+        "[a-z]?\""
+    }
 }
 
 impl ToText for LiteralToken {
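
A note on the borrow fights this patch runs into: `end_delegation()` cannot move the `Box<Reader>` out of `*self` through `ReaderOrDelegate::Reader(*reader)`, and `lex()` cannot take `self.reader_or_delegate` by value while `self` is still borrowed. Below is a minimal standalone sketch of one common workaround, swapping in a placeholder variant with `std::mem::replace` so the old value can be consumed by value. It is an illustration under assumptions, not part of the patch: `DummyReader`, `DummyLexer`, and the `Swapping` variant are hypothetical stand-ins, and `consume` takes `self: Box<Self>` here so the trait stays object-safe.

// Hypothetical sketch only -- not code from the patch above.
use std::mem;

trait Reader {
    fn read(&mut self) -> Option<String>;
}

trait Lexer {
    // `self: Box<Self>` keeps the trait object-safe while still consuming the lexer.
    fn consume(self: Box<Self>) -> Box<dyn Reader>;
}

struct DummyReader;
impl Reader for DummyReader {
    fn read(&mut self) -> Option<String> {
        None
    }
}

struct DummyLexer {
    reader: Box<dyn Reader>,
}
impl Lexer for DummyLexer {
    fn consume(self: Box<Self>) -> Box<dyn Reader> {
        self.reader
    }
}

enum ReaderOrDelegate {
    Reader(Box<dyn Reader>),
    Delegate(Box<dyn Lexer>),
    // Transient placeholder, only present while the value is being swapped out.
    Swapping,
}

impl ReaderOrDelegate {
    fn end_delegation(&mut self) {
        // Take ownership of the current value, leaving the placeholder behind,
        // so the Delegate arm can call the by-value `consume` without moving
        // out of a `&mut` borrow.
        let current = mem::replace(self, ReaderOrDelegate::Swapping);
        *self = match current {
            ReaderOrDelegate::Delegate(delegate) => ReaderOrDelegate::Reader(delegate.consume()),
            other => other,
        };
    }
}

fn main() {
    let mut state = ReaderOrDelegate::Delegate(Box::new(DummyLexer {
        reader: Box::new(DummyReader),
    }));
    state.end_delegation();
    assert!(matches!(state, ReaderOrDelegate::Reader(_)));
    println!("delegation ended, reader handed back to the owner");
}

The same swap-first pattern (or wrapping the field in an `Option` and using `take()`) would also let `lex()` decide on the next state inside the match and assign it afterwards, instead of reassigning `self.reader_or_delegate` while it is still borrowed.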