From e6610704d1ad181d5f6800cd7a98dcf35b065a21 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 15:43:19 +0200 Subject: [PATCH] Implement the combi-lexer #52 --- src/mango/lexing/code_lexer.rs | 36 +++---- src/mango/lexing/combi_lexer.rs | 149 +++++++++++++++++++++++++++++ src/mango/lexing/gen_code_lexer.rs | 1 + src/mango/lexing/mod.rs | 12 ++- src/mango/lexing/string_lexer.rs | 6 +- src/mango/lexing/typ.rs | 18 +++- 6 files changed, 187 insertions(+), 35 deletions(-) create mode 100644 src/mango/lexing/combi_lexer.rs diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index ddc06f3f..06fda0f7 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -17,6 +17,8 @@ use mango::token::Tokens; use mango::util::collection::Queue; use std::cell::RefCell; use std::rc::Rc; +use mango::lexing::typ::SubLexer; +use mango::lexing::typ::SubLexerResult; // TODO: Preferably there'd be only one Lexer at a time which has a Reader, but I did not get that to work, // TODO: see this SO question: https://stackoverflow.com/questions/50535022/borrow-checker-problems-for-parser-that-can-delegate @@ -30,7 +32,6 @@ pub struct CodeLexer { // reader: Rc>, indent: i32, - reader: Rc>, // This delegate deals with nested structures, like string literals and comments. reader_or_delegate: ReaderOrDelegate, // This is unfortunate, would not be needed with 'yield' but is now for indents. @@ -38,16 +39,15 @@ pub struct CodeLexer { } impl CodeLexer { - pub fn new(reader: Rc>) -> Self { + pub fn new() -> Self { CodeLexer { - reader: reader, reader_or_delegate: ReaderOrDelegate::Reader(), indent: 0, buffer: Queue::new(), } } - fn lex_indents(&mut self) -> MaybeToken { + fn lex_indents(&mut self, reader: Box) -> MaybeToken { let mut line_indent = 0; while let Match(_) = self.reader.borrow_mut().matches("\\t") { line_indent += 1; @@ -73,10 +73,10 @@ impl CodeLexer { } } -impl Lexer for CodeLexer { +impl SubLexer for CodeLexer { // TODO: TURN THIS AROUND: MAKE A FUNCTION THAT RETURNS FROM A QUEUE, AND CALLS ANOTHER TO FILL THE QUEUE IF NO RETURN - fn lex(&mut self) -> MaybeToken { + fn lex_pass(&mut self, reader: Box) -> SubLexerResult { use self::MaybeToken::*; // If currently delegating to a sub-lexer, return from that. @@ -214,12 +214,12 @@ impl Lexer for CodeLexer { } } - fn get_reader(&self) -> Rc> { - match self.reader_or_delegate { - ReaderOrDelegate::Reader() => self.reader.clone(), - ReaderOrDelegate::Delegate(ref delegate) => delegate.get_reader(), - } - } +// fn get_reader(&self) -> Rc> { +// match self.reader_or_delegate { +// ReaderOrDelegate::Reader() => self.reader.clone(), +// ReaderOrDelegate::Delegate(ref delegate) => delegate.get_reader(), +// } +// } } #[cfg(test)] @@ -239,7 +239,6 @@ mod tests { use mango::token::tokens::StartBlockToken; use mango::token::Tokens; use std::cell::RefCell; - use std::ops::Generator; use std::rc::Rc; fn assert_text_to_tokens(text: &str, tokens: Vec) { @@ -287,15 +286,4 @@ mod tests { #[test] fn test_lexing_delegation() {} - - #[test] - fn generators() { - let mut gen = || { - yield Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()); - yield Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()); - yield Tokens::Association(AssociationToken::from_unprefixed()); - return; - }; - let first = unsafe { gen.resume() }; - } } diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs new file mode 100644 index 00000000..9d85aa36 --- /dev/null +++ b/src/mango/lexing/combi_lexer.rs @@ -0,0 +1,149 @@ +use mango::io::typ::Reader; +use mango::io::typ::ReaderResult::*; +use mango::lexing::code_lexer::CodeLexer; +use mango::lexing::string_lexer::StringLexer; +use mango::lexing::typ::Lexer; +use mango::lexing::typ::SubLexer; +use mango::lexing::typ::MaybeToken; +use mango::lexing::typ::SubLexerResult; +use mango::token::special::UnlexableToken; +use mango::token::Tokens; +use mango::token::tokens::AssociationToken; +use mango::token::tokens::EndBlockToken; +use mango::token::tokens::EndStatementToken; +use mango::token::tokens::IdentifierToken; +use mango::token::tokens::KeywordToken; +use mango::token::tokens::OperatorToken; +use mango::token::tokens::ParenthesisCloseToken; +use mango::token::tokens::ParenthesisOpenToken; +use mango::token::tokens::StartBlockToken; +use mango::util::collection::Queue; +use mango::util::collection::Stack; +use std::cell::RefCell; +use std::rc::Rc; + + +pub struct CombiLexer { + reader: Box, + lexers: Stack>, + buffer: Queue, +} + +impl CombiLexer { + pub fn new(reader: Box) -> Self { + let mut lexers: Stack> = Stack::new(); + lexers.push(Box::new(CodeLexer::new())); + CombiLexer { + reader: reader, + lexers: lexers, + buffer: Queue::new(), + } + } +} + +impl Lexer for CombiLexer { + fn lex(&mut self) -> MaybeToken { + + // If there are tokens in the buffer, return from there; + if let Option::Some(token) = self.buffer.pop() { + return MaybeToken::Token(token); + } + + match self.lexers.borrow_mut() { + // No more lexers to delegate to; lexing is finished. + Option::None => MaybeToken::End, + Option::Some(ref mut lexer) => { + match lexer.lex_pass(self.reader) { + SubLexerResult::Tokens(tokens) => { + if tokens.len() > 0 { + // The sublexer produced tokens, queue them. + self.buffer.append(tokens); + self.lex() // TODO: if every branch does this, move it down + } else { + // No tokens were produced; make sure the reader has advanced to prevent infinite loops. + // TODO: check reader state + self.lex() + } + }, + SubLexerResult::Delegate(lexer) => { + // Switch to a different delegate lexer. + self.lexers.push(lexer); + self.lex() + }, + SubLexerResult::End => { + // The sublexer is done, remove it from the stack and continue with the next. + self.lexers.pop(); // This needs non-lexical lifetimes + self.lex() + }, + } + } + } + } + +} + +#[cfg(test)] +mod tests { + use mango::io::fortest::StringReader; + use mango::lexing::util::lex_all::{lex_all, LexList}; + use mango::token::Tokens; + use mango::token::tokens::AssociationToken; + use mango::token::tokens::EndBlockToken; + use mango::token::tokens::EndStatementToken; + use mango::token::tokens::IdentifierToken; + use mango::token::tokens::KeywordToken; + use mango::token::tokens::LiteralToken; + use mango::token::tokens::OperatorToken; + use mango::token::tokens::ParenthesisCloseToken; + use mango::token::tokens::ParenthesisOpenToken; + use mango::token::tokens::StartBlockToken; + use std::cell::RefCell; + use std::rc::Rc; + use super::CombiLexer; + + fn assert_text_to_tokens(text: &str, tokens: Vec) { + assert_eq!( + LexList::from_tokens(tokens), + lex_all(&mut CombiLexer::new(Rc::new(RefCell::new( + StringReader::new(text.to_owned()) + )))) + ) + } + + #[test] + fn test_lexing_individual() { + assert_text_to_tokens( + "if", + vec![Tokens::Keyword( + KeywordToken::from_str("if".to_owned()).unwrap(), + )], + ); + // todo: more + } + + #[test] + fn test_lexing_combined() { + assert_text_to_tokens( + "let x = 0\nfor x < 128\n\tx += 1", + vec![ + Tokens::Keyword(KeywordToken::from_str("let".to_owned()).unwrap()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_unprefixed()), + Tokens::Literal(LiteralToken::Int(0)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::Keyword(KeywordToken::from_str("for".to_owned()).unwrap()), + Tokens::Operator(OperatorToken::from_str("<").unwrap()), + Tokens::Literal(LiteralToken::Int(128)), + Tokens::EndStatement(EndStatementToken::new_end_line()), + Tokens::StartBlock(StartBlockToken::new()), + Tokens::Identifier(IdentifierToken::from_str("x".to_owned()).unwrap()), + Tokens::Association(AssociationToken::from_str("+".to_owned()).unwrap()), + Tokens::Literal(LiteralToken::Int(1)), + Tokens::EndBlock(EndBlockToken::new(true, false)), + ], + ); + } + + #[test] + fn test_lexing_delegation() {} +} diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs index 655ad282..35d9737c 100644 --- a/src/mango/lexing/gen_code_lexer.rs +++ b/src/mango/lexing/gen_code_lexer.rs @@ -42,6 +42,7 @@ use mango::util::strslice::slice::glyphat; // TODO: this is problematic because the generator wants references to the container, // TODO: and the container obviously stores the generator +// TODO: use generator: Box> directory struct CodeLexer> { indent: i32, delegate: Option>, diff --git a/src/mango/lexing/mod.rs b/src/mango/lexing/mod.rs index 254d9a1c..9d251b70 100644 --- a/src/mango/lexing/mod.rs +++ b/src/mango/lexing/mod.rs @@ -1,9 +1,11 @@ -pub mod typ; +mod typ; -pub mod code_lexer; +mod combi_lexer; -pub mod comment_lexer; +mod code_lexer; -pub mod string_lexer; +mod comment_lexer; -pub mod util; +mod string_lexer; + +mod util; diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs index 8e4adc83..18d313d7 100644 --- a/src/mango/lexing/string_lexer.rs +++ b/src/mango/lexing/string_lexer.rs @@ -42,7 +42,7 @@ impl Lexer for StringLexer { } } - fn get_reader(&self) -> Rc> { - self.reader.clone() - } +// fn get_reader(&self) -> Rc> { +// self.reader.clone() +// } } diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs index 8ea53ba5..ee98c1fc 100644 --- a/src/mango/lexing/typ.rs +++ b/src/mango/lexing/typ.rs @@ -1,7 +1,19 @@ use mango::io::typ::Reader; use mango::token::Tokens; -use std::cell::RefCell; -use std::rc::Rc; + +// TODO: I don't want this to be public outside the crate +pub enum SubLexerResult { + Tokens(Vec), + Delegate(Box), + End, +} + +// TODO: I don't want this to be public outside the crate +pub trait SubLexer { + /// Does one iteration of a sublexer, which should either delegate or return tokens. + /// If an empty vector of tokens is returned, the reader should have advanced (to prevent infinite loops). + fn lex_pass(&mut self, reader: Box) -> SubLexerResult; +} pub enum MaybeToken { Token(Tokens), @@ -17,5 +29,5 @@ pub trait Lexer { /// Every call to lex returns a token until the end of the input. fn lex(&mut self) -> MaybeToken; - fn get_reader(&self) -> Rc>; +// fn get_reader(&self) -> Rc>; }