From 10650d5fb0b53ba433dca7ac76f1512963e6eb96 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 17 Jun 2018 17:20:04 +0200 Subject: [PATCH] More code lexing functionality reactivated #52 --- src/mango/lexing/code_lexer.rs | 199 +++++++++++++++-------------- src/mango/lexing/combi_lexer.rs | 25 ++-- src/mango/lexing/string_lexer.rs | 6 +- src/mango/lexing/typ.rs | 6 + src/mango/util/collection/queue.rs | 1 - src/mango/util/strslice/slice.rs | 2 +- 6 files changed, 121 insertions(+), 118 deletions(-) diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index de2d3d09..c92c2026 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -6,7 +6,6 @@ use mango::lexing::typ::MaybeToken; use mango::lexing::typ::SubLexer; use mango::lexing::typ::SubLexerResult; use mango::token::special::UnlexableToken; -use mango::token::Tokens; use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; @@ -16,6 +15,7 @@ use mango::token::tokens::OperatorToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; +use mango::token::Tokens; use mango::util::collection::Queue; use std::cell::RefCell; use std::rc::Rc; @@ -61,6 +61,13 @@ impl CodeLexer { self.indent = line_indent; tokens } + + fn token_and_indents(&mut self, reader: &mut Box, token: Tokens) -> SubLexerResult { + let mut tokens: Vec = vec![token]; + // This is a new line, so there may be indents. + tokens.append(&mut self.lex_indents(reader)); + return SubLexerResult::Result(tokens); + } } impl SubLexer for CodeLexer { @@ -70,115 +77,109 @@ impl SubLexer for CodeLexer { // TODO: put all these match results inline // End of line continuation - let continue_match_res = reader.matches(r"\.\.\."); - if let Match(_) = continue_match_res { + if let Match(_) = reader.matches(r"\.\.\.") { // Line continuation has no token, it just continues on the next line, ignoring indents (for now). - let newline_match_res = reader.matches(r"\n\r?\t*"); - if let Match(_) = newline_match_res { + if let Match(_) = reader.matches(r"\n\r?\t*") { // There should always be a newline after continuations, so that they can be ignored together. } else { // The rest of this line is unparsable. - let newline_match_res = reader.matches("[^\\n]*\\n\\r?"); - if let Match(word) = newline_match_res { - let mut res: Vec = vec![Tokens::Unlexable(UnlexableToken::new(word))]; + if let Match(word) = reader.matches("[^\\n]*\\n\\r?") { // This is a new line, so there may be indents. - res.append(&mut self.lex_indents(reader)); - return Result(res); + return self + .token_and_indents(reader, Tokens::Unlexable(UnlexableToken::new(word))); } else { // TODO: I don't know yet how to deal with '...' followed by end-of-file panic!() } } } + // Newlines + if let Match(_) = reader.matches("\\n\\r?") { + // Newline WITHOUT line continuation. + // This is a new line, so there may be indents. + return self.token_and_indents( + reader, + Tokens::EndStatement(EndStatementToken::new_end_line()), + ); + } + // End of statement + if let Match(_) = reader.matches(";") { + // Semicolon, which ends a statement. + if let Match(_) = reader.matches("\\n\\r?") { + // If semicolon is followed by a newline, it is redundant. Deal with indents (but ignore the newline itself). + return self.token_and_indents( + reader, + Tokens::EndStatement(EndStatementToken::new_semicolon()), + ); + } else { + return SubLexerResult::single(Tokens::EndStatement( + EndStatementToken::new_semicolon(), + )); + } + } + // + // Indentation done; do the rest of lexing. + // + // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. + if let Match(word) = reader.matches(IdentifierToken::subpattern()) { + // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + return SubLexerResult::single(Tokens::Keyword(keyword)); + } + return SubLexerResult::single(Tokens::Identifier( + IdentifierToken::from_str(word).unwrap(), + )); + } + // // Literal + // let string_match_res = reader.matches("[a-z]?\""); + // if let Match(_) = string_match_res { + // let sublexer: Box = + // Box::new(StringLexer::new_double_quoted(self.reader.clone())); + // self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); + // return self.lex(); + // } + // // Association (before operator) + // let association_match_res = self + // .reader + // .borrow_mut() + // .matches(&AssociationToken::subpattern()); + // if let Match(token) = association_match_res { + // if token.chars().last().unwrap() == '=' { + // // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); + // return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO + // } else { + // return Token(Tokens::Association(AssociationToken::from_unprefixed())); + // } + // } + // // Operator + // let operator_match_res = self + // .reader + // .borrow_mut() + // .matches(OperatorToken::subpattern()); + // if let Match(token) = operator_match_res { + // return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); + // } + // // Grouping symbols + // if let Match(_) = reader.matches(r"\(") { + // return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + // } + // if let Match(_) = reader.matches(r"\)") { + // return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); + // } + // + // let unknown_word = reader.matches("[^\\s]+"); + // match unknown_word { + // Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), + // NoMatch() => { + // println!("END {:?}", self.reader.borrow()); // TODO + // panic!("Do not know how to proceed with parsing") + // } + // EOF() => { + // // TODO: also dedent and end statement here + // End + // } + // } - panic!(); -// let newline_match_res = reader.matches("\\n\\r?"); -// if let Match(_) = newline_match_res { -// // Newline WITHOUT line continuation. -// // This is a new line, so there may be indents. -// self.buffer -// .push(Tokens::EndStatement(EndStatementToken::new_end_line())); -// self.lex_indents(); -// return self.lex(); -// } -// let end_statement_match_res = reader.matches(";"); -// if let Match(_) = end_statement_match_res { -// // Semicolon, which ends a statement. -// // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. -// self.buffer -// .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); -// let end_line_match_res = reader.matches("\\n\\r?"); -// if let Match(_) = end_line_match_res { -// // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). -// // This will return the queue of tokens, including the semicolon. -// return self.lex_indents(); -// } -// // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). -// return Token(self.buffer.pop().unwrap()); -// } -// // -// // Indentation done; do the rest of lexing. -// // -// // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. -// if let Match(word) = self -// .reader -// .borrow_mut() -// .matches(IdentifierToken::subpattern()) -// { -// // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... -// if let Ok(keyword) = KeywordToken::from_str(word.clone()) { -// return Token(Tokens::Keyword(keyword)); -// } -// return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); -// } -// // Literal -// let string_match_res = reader.matches("[a-z]?\""); -// if let Match(_) = string_match_res { -// let sublexer: Box = -// Box::new(StringLexer::new_double_quoted(self.reader.clone())); -// self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); -// return self.lex(); -// } -// // Association (before operator) -// let association_match_res = self -// .reader -// .borrow_mut() -// .matches(&AssociationToken::subpattern()); -// if let Match(token) = association_match_res { -// if token.chars().last().unwrap() == '=' { -// // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); -// return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO -// } else { -// return Token(Tokens::Association(AssociationToken::from_unprefixed())); -// } -// } -// // Operator -// let operator_match_res = self -// .reader -// .borrow_mut() -// .matches(OperatorToken::subpattern()); -// if let Match(token) = operator_match_res { -// return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap())); -// } -// // Grouping symbols -// if let Match(_) = reader.matches(r"\(") { -// return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); -// } -// if let Match(_) = reader.matches(r"\)") { -// return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); -// } -// -// let unknown_word = reader.matches("[^\\s]+"); -// match unknown_word { -// Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))), -// NoMatch() => { -// println!("END {:?}", self.reader.borrow()); // TODO -// panic!("Do not know how to proceed with parsing") -// } -// EOF() => { -// // TODO: also dedent and end statement here -// End -// } -// } + panic!() // TODO TMP } } diff --git a/src/mango/lexing/combi_lexer.rs b/src/mango/lexing/combi_lexer.rs index b0fc9b84..7c353ffb 100644 --- a/src/mango/lexing/combi_lexer.rs +++ b/src/mango/lexing/combi_lexer.rs @@ -3,11 +3,10 @@ use mango::io::typ::ReaderResult::*; use mango::lexing::code_lexer::CodeLexer; use mango::lexing::string_lexer::StringLexer; use mango::lexing::typ::Lexer; -use mango::lexing::typ::SubLexer; use mango::lexing::typ::MaybeToken; +use mango::lexing::typ::SubLexer; use mango::lexing::typ::SubLexerResult; use mango::token::special::UnlexableToken; -use mango::token::Tokens; use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; @@ -17,12 +16,12 @@ use mango::token::tokens::OperatorToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; +use mango::token::Tokens; use mango::util::collection::Queue; use mango::util::collection::Stack; use std::cell::RefCell; use std::rc::Rc; - pub struct CombiLexer { reader: Box, lexers: Stack>, @@ -43,7 +42,6 @@ impl CombiLexer { impl Lexer for CombiLexer { fn lex(&mut self) -> MaybeToken { - // If there are tokens in the buffer, return from there; if let Option::Some(token) = self.buffer.pop() { return MaybeToken::Token(token); @@ -64,29 +62,28 @@ impl Lexer for CombiLexer { // TODO: check reader state self.lex() } - }, + } SubLexerResult::Delegate(lexer) => { // Switch to a different delegate lexer. self.lexers.push(lexer); self.lex() - }, + } SubLexerResult::End => { // The sublexer is done, remove it from the stack and continue with the next. - self.lexers.pop(); // This needs non-lexical lifetimes + self.lexers.pop(); // This needs non-lexical lifetimes self.lex() - }, + } } } } } - } #[cfg(test)] mod tests { + use super::CombiLexer; use mango::io::fortest::StringReader; use mango::lexing::util::lex_all::{lex_all, LexList}; - use mango::token::Tokens; use mango::token::tokens::AssociationToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; @@ -97,16 +94,16 @@ mod tests { use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; + use mango::token::Tokens; use std::cell::RefCell; use std::rc::Rc; - use super::CombiLexer; fn assert_text_to_tokens(text: &str, tokens: Vec) { assert_eq!( LexList::from_tokens(tokens), - lex_all(&mut CombiLexer::new(Box::new( - StringReader::new(text.to_owned()) - ))) + lex_all(&mut CombiLexer::new(Box::new(StringReader::new( + text.to_owned() + )))) ) } diff --git a/src/mango/lexing/string_lexer.rs b/src/mango/lexing/string_lexer.rs index 18d313d7..4a433a00 100644 --- a/src/mango/lexing/string_lexer.rs +++ b/src/mango/lexing/string_lexer.rs @@ -42,7 +42,7 @@ impl Lexer for StringLexer { } } -// fn get_reader(&self) -> Rc> { -// self.reader.clone() -// } + // fn get_reader(&self) -> Rc> { + // self.reader.clone() + // } } diff --git a/src/mango/lexing/typ.rs b/src/mango/lexing/typ.rs index 0aba1981..6911c479 100644 --- a/src/mango/lexing/typ.rs +++ b/src/mango/lexing/typ.rs @@ -8,6 +8,12 @@ pub enum SubLexerResult { End, } +impl SubLexerResult { + pub fn single(token: Tokens) -> Self { + SubLexerResult::Result(vec![token]) + } +} + // TODO: I don't want this to be public outside the crate pub trait SubLexer { /// Does one iteration of a sublexer, which should either delegate or return tokens. diff --git a/src/mango/util/collection/queue.rs b/src/mango/util/collection/queue.rs index 8eb43ec1..bd239bef 100644 --- a/src/mango/util/collection/queue.rs +++ b/src/mango/util/collection/queue.rs @@ -1,4 +1,3 @@ - /// A one-ended queue. See also [Stack]. /// This is just a wrapper around vec so nobody pushes or pops the wrong end. pub struct Queue { diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs index 95055439..60ab23a8 100644 --- a/src/mango/util/strslice/slice.rs +++ b/src/mango/util/strslice/slice.rs @@ -49,7 +49,7 @@ pub fn charsliceto>(text: S, end: isize) -> String { } pub fn glyphat>(text: S, pos: isize) -> String { - charslice(text, pos, pos+1) + charslice(text, pos, pos + 1) } #[cfg(test)]