From e5ce31cfb99b6aa78f0e160563edd02a396d998b Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 12 Jun 2018 22:00:10 +0200 Subject: [PATCH] Restructured lexing using generator, now just borrow/type problem #52 --- src/mango/lexing/gen_code_lexer.rs | 251 +++++++++++++++++------------ src/mango/util/strslice/slice.rs | 7 +- 2 files changed, 155 insertions(+), 103 deletions(-) diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs index ebf4b8e5..a045b377 100644 --- a/src/mango/lexing/gen_code_lexer.rs +++ b/src/mango/lexing/gen_code_lexer.rs @@ -19,6 +19,9 @@ use mango::util::collection::Queue; use std::cell::RefCell; use std::ops::{Generator, GeneratorState}; use std::rc::Rc; +use std::borrow::BorrowMut; +use mango::util::strslice::charsliceto; +use mango::util::strslice::slice::glyphat; /// This generator does the real lexing work, but is wrapped in a normal /// class to satisfy an interface that doesn't expose nightly or unsafe features. @@ -34,126 +37,170 @@ use std::rc::Rc; //} struct Container> { + delegate: Option>, + reader: Rc>, generator: G, } impl Container>> { - pub fn new(reader: Box) -> Box { + + fn lex_indents(&mut self) -> Vec { + let mut line_indent = 0; + while let Match(_) = self.reader.borrow_mut().matches("\\t") { + line_indent += 1; + } + for _ in line_indent..self.indent { + // This line is dedented, make end tokens. + // TODO: turn this "new" into a constant + if let Match(_) = self.reader.borrow_mut().matches("end") { + // If this is followed by an 'end' keyword, then that 'end' is redundant. + self.buffer + .push(Tokens::EndBlock(EndBlockToken::new(true, true))); + } else { + self.buffer + .push(Tokens::EndBlock(EndBlockToken::new(true, false))); + } + } + for _ in self.indent..line_indent { + // This line is indented, make start tokens. + self.buffer.push(Tokens::StartBlock(StartBlockToken::new())); + } + self.indent = line_indent; + self.lex() + } + + pub fn new(&mut self, reader: Rc>) -> Box { let q = 42; Box::new(Container { + reader: reader, + delegate: Option::None, generator: Box::new(move || { - // If there is a buffer due to indentation or continuations, return from that. - if let Some(token) = self.buffer.pop() { - yield token; - } - // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. - let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); - if let Match(_) = continue_match_res { - // Line continuation has no token, it just continues on the next line. + loop { + + // Delegate to another lexer if one is set. + if let Option::Some(delegate) = self.delegate { + match delegate.lex() { + MaybeToken::Token(token) => { + yield token; + continue; + } + MaybeToken::End => { + self.delegate = Option::None; + } + } + } + + // TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes) + let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); + if let Match(_) = continue_match_res { + // Line continuation has no token, it just continues on the next line. + let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = newline_match_res { + // There should always be a newline after continuations, so that they can be ignored together. + } else { + // All the text between ... and the end of the line is unlexable. + let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); + if let Match(word) = newline_match_res { + yield Tokens::Unlexable(UnlexableToken::new(word)); + // This is a new line, so there may be indents. + // TODO: is there any yield-from like Python? + for res in self.lex_indents() { + yield res; + } + } else { + // TODO: I don't know yet how to deal with '...' followed by end-of-file + panic!() + } + } + // TODO: are continues necessary? it seems more state-independent to restart for each token + continue; + } let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); if let Match(_) = newline_match_res { - // There should always be a newline after continuations, so that they can be ignored together. - } else { - let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); - if let Match(word) = newline_match_res { - self.buffer - .push(Tokens::Unlexable(UnlexableToken::new(word))); - // This is a new line, so there may be indents. - self.lex_indents(); - yield self.lex(); + // Newline WITHOUT line continuation. + // This is a new line, so there may be indents. + yield Tokens::EndStatement(EndStatementToken::new_end_line()); + for res in self.lex_indents() { + yield res; + } + continue; + } + let end_statement_match_res = self.reader.borrow_mut().matches(";"); + if let Match(_) = end_statement_match_res { + // Semicolon, which ends a statement. + // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. + yield Tokens::EndStatement(EndStatementToken::new_semicolon()); + let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); + if let Match(_) = end_line_match_res { + // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). + // This will return the queue of tokens, including the semicolon. + for res in self.lex_indents() { + yield res; + } + } + // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). + continue; + } + + // + // Indentation done; do the rest of lexing. + // + // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. + let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern()); + if let Match(word) = word_match_res { + // Check if it is a keyword. + // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + yield Tokens::Keyword(keyword); + } + yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap()); + continue; + } + // String literal (delegated). + let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); + if let Match(_) = string_match_res { + let sublexer: Box = Box::new(StringLexer::new_double_quoted(self.reader.clone())); + self.delegate = Option::Some(sublexer); + continue; + } + // Association (before operator). + let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern()); + if let Match(token) = association_match_res { + if glyphat(token, -1) == "=" { + yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO } else { - // TODO: I don't know yet how to deal with '...' followed by end-of-file - panic!() + yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap()); } + continue; } - } - let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = newline_match_res { - // Newline WITHOUT line continuation. - // This is a new line, so there may be indents. - self.buffer - .push(Tokens::EndStatement(EndStatementToken::new_end_line())); - self.lex_indents(); - yield self.lex(); - } - let end_statement_match_res = self.reader.borrow_mut().matches(";"); - if let Match(_) = end_statement_match_res { - // Semicolon, which ends a statement. - // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. - self.buffer - .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); - let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = end_line_match_res { - // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). - // This will return the queue of tokens, including the semicolon. - yield self.lex_indents(); + // Operator. + let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern()); + if let Match(token) = operator_match_res { + yield Tokens::Operator(OperatorToken::from_str(&token).unwrap()); + continue; } - // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). - yield self.buffer.pop().unwrap(); - } - // - // Indentation done; do the rest of lexing. - // - // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. - if let Match(word) = self - .reader - .borrow_mut() - .matches(IdentifierToken::subpattern()) - { - // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... - if let Ok(keyword) = KeywordToken::from_str(word.clone()) { - yield Tokens::Keyword(keyword); + // Grouping symbols + if let Match(_) = self.reader.borrow_mut().matches(r"\(") { + yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new()); + continue; } - yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap()); - } - // Literal - let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); - if let Match(_) = string_match_res { - let sublexer: Box = - Box::new(StringLexer::new_double_quoted(self.reader.clone())); - self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer); - yield self.lex(); - } - // Association (before operator) - let association_match_res = self - .reader - .borrow_mut() - .matches(&AssociationToken::subpattern()); - if let Match(token) = association_match_res { - if token.chars().last().unwrap() == '=' { - // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap())); - yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO - } else { - yield Tokens::Association(AssociationToken::from_unprefixed()); + if let Match(_) = self.reader.borrow_mut().matches(r"\)") { + yield Tokens::ParenthesisClose(ParenthesisCloseToken::new()); + continue; } - } - // Operator - let operator_match_res = self - .reader - .borrow_mut() - .matches(OperatorToken::subpattern()); - if let Match(token) = operator_match_res { - yield Tokens::Operator(OperatorToken::from_str(&token).unwrap()); - } - // Grouping symbols - if let Match(_) = self.reader.borrow_mut().matches(r"\(") { - yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new()); - } - if let Match(_) = self.reader.borrow_mut().matches(r"\)") { - yield Tokens::ParenthesisClose(ParenthesisCloseToken::new()); - } - let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); - match unknown_word { - Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)), - NoMatch() => { - panic!("Do not know how to proceed with parsing") - } - EOF() => { - // TODO: also dedent and end statement here - return + + let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); + match unknown_word { + Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)), + NoMatch() => panic!("Do not know how to proceed with parsing"), + EOF() => { + // TODO: also dedent and end statement here + return + } } + continue; } }), diff --git a/src/mango/util/strslice/slice.rs b/src/mango/util/strslice/slice.rs index acf32ef1..95055439 100644 --- a/src/mango/util/strslice/slice.rs +++ b/src/mango/util/strslice/slice.rs @@ -48,6 +48,10 @@ pub fn charsliceto>(text: S, end: isize) -> String { charslice(text, 0, end) } +pub fn glyphat>(text: S, pos: isize) -> String { + charslice(text, pos, pos+1) +} + #[cfg(test)] mod tests { use super::*; @@ -58,9 +62,10 @@ mod tests { assert_eq!("你好", charslice("你好!", 0, 2)); assert_eq!("!", charslicefrom("你好!", 2)); assert_eq!("你好", charsliceto("你好!", 2)); + assert_eq!("好", glyphat("你好!", 1)); // Negative indices should match Python 3 behaviour: assert_eq!("你好", charslice("你好!", -3, -1)); assert_eq!("!", charslicefrom("你好!", -1)); - assert_eq!("你好", charsliceto("你好!", -1)); + assert_eq!("好", glyphat("你好!", -2)); } }