From f95c8927e8dd5536c86973a2cad85c45ba741908 Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 14 Jun 2018 15:16:24 +0200 Subject: [PATCH] Progress on rewriting lexer #52 --- src/mango/lexing/gen_code_lexer.rs | 230 +++++++++++++++-------------- 1 file changed, 116 insertions(+), 114 deletions(-) diff --git a/src/mango/lexing/gen_code_lexer.rs b/src/mango/lexing/gen_code_lexer.rs index a045b377..e8a86956 100644 --- a/src/mango/lexing/gen_code_lexer.rs +++ b/src/mango/lexing/gen_code_lexer.rs @@ -37,6 +37,7 @@ use mango::util::strslice::slice::glyphat; //} struct Container> { + indent: i32, delegate: Option>, reader: Rc>, generator: G, @@ -46,6 +47,7 @@ impl Container>> { fn lex_indents(&mut self) -> Vec { let mut line_indent = 0; + let mut res = Vec::with_capacity(12); while let Match(_) = self.reader.borrow_mut().matches("\\t") { line_indent += 1; } @@ -54,11 +56,9 @@ impl Container>> { // TODO: turn this "new" into a constant if let Match(_) = self.reader.borrow_mut().matches("end") { // If this is followed by an 'end' keyword, then that 'end' is redundant. - self.buffer - .push(Tokens::EndBlock(EndBlockToken::new(true, true))); + yield Tokens::EndBlock(EndBlockToken::new(true, true)); } else { - self.buffer - .push(Tokens::EndBlock(EndBlockToken::new(true, false))); + yield Tokens::EndBlock(EndBlockToken::new(true, false)); } } for _ in self.indent..line_indent { @@ -72,6 +72,7 @@ impl Container>> { pub fn new(&mut self, reader: Rc>) -> Box { let q = 42; Box::new(Container { + indent: 0, reader: reader, delegate: Option::None, generator: Box::new(move || { @@ -91,116 +92,117 @@ impl Container>> { } } - // TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes) - let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); - if let Match(_) = continue_match_res { - // Line continuation has no token, it just continues on the next line. - let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = newline_match_res { - // There should always be a newline after continuations, so that they can be ignored together. - } else { - // All the text between ... and the end of the line is unlexable. - let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); - if let Match(word) = newline_match_res { - yield Tokens::Unlexable(UnlexableToken::new(word)); - // This is a new line, so there may be indents. - // TODO: is there any yield-from like Python? - for res in self.lex_indents() { - yield res; - } - } else { - // TODO: I don't know yet how to deal with '...' followed by end-of-file - panic!() - } - } - // TODO: are continues necessary? it seems more state-independent to restart for each token - continue; - } - let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = newline_match_res { - // Newline WITHOUT line continuation. - // This is a new line, so there may be indents. - yield Tokens::EndStatement(EndStatementToken::new_end_line()); - for res in self.lex_indents() { - yield res; - } - continue; - } - let end_statement_match_res = self.reader.borrow_mut().matches(";"); - if let Match(_) = end_statement_match_res { - // Semicolon, which ends a statement. - // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. - yield Tokens::EndStatement(EndStatementToken::new_semicolon()); - let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); - if let Match(_) = end_line_match_res { - // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). - // This will return the queue of tokens, including the semicolon. - for res in self.lex_indents() { - yield res; - } - } - // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). - continue; - } - - // - // Indentation done; do the rest of lexing. - // - // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. - let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern()); - if let Match(word) = word_match_res { - // Check if it is a keyword. - // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... - if let Ok(keyword) = KeywordToken::from_str(word.clone()) { - yield Tokens::Keyword(keyword); - } - yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap()); - continue; - } - // String literal (delegated). - let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); - if let Match(_) = string_match_res { - let sublexer: Box = Box::new(StringLexer::new_double_quoted(self.reader.clone())); - self.delegate = Option::Some(sublexer); - continue; - } - // Association (before operator). - let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern()); - if let Match(token) = association_match_res { - if glyphat(token, -1) == "=" { - yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO - } else { - yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap()); - } - continue; - } - // Operator. - let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern()); - if let Match(token) = operator_match_res { - yield Tokens::Operator(OperatorToken::from_str(&token).unwrap()); - continue; - } - // Grouping symbols - if let Match(_) = self.reader.borrow_mut().matches(r"\(") { - yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new()); - continue; - } - if let Match(_) = self.reader.borrow_mut().matches(r"\)") { - yield Tokens::ParenthesisClose(ParenthesisCloseToken::new()); - continue; - } - - - let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); - match unknown_word { - Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)), - NoMatch() => panic!("Do not know how to proceed with parsing"), - EOF() => { - // TODO: also dedent and end statement here - return - } - } - continue; +// // TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes) +// let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\."); +// if let Match(_) = continue_match_res { +// // Line continuation has no token, it just continues on the next line. +// let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); +// if let Match(_) = newline_match_res { +// // There should always be a newline after continuations, so that they can be ignored together. +// } else { +// // All the text between ... and the end of the line is unlexable. +// let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?"); +// if let Match(word) = newline_match_res { +// yield Tokens::Unlexable(UnlexableToken::new(word)); +// // This is a new line, so there may be indents. +// // TODO: is there any yield-from like Python? +// for res in self.lex_indents() { +// yield res; +// } +// } else { +// // TODO: I don't know yet how to deal with '...' followed by end-of-file +// panic!() +// } +// } +// // TODO: are continues necessary? it seems more state-independent to restart for each token +// continue; +// } +// let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?"); +// if let Match(_) = newline_match_res { +// // Newline WITHOUT line continuation. +// // This is a new line, so there may be indents. +// yield Tokens::EndStatement(EndStatementToken::new_end_line()); +// for res in self.lex_indents() { +// yield res; +// } +// continue; +// } +// let end_statement_match_res = self.reader.borrow_mut().matches(";"); +// if let Match(_) = end_statement_match_res { +// // Semicolon, which ends a statement. +// // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. +// yield Tokens::EndStatement(EndStatementToken::new_semicolon()); +// let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?"); +// if let Match(_) = end_line_match_res { +// // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). +// // This will return the queue of tokens, including the semicolon. +// for res in self.lex_indents() { +// yield res; +// } +// } +// // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). +// continue; +// } +// +// // +// // Indentation done; do the rest of lexing. +// // +// // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers. +// let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern()); +// if let Match(word) = word_match_res { +// // Check if it is a keyword. +// // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... +// if word == "end" { +// yield Tokens::EndBlock(EndBlockToken::new(false, true)); +// } else if let Ok(keyword) = KeywordToken::from_str(word.clone()) { +// yield Tokens::Keyword(keyword); +// } +// yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap()); +// continue; +// } +// // String literal (delegated). +// let string_match_res = self.reader.borrow_mut().matches("[a-z]?\""); +// if let Match(_) = string_match_res { +// let sublexer: Box = Box::new(StringLexer::new_double_quoted(self.reader.clone())); +// self.delegate = Option::Some(sublexer); +// continue; +// } +// // Association (before operator). +// let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern()); +// if let Match(token) = association_match_res { +// if glyphat(token, -1) == "=" { +// yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO +// } else { +// yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap()); +// } +// continue; +// } +// // Operator. +// let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern()); +// if let Match(token) = operator_match_res { +// yield Tokens::Operator(OperatorToken::from_str(&token).unwrap()); +// continue; +// } +// // Grouping symbols +// if let Match(_) = self.reader.borrow_mut().matches(r"\(") { +// yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new()); +// continue; +// } +// if let Match(_) = self.reader.borrow_mut().matches(r"\)") { +// yield Tokens::ParenthesisClose(ParenthesisCloseToken::new()); +// continue; +// } +// +// +// let unknown_word = self.reader.borrow_mut().matches("[^\\s]+"); +// match unknown_word { +// Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)), +// NoMatch() => panic!("Do not know how to proceed with parsing"), +// EOF() => { +// // TODO: also dedent and end statement here +// return +// } +// } } }),