Skip to content

Commit

Permalink
Restructured lexing using generator, now just borrow/type problem #52
Browse files Browse the repository at this point in the history
  • Loading branch information
mverleg committed Jun 12, 2018
1 parent 3c85472 commit e5ce31c
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 103 deletions.
251 changes: 149 additions & 102 deletions src/mango/lexing/gen_code_lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ use mango::util::collection::Queue;
use std::cell::RefCell;
use std::ops::{Generator, GeneratorState};
use std::rc::Rc;
use std::borrow::BorrowMut;
use mango::util::strslice::charsliceto;
use mango::util::strslice::slice::glyphat;

/// This generator does the real lexing work, but is wrapped in a normal
/// class to satisfy an interface that doesn't expose nightly or unsafe features.
Expand All @@ -34,126 +37,170 @@ use std::rc::Rc;
//}

/// State shared between the public lexer wrapper and the generator that does
/// the actual lexing work (the generator yields `Tokens` and returns `()` at
/// end of input). NOTE(review): this type is mid-refactor per the commit
/// message ("now just borrow/type problem"); the fields are captured/read from
/// inside `generator`, which is the outstanding borrow issue.
struct Container<G: Generator<Yield = Tokens, Return = ()>> {
    // Sub-lexer currently delegated to (e.g. a StringLexer for string
    // literals), if any; `None` means this lexer handles input itself.
    delegate: Option<Box<Lexer>>,
    // Shared input reader; `Rc<RefCell<..>>` because the sub-lexer holds a
    // clone of the same reader and both need mutable access.
    reader: Rc<RefCell<Reader>>,
    // The generator that produces the token stream.
    generator: G,
}

impl Container<Box<Generator<Yield = Tokens, Return = ()>>> {
pub fn new(reader: Box<Reader>) -> Box<Self> {

fn lex_indents(&mut self) -> Vec<Tokens> {
let mut line_indent = 0;
while let Match(_) = self.reader.borrow_mut().matches("\\t") {
line_indent += 1;
}
for _ in line_indent..self.indent {
// This line is dedented, make end tokens.
// TODO: turn this "new" into a constant
if let Match(_) = self.reader.borrow_mut().matches("end") {
// If this is followed by an 'end' keyword, then that 'end' is redundant.
self.buffer
.push(Tokens::EndBlock(EndBlockToken::new(true, true)));
} else {
self.buffer
.push(Tokens::EndBlock(EndBlockToken::new(true, false)));
}
}
for _ in self.indent..line_indent {
// This line is indented, make start tokens.
self.buffer.push(Tokens::StartBlock(StartBlockToken::new()));
}
self.indent = line_indent;
self.lex()
}

pub fn new(&mut self, reader: Rc<RefCell<Reader>>) -> Box<Self> {
let q = 42;
Box::new(Container {
reader: reader,
delegate: Option::None,
generator: Box::new(move || {

// If there is a buffer due to indentation or continuations, return from that.
if let Some(token) = self.buffer.pop() {
yield token;
}
// Past this point, we assume that the buffer is empty. When adding stuff, pop it or re-enter lex() soon.
let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\.");
if let Match(_) = continue_match_res {
// Line continuation has no token, it just continues on the next line.
loop {

// Delegate to another lexer if one is set.
if let Option::Some(delegate) = self.delegate {
match delegate.lex() {
MaybeToken::Token(token) => {
yield token;
continue;
}
MaybeToken::End => {
self.delegate = Option::None;
}
}
}

// TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes)
let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\.");
if let Match(_) = continue_match_res {
// Line continuation has no token, it just continues on the next line.
let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = newline_match_res {
// There should always be a newline after continuations, so that they can be ignored together.
} else {
// All the text between ... and the end of the line is unlexable.
let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?");
if let Match(word) = newline_match_res {
yield Tokens::Unlexable(UnlexableToken::new(word));
// This is a new line, so there may be indents.
// TODO: is there any yield-from like Python?
for res in self.lex_indents() {
yield res;
}
} else {
// TODO: I don't know yet how to deal with '...' followed by end-of-file
panic!()
}
}
// TODO: are continues necessary? it seems more state-independent to restart for each token
continue;
}
let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = newline_match_res {
// There should always be a newline after continuations, so that they can be ignored together.
} else {
let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?");
if let Match(word) = newline_match_res {
self.buffer
.push(Tokens::Unlexable(UnlexableToken::new(word)));
// This is a new line, so there may be indents.
self.lex_indents();
yield self.lex();
// Newline WITHOUT line continuation.
// This is a new line, so there may be indents.
yield Tokens::EndStatement(EndStatementToken::new_end_line());
for res in self.lex_indents() {
yield res;
}
continue;
}
let end_statement_match_res = self.reader.borrow_mut().matches(";");
if let Match(_) = end_statement_match_res {
// Semicolon, which ends a statement.
// Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
yield Tokens::EndStatement(EndStatementToken::new_semicolon());
let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = end_line_match_res {
// If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
// This will return the queue of tokens, including the semicolon.
for res in self.lex_indents() {
yield res;
}
}
// No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
continue;
}

//
// Indentation done; do the rest of lexing.
//
// Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern());
if let Match(word) = word_match_res {
// Check if it is a keyword.
// TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
yield Tokens::Keyword(keyword);
}
yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap());
continue;
}
// String literal (delegated).
let string_match_res = self.reader.borrow_mut().matches("[a-z]?\"");
if let Match(_) = string_match_res {
let sublexer: Box<Lexer> = Box::new(StringLexer::new_double_quoted(self.reader.clone()));
self.delegate = Option::Some(sublexer);
continue;
}
// Association (before operator).
let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern());
if let Match(token) = association_match_res {
if glyphat(token, -1) == "=" {
yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO
} else {
// TODO: I don't know yet how to deal with '...' followed by end-of-file
panic!()
yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap());
}
continue;
}
}
let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = newline_match_res {
// Newline WITHOUT line continuation.
// This is a new line, so there may be indents.
self.buffer
.push(Tokens::EndStatement(EndStatementToken::new_end_line()));
self.lex_indents();
yield self.lex();
}
let end_statement_match_res = self.reader.borrow_mut().matches(";");
if let Match(_) = end_statement_match_res {
// Semicolon, which ends a statement.
// Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
self.buffer
.push(Tokens::EndStatement(EndStatementToken::new_semicolon()));
let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = end_line_match_res {
// If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
// This will return the queue of tokens, including the semicolon.
yield self.lex_indents();
// Operator.
let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern());
if let Match(token) = operator_match_res {
yield Tokens::Operator(OperatorToken::from_str(&token).unwrap());
continue;
}
// No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
yield self.buffer.pop().unwrap();
}
//
// Indentation done; do the rest of lexing.
//
// Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
if let Match(word) = self
.reader
.borrow_mut()
.matches(IdentifierToken::subpattern())
{
// later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
yield Tokens::Keyword(keyword);
// Grouping symbols
if let Match(_) = self.reader.borrow_mut().matches(r"\(") {
yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new());
continue;
}
yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap());
}
// Literal
let string_match_res = self.reader.borrow_mut().matches("[a-z]?\"");
if let Match(_) = string_match_res {
let sublexer: Box<Lexer> =
Box::new(StringLexer::new_double_quoted(self.reader.clone()));
self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer);
yield self.lex();
}
// Association (before operator)
let association_match_res = self
.reader
.borrow_mut()
.matches(&AssociationToken::subpattern());
if let Match(token) = association_match_res {
if token.chars().last().unwrap() == '=' {
// return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap()));
yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO
} else {
yield Tokens::Association(AssociationToken::from_unprefixed());
if let Match(_) = self.reader.borrow_mut().matches(r"\)") {
yield Tokens::ParenthesisClose(ParenthesisCloseToken::new());
continue;
}
}
// Operator
let operator_match_res = self
.reader
.borrow_mut()
.matches(OperatorToken::subpattern());
if let Match(token) = operator_match_res {
yield Tokens::Operator(OperatorToken::from_str(&token).unwrap());
}
// Grouping symbols
if let Match(_) = self.reader.borrow_mut().matches(r"\(") {
yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new());
}
if let Match(_) = self.reader.borrow_mut().matches(r"\)") {
yield Tokens::ParenthesisClose(ParenthesisCloseToken::new());
}

let unknown_word = self.reader.borrow_mut().matches("[^\\s]+");
match unknown_word {
Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)),
NoMatch() => {
panic!("Do not know how to proceed with parsing")
}
EOF() => {
// TODO: also dedent and end statement here
return

let unknown_word = self.reader.borrow_mut().matches("[^\\s]+");
match unknown_word {
Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)),
NoMatch() => panic!("Do not know how to proceed with parsing"),
EOF() => {
// TODO: also dedent and end statement here
return
}
}
continue;
}

}),
Expand Down
7 changes: 6 additions & 1 deletion src/mango/util/strslice/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ pub fn charsliceto<S: Into<String>>(text: S, end: isize) -> String {
charslice(text, 0, end)
}

/// Return the single character of `text` at position `pos` as an owned
/// `String`, implemented as the one-character slice `[pos, pos + 1)`.
/// Negative `pos` is intended to index from the end, mirroring the Python-3
/// slicing behaviour the surrounding tests describe.
/// NOTE(review): for `pos == -1` the end bound `pos + 1` becomes `0`, which
/// under Python-style indexing denotes the string start and would make the
/// slice empty instead of returning the last glyph — confirm how `charslice`
/// treats a zero end paired with a negative start (the `glyphat("你好!", -2)`
/// assertion in the tests also looks inconsistent with plain Python slicing).
pub fn glyphat<S: Into<String>>(text: S, pos: isize) -> String {
    charslice(text, pos, pos+1)
}

#[cfg(test)]
mod tests {
use super::*;
Expand All @@ -58,9 +62,10 @@ mod tests {
assert_eq!("你好", charslice("你好!", 0, 2));
assert_eq!("!", charslicefrom("你好!", 2));
assert_eq!("你好", charsliceto("你好!", 2));
assert_eq!("好", glyphat("你好!", 1));
// Negative indices should match Python 3 behaviour:
assert_eq!("你好", charslice("你好!", -3, -1));
assert_eq!("!", charslicefrom("你好!", -1));
assert_eq!("你好", charsliceto("你好!", -1));
assert_eq!("", glyphat("你好!", -2));
}
}

0 comments on commit e5ce31c

Please sign in to comment.