More code lexing functionality reactivated #52
mverleg committed Jun 17, 2018
1 parent 491e214 commit 10650d5
Showing 6 changed files with 121 additions and 118 deletions.
199 changes: 100 additions & 99 deletions src/mango/lexing/code_lexer.rs
@@ -6,7 +6,6 @@ use mango::lexing::typ::MaybeToken;
use mango::lexing::typ::SubLexer;
use mango::lexing::typ::SubLexerResult;
use mango::token::special::UnlexableToken;
use mango::token::Tokens;
use mango::token::tokens::AssociationToken;
use mango::token::tokens::EndBlockToken;
use mango::token::tokens::EndStatementToken;
@@ -16,6 +15,7 @@ use mango::token::tokens::OperatorToken;
use mango::token::tokens::ParenthesisCloseToken;
use mango::token::tokens::ParenthesisOpenToken;
use mango::token::tokens::StartBlockToken;
use mango::token::Tokens;
use mango::util::collection::Queue;
use std::cell::RefCell;
use std::rc::Rc;
@@ -61,6 +61,13 @@ impl CodeLexer {
self.indent = line_indent;
tokens
}

fn token_and_indents(&mut self, reader: &mut Box<Reader>, token: Tokens) -> SubLexerResult {
let mut tokens: Vec<Tokens> = vec![token];
// This is a new line, so there may be indents.
tokens.append(&mut self.lex_indents(reader));
return SubLexerResult::Result(tokens);
}
}

impl SubLexer for CodeLexer {
@@ -70,115 +77,109 @@ impl SubLexer for CodeLexer {
// TODO: put all these match results inline

// End of line continuation
let continue_match_res = reader.matches(r"\.\.\.");
if let Match(_) = continue_match_res {
if let Match(_) = reader.matches(r"\.\.\.") {
// Line continuation has no token, it just continues on the next line, ignoring indents (for now).
let newline_match_res = reader.matches(r"\n\r?\t*");
if let Match(_) = newline_match_res {
if let Match(_) = reader.matches(r"\n\r?\t*") {
// There should always be a newline after continuations, so that they can be ignored together.
} else {
// The rest of this line is unparsable.
let newline_match_res = reader.matches("[^\\n]*\\n\\r?");
if let Match(word) = newline_match_res {
let mut res: Vec<Tokens> = vec![Tokens::Unlexable(UnlexableToken::new(word))];
if let Match(word) = reader.matches("[^\\n]*\\n\\r?") {
// This is a new line, so there may be indents.
res.append(&mut self.lex_indents(reader));
return Result(res);
return self
.token_and_indents(reader, Tokens::Unlexable(UnlexableToken::new(word)));
} else {
// TODO: I don't know yet how to deal with '...' followed by end-of-file
panic!()
}
}
}
// Newlines
if let Match(_) = reader.matches("\\n\\r?") {
// Newline WITHOUT line continuation.
// This is a new line, so there may be indents.
return self.token_and_indents(
reader,
Tokens::EndStatement(EndStatementToken::new_end_line()),
);
}
// End of statement
if let Match(_) = reader.matches(";") {
// Semicolon, which ends a statement.
if let Match(_) = reader.matches("\\n\\r?") {
// If semicolon is followed by a newline, it is redundant. Deal with indents (but ignore the newline itself).
return self.token_and_indents(
reader,
Tokens::EndStatement(EndStatementToken::new_semicolon()),
);
} else {
return SubLexerResult::single(Tokens::EndStatement(
EndStatementToken::new_semicolon(),
));
}
}
//
// Indentation done; do the rest of lexing.
//
// Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
if let Match(word) = reader.matches(IdentifierToken::subpattern()) {
// TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
return SubLexerResult::single(Tokens::Keyword(keyword));
}
return SubLexerResult::single(Tokens::Identifier(
IdentifierToken::from_str(word).unwrap(),
));
}
// // Literal
// let string_match_res = reader.matches("[a-z]?\"");
// if let Match(_) = string_match_res {
// let sublexer: Box<Lexer> =
// Box::new(StringLexer::new_double_quoted(self.reader.clone()));
// self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer);
// return self.lex();
// }
// // Association (before operator)
// let association_match_res = self
// .reader
// .borrow_mut()
// .matches(&AssociationToken::subpattern());
// if let Match(token) = association_match_res {
// if token.chars().last().unwrap() == '=' {
// // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap()));
// return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO
// } else {
// return Token(Tokens::Association(AssociationToken::from_unprefixed()));
// }
// }
// // Operator
// let operator_match_res = self
// .reader
// .borrow_mut()
// .matches(OperatorToken::subpattern());
// if let Match(token) = operator_match_res {
// return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap()));
// }
// // Grouping symbols
// if let Match(_) = reader.matches(r"\(") {
// return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new()));
// }
// if let Match(_) = reader.matches(r"\)") {
// return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new()));
// }
//
// let unknown_word = reader.matches("[^\\s]+");
// match unknown_word {
// Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))),
// NoMatch() => {
// println!("END {:?}", self.reader.borrow()); // TODO
// panic!("Do not know how to proceed with parsing")
// }
// EOF() => {
// // TODO: also dedent and end statement here
// End
// }
// }

panic!();
// let newline_match_res = reader.matches("\\n\\r?");
// if let Match(_) = newline_match_res {
// // Newline WITHOUT line continuation.
// // This is a new line, so there may be indents.
// self.buffer
// .push(Tokens::EndStatement(EndStatementToken::new_end_line()));
// self.lex_indents();
// return self.lex();
// }
// let end_statement_match_res = reader.matches(";");
// if let Match(_) = end_statement_match_res {
// // Semicolon, which ends a statement.
// // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
// self.buffer
// .push(Tokens::EndStatement(EndStatementToken::new_semicolon()));
// let end_line_match_res = reader.matches("\\n\\r?");
// if let Match(_) = end_line_match_res {
// // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
// // This will return the queue of tokens, including the semicolon.
// return self.lex_indents();
// }
// // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
// return Token(self.buffer.pop().unwrap());
// }
// //
// // Indentation done; do the rest of lexing.
// //
// // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
// if let Match(word) = self
// .reader
// .borrow_mut()
// .matches(IdentifierToken::subpattern())
// {
// // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
// if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
// return Token(Tokens::Keyword(keyword));
// }
// return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap()));
// }
// // Literal
// let string_match_res = reader.matches("[a-z]?\"");
// if let Match(_) = string_match_res {
// let sublexer: Box<Lexer> =
// Box::new(StringLexer::new_double_quoted(self.reader.clone()));
// self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer);
// return self.lex();
// }
// // Association (before operator)
// let association_match_res = self
// .reader
// .borrow_mut()
// .matches(&AssociationToken::subpattern());
// if let Match(token) = association_match_res {
// if token.chars().last().unwrap() == '=' {
// // return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap()));
// return Token(Tokens::Association(AssociationToken::from_unprefixed())); // TODO
// } else {
// return Token(Tokens::Association(AssociationToken::from_unprefixed()));
// }
// }
// // Operator
// let operator_match_res = self
// .reader
// .borrow_mut()
// .matches(OperatorToken::subpattern());
// if let Match(token) = operator_match_res {
// return Token(Tokens::Operator(OperatorToken::from_str(&token).unwrap()));
// }
// // Grouping symbols
// if let Match(_) = reader.matches(r"\(") {
// return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new()));
// }
// if let Match(_) = reader.matches(r"\)") {
// return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new()));
// }
//
// let unknown_word = reader.matches("[^\\s]+");
// match unknown_word {
// Match(word) => return Token(Tokens::Unlexable(UnlexableToken::new(word))),
// NoMatch() => {
// println!("END {:?}", self.reader.borrow()); // TODO
// panic!("Do not know how to proceed with parsing")
// }
// EOF() => {
// // TODO: also dedent and end statement here
// End
// }
// }
panic!() // TODO TMP
}
}
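Note (not part of the commit): a minimal sketch of the SubLexerResult values the reactivated end-of-statement branches return, using only constructors and module paths that appear in the imports above. The full token stream for real input also depends on lex_indents, which lies outside this hunk, and "x" is assumed to be accepted by IdentifierToken::from_str.

use mango::lexing::typ::SubLexerResult;
use mango::token::tokens::EndStatementToken;
use mango::token::tokens::IdentifierToken;
use mango::token::Tokens;

fn sketch_results() -> Vec<SubLexerResult> {
    vec![
        // A bare newline ends the statement; in the real code, token_and_indents
        // then appends whatever indent tokens lex_indents produces.
        SubLexerResult::Result(vec![Tokens::EndStatement(EndStatementToken::new_end_line())]),
        // A semicolon not followed by a newline needs no indent handling,
        // so the single-token helper added in typ.rs suffices.
        SubLexerResult::single(Tokens::EndStatement(EndStatementToken::new_semicolon())),
        // A word that is not a keyword becomes an identifier token.
        SubLexerResult::single(Tokens::Identifier(
            IdentifierToken::from_str("x".to_owned()).unwrap(),
        )),
    ]
}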
25 changes: 11 additions & 14 deletions src/mango/lexing/combi_lexer.rs
@@ -3,11 +3,10 @@ use mango::io::typ::ReaderResult::*;
use mango::lexing::code_lexer::CodeLexer;
use mango::lexing::string_lexer::StringLexer;
use mango::lexing::typ::Lexer;
use mango::lexing::typ::SubLexer;
use mango::lexing::typ::MaybeToken;
use mango::lexing::typ::SubLexer;
use mango::lexing::typ::SubLexerResult;
use mango::token::special::UnlexableToken;
use mango::token::Tokens;
use mango::token::tokens::AssociationToken;
use mango::token::tokens::EndBlockToken;
use mango::token::tokens::EndStatementToken;
@@ -17,12 +16,12 @@ use mango::token::tokens::OperatorToken;
use mango::token::tokens::ParenthesisCloseToken;
use mango::token::tokens::ParenthesisOpenToken;
use mango::token::tokens::StartBlockToken;
use mango::token::Tokens;
use mango::util::collection::Queue;
use mango::util::collection::Stack;
use std::cell::RefCell;
use std::rc::Rc;


pub struct CombiLexer {
reader: Box<Reader>,
lexers: Stack<Box<SubLexer>>,
Expand All @@ -43,7 +42,6 @@ impl CombiLexer {

impl Lexer for CombiLexer {
fn lex(&mut self) -> MaybeToken {

// If there are tokens in the buffer, return from there;
if let Option::Some(token) = self.buffer.pop() {
return MaybeToken::Token(token);
@@ -64,29 +62,28 @@ impl Lexer for CombiLexer {
// TODO: check reader state
self.lex()
}
},
}
SubLexerResult::Delegate(lexer) => {
// Switch to a different delegate lexer.
self.lexers.push(lexer);
self.lex()
},
}
SubLexerResult::End => {
// The sublexer is done, remove it from the stack and continue with the next.
self.lexers.pop(); // This needs non-lexical lifetimes
self.lexers.pop(); // This needs non-lexical lifetimes
self.lex()
},
}
}
}
}
}

}

#[cfg(test)]
mod tests {
use super::CombiLexer;
use mango::io::fortest::StringReader;
use mango::lexing::util::lex_all::{lex_all, LexList};
use mango::token::Tokens;
use mango::token::tokens::AssociationToken;
use mango::token::tokens::EndBlockToken;
use mango::token::tokens::EndStatementToken;
@@ -97,16 +94,16 @@ mod tests {
use mango::token::tokens::ParenthesisCloseToken;
use mango::token::tokens::ParenthesisOpenToken;
use mango::token::tokens::StartBlockToken;
use mango::token::Tokens;
use std::cell::RefCell;
use std::rc::Rc;
use super::CombiLexer;

fn assert_text_to_tokens(text: &str, tokens: Vec<Tokens>) {
assert_eq!(
LexList::from_tokens(tokens),
lex_all(&mut CombiLexer::new(Box::new(
StringReader::new(text.to_owned())
)))
lex_all(&mut CombiLexer::new(Box::new(StringReader::new(
text.to_owned()
))))
)
}

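For orientation (not part of the commit): the test helper above suggests this way of driving the lexer end to end. The module path for CombiLexer is assumed from the file location, and, as the panic!() placeholders in code_lexer.rs show, end-of-input handling is still a TODO in this commit, so running this to completion may currently panic.

use mango::io::fortest::StringReader;
use mango::lexing::combi_lexer::CombiLexer;
use mango::lexing::util::lex_all::lex_all;

fn main() {
    // Wrap the source text in a StringReader, feed it to a CombiLexer, and
    // collect everything it produces into a LexList for inspection.
    let mut lexer = CombiLexer::new(Box::new(StringReader::new("x;".to_owned())));
    let tokens = lex_all(&mut lexer);
    println!("{:?}", tokens);
}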
6 changes: 3 additions & 3 deletions src/mango/lexing/string_lexer.rs
@@ -42,7 +42,7 @@ impl Lexer for StringLexer {
}
}

// fn get_reader(&self) -> Rc<RefCell<Reader>> {
// self.reader.clone()
// }
// fn get_reader(&self) -> Rc<RefCell<Reader>> {
// self.reader.clone()
// }
}
6 changes: 6 additions & 0 deletions src/mango/lexing/typ.rs
@@ -8,6 +8,12 @@ pub enum SubLexerResult {
End,
}

impl SubLexerResult {
pub fn single(token: Tokens) -> Self {
SubLexerResult::Result(vec![token])
}
}

// TODO: I don't want this to be public outside the crate
pub trait SubLexer {
/// Does one iteration of a sublexer, which should either delegate or return tokens.
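For context (the full definition is collapsed in this hunk): judging from how SubLexerResult is constructed in code_lexer.rs and matched in combi_lexer.rs, the enum this impl extends presumably has roughly the shape below; treat it as a reconstruction under the surrounding typ.rs imports, not the actual definition.

pub enum SubLexerResult {
    // One or more tokens were produced in this iteration.
    Result(Vec<Tokens>),
    // Hand control to another sublexer (e.g. a string lexer) until it signals End.
    Delegate(Box<SubLexer>),
    // This sublexer is finished and should be popped off the stack.
    End,
}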
1 change: 0 additions & 1 deletion src/mango/util/collection/queue.rs
@@ -1,4 +1,3 @@

/// A one-ended queue. See also [Stack].
/// This is just a wrapper around vec so nobody pushes or pops the wrong end.
pub struct Queue<T> {
2 changes: 1 addition & 1 deletion src/mango/util/strslice/slice.rs
@@ -49,7 +49,7 @@ pub fn charsliceto<S: Into<String>>(text: S, end: isize) -> String {
}

pub fn glyphat<S: Into<String>>(text: S, pos: isize) -> String {
charslice(text, pos, pos+1)
charslice(text, pos, pos + 1)
}

#[cfg(test)]
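A small usage note (not part of the commit): glyphat slices a single character out of a string via charslice. Assuming the module is exported along its file path and character positions are zero-based, it can be used like this.

use mango::util::strslice::slice::glyphat;

fn main() {
    // Take the glyph at character position 1 of "mango" (expected: "a").
    println!("{}", glyphat("mango", 1));
}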
