Add identifier and keyword lexing #56

mangolang · May 22, 2018 · ea5a4cd · ea5a4cd
1 parent 3728f4e
commit ea5a4cd
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 18 deletions.
diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs
@@ -5,11 +5,12 @@ use mango::lexing::typ::MaybeToken;
 use mango::token::special::UnlexableToken;
 use mango::token::tokens::EndBlockToken;
 use mango::token::tokens::EndStatementToken;
+use mango::token::tokens::IdentifierToken;
+use mango::token::tokens::KeywordToken;
 use mango::token::tokens::ParenthesisCloseToken;
 use mango::token::tokens::ParenthesisOpenToken;
 use mango::token::tokens::StartBlockToken;
 use mango::token::Tokens;
-use mango::util::codeparts::Keyword;
 use mango::util::collection::Queue;
 
 pub struct CodeLexer<'r> {
@@ -35,6 +36,7 @@ impl<'r> CodeLexer<'r> {
         }
         for _ in line_indent..self.indent {
             // This line is dedented, make end tokens.
+            // TODO: turn this "new" into a constant
             if let Match(_) = self.reader.matches("end") {
                 // If this is followed by an 'end' keyword, then that 'end' is redundant.
                 self.buffer
@@ -55,54 +57,72 @@ impl<'r> CodeLexer<'r> {
 
 impl<'r> Lexer<'r> for CodeLexer<'r> {
     fn lex(&mut self) -> MaybeToken {
+        use self::MaybeToken::*;
+
         // If there is a buffer due to indentation or continuations, return from that.
         if let Some(token) = self.buffer.pop() {
-            return MaybeToken::Token(token);
+            return Token(token);
         }
         // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon.
-        if let Match(word) = self.reader.matches("\\.\\.\\.") {
+        if let Match(_) = self.reader.matches("\\.\\.\\.") {
             // Line continuation has no token, it just continues on the next line.
-            if let Match(word) = self.reader.matches("\\n\\r?") {
+            if let Match(_) = self.reader.matches("\\n\\r?") {
                 // There should always be a newline after continuations, so that they can be ignored together.
             } else if let Match(word) = self.reader.matches("[^\\n]*\\n\\r?") {
-                return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new(word)));
+                return Token(Tokens::Unlexable(UnlexableToken::new(word)));
             } else {
                 // TODO: I don't know yet how to deal with ... followed by end-of-file
                 panic!()
             }
             // This is a new line, so there may be indents.
             return self.lex_indents();
         }
-        if let Match(word) = self.reader.matches("\\n\\r?") {
+        if let Match(_) = self.reader.matches("\\n\\r?") {
             // Newline WITHOUT line continuation.
-            return MaybeToken::Token(Tokens::EndStatement(EndStatementToken::new_end_line()));
+            return Token(Tokens::EndStatement(EndStatementToken::new_end_line()));
         }
-        if let Match(word) = self.reader.matches(";") {
+        if let Match(_) = self.reader.matches(";") {
             // Semicolon, which ends a statement.
             // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
             self.buffer
                 .push(Tokens::EndStatement(EndStatementToken::new_semicolon()));
-            if let Match(word) = self.reader.matches("\\n\\r?") {
+            if let Match(_) = self.reader.matches("\\n\\r?") {
                 // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
                 // This will return the queue of tokens, including the semicolon.
                 return self.lex_indents();
             }
             // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
-            return MaybeToken::Token(self.buffer.pop().unwrap());
+            return Token(self.buffer.pop().unwrap());
         }
         //
         // Indentation done; do the rest of lexing.
         //
-        if let Match(word) = self.reader.matches("(") {
-            return MaybeToken::Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new()));
+        // Parse identifers and keywords. This assumes that keywords are a subset of identifiers.
+        if let Match(word) = self.reader.matches(IdentifierToken::subpattern()) {
+            // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
+            if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
+                return Token(Tokens::Keyword(keyword));
+            }
+            return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap()));
         }
-        if let Match(word) = self.reader.matches(")") {
-            return MaybeToken::Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new()));
+        // Literal
+        // todo
+        //        if let Match(word) = self.reader.matches(LiteralToken::subpattern()) {
+        //            return Token(LiteralToken::Literal(IdentifierToken::from_str(word).unwrap()));
+        //        }
+        // Operator
+        // todo
+        // Association
+        // todo
+        // Grouping symbols
+        if let Match(_) = self.reader.matches("(") {
+            return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new()));
+        }
+        if let Match(_) = self.reader.matches(")") {
+            return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new()));
         }
-
-        // TODO: a lot more
 
         // TODO: specify the unlexable word
-        return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned())));
+        return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned())));
     }
 }
diff --git a/src/mango/token/tokens/identifier.rs b/src/mango/token/tokens/identifier.rs
@@ -19,6 +19,10 @@ impl IdentifierToken {
     pub fn from_name(name: Name) -> Self {
         IdentifierToken { name }
     }
+
+    pub fn subpattern() -> &'static str {
+        Name::subpattern()
+    }
 }
 
 impl ToText for IdentifierToken {

diff --git a/src/mango/util/strtype/name.rs b/src/mango/util/strtype/name.rs
@@ -6,8 +6,10 @@ use std::fmt;
 use std::sync::Mutex;
 use string_interner::StringInterner;
 
+const VALID_IDENTIFIER_SUBPATTERN: &'static str = r"[a-zA-Z_][a-zA-Z0-9_]*";
 lazy_static! {
-    static ref VALID_IDENTIFIER: Regex = Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*$").unwrap();
+    static ref VALID_IDENTIFIER: Regex =
+        Regex::new(&format!("{}{}{}", r"^", VALID_IDENTIFIER_SUBPATTERN, r"$")).unwrap();
 }
 
 // TODO: this alias just for https://github.com/rust-lang-nursery/rustfmt/issues/2610
@@ -38,6 +40,11 @@ impl Name {
             .unwrap()
             .to_string()
     }
+
+    /// Generate an eager subpattern to match names, that can be composed in a regular expression.
+    pub fn subpattern() -> &'static str {
+        &VALID_IDENTIFIER_SUBPATTERN.clone()
+    }
 }
 
 impl fmt::Display for Name {