diff --git a/src/mango/lexing/code_lexer.rs b/src/mango/lexing/code_lexer.rs index 97a88b1c..00eda5b9 100644 --- a/src/mango/lexing/code_lexer.rs +++ b/src/mango/lexing/code_lexer.rs @@ -5,11 +5,12 @@ use mango::lexing::typ::MaybeToken; use mango::token::special::UnlexableToken; use mango::token::tokens::EndBlockToken; use mango::token::tokens::EndStatementToken; +use mango::token::tokens::IdentifierToken; +use mango::token::tokens::KeywordToken; use mango::token::tokens::ParenthesisCloseToken; use mango::token::tokens::ParenthesisOpenToken; use mango::token::tokens::StartBlockToken; use mango::token::Tokens; -use mango::util::codeparts::Keyword; use mango::util::collection::Queue; pub struct CodeLexer<'r> { @@ -35,6 +36,7 @@ impl<'r> CodeLexer<'r> { } for _ in line_indent..self.indent { // This line is dedented, make end tokens. + // TODO: turn this "new" into a constant if let Match(_) = self.reader.matches("end") { // If this is followed by an 'end' keyword, then that 'end' is redundant. self.buffer @@ -55,17 +57,19 @@ impl<'r> CodeLexer<'r> { impl<'r> Lexer<'r> for CodeLexer<'r> { fn lex(&mut self) -> MaybeToken { + use self::MaybeToken::*; + // If there is a buffer due to indentation or continuations, return from that. if let Some(token) = self.buffer.pop() { - return MaybeToken::Token(token); + return Token(token); } // Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon. - if let Match(word) = self.reader.matches("\\.\\.\\.") { + if let Match(_) = self.reader.matches("\\.\\.\\.") { // Line continuation has no token, it just continues on the next line. - if let Match(word) = self.reader.matches("\\n\\r?") { + if let Match(_) = self.reader.matches("\\n\\r?") { // There should always be a newline after continuations, so that they can be ignored together. } else if let Match(word) = self.reader.matches("[^\\n]*\\n\\r?") { - return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new(word))); + return Token(Tokens::Unlexable(UnlexableToken::new(word))); } else { // TODO: I don't know yet how to deal with ... followed by end-of-file panic!() @@ -73,36 +77,52 @@ impl<'r> Lexer<'r> for CodeLexer<'r> { // This is a new line, so there may be indents. return self.lex_indents(); } - if let Match(word) = self.reader.matches("\\n\\r?") { + if let Match(_) = self.reader.matches("\\n\\r?") { // Newline WITHOUT line continuation. - return MaybeToken::Token(Tokens::EndStatement(EndStatementToken::new_end_line())); + return Token(Tokens::EndStatement(EndStatementToken::new_end_line())); } - if let Match(word) = self.reader.matches(";") { + if let Match(_) = self.reader.matches(";") { // Semicolon, which ends a statement. // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede. self.buffer .push(Tokens::EndStatement(EndStatementToken::new_semicolon())); - if let Match(word) = self.reader.matches("\\n\\r?") { + if let Match(_) = self.reader.matches("\\n\\r?") { // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself). // This will return the queue of tokens, including the semicolon. return self.lex_indents(); } // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not). - return MaybeToken::Token(self.buffer.pop().unwrap()); + return Token(self.buffer.pop().unwrap()); } // // Indentation done; do the rest of lexing. // - if let Match(word) = self.reader.matches("(") { - return MaybeToken::Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + // Parse identifers and keywords. This assumes that keywords are a subset of identifiers. + if let Match(word) = self.reader.matches(IdentifierToken::subpattern()) { + // later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate... + if let Ok(keyword) = KeywordToken::from_str(word.clone()) { + return Token(Tokens::Keyword(keyword)); + } + return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap())); } - if let Match(word) = self.reader.matches(")") { - return MaybeToken::Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); + // Literal + // todo + // if let Match(word) = self.reader.matches(LiteralToken::subpattern()) { + // return Token(LiteralToken::Literal(IdentifierToken::from_str(word).unwrap())); + // } + // Operator + // todo + // Association + // todo + // Grouping symbols + if let Match(_) = self.reader.matches("(") { + return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new())); + } + if let Match(_) = self.reader.matches(")") { + return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new())); } - - // TODO: a lot more // TODO: specify the unlexable word - return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned()))); + return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned()))); } } diff --git a/src/mango/token/tokens/identifier.rs b/src/mango/token/tokens/identifier.rs index c83555d3..2485fe61 100644 --- a/src/mango/token/tokens/identifier.rs +++ b/src/mango/token/tokens/identifier.rs @@ -19,6 +19,10 @@ impl IdentifierToken { pub fn from_name(name: Name) -> Self { IdentifierToken { name } } + + pub fn subpattern() -> &'static str { + Name::subpattern() + } } impl ToText for IdentifierToken { diff --git a/src/mango/util/strtype/name.rs b/src/mango/util/strtype/name.rs index 0405d927..5940158a 100644 --- a/src/mango/util/strtype/name.rs +++ b/src/mango/util/strtype/name.rs @@ -6,8 +6,10 @@ use std::fmt; use std::sync::Mutex; use string_interner::StringInterner; +const VALID_IDENTIFIER_SUBPATTERN: &'static str = r"[a-zA-Z_][a-zA-Z0-9_]*"; lazy_static! { - static ref VALID_IDENTIFIER: Regex = Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*$").unwrap(); + static ref VALID_IDENTIFIER: Regex = + Regex::new(&format!("{}{}{}", r"^", VALID_IDENTIFIER_SUBPATTERN, r"$")).unwrap(); } // TODO: this alias just for https://github.com/rust-lang-nursery/rustfmt/issues/2610 @@ -38,6 +40,11 @@ impl Name { .unwrap() .to_string() } + + /// Generate an eager subpattern to match names, that can be composed in a regular expression. + pub fn subpattern() -> &'static str { + &VALID_IDENTIFIER_SUBPATTERN.clone() + } } impl fmt::Display for Name {