Skip to content

Commit

Permalink
Add identifier and keyword lexing #56
Browse files Browse the repository at this point in the history
  • Loading branch information
mverleg committed May 22, 2018
1 parent 3728f4e commit ea5a4cd
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 18 deletions.
54 changes: 37 additions & 17 deletions src/mango/lexing/code_lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@ use mango::lexing::typ::MaybeToken;
use mango::token::special::UnlexableToken;
use mango::token::tokens::EndBlockToken;
use mango::token::tokens::EndStatementToken;
use mango::token::tokens::IdentifierToken;
use mango::token::tokens::KeywordToken;
use mango::token::tokens::ParenthesisCloseToken;
use mango::token::tokens::ParenthesisOpenToken;
use mango::token::tokens::StartBlockToken;
use mango::token::Tokens;
use mango::util::codeparts::Keyword;
use mango::util::collection::Queue;

pub struct CodeLexer<'r> {
Expand All @@ -35,6 +36,7 @@ impl<'r> CodeLexer<'r> {
}
for _ in line_indent..self.indent {
// This line is dedented, make end tokens.
// TODO: turn this "new" into a constant
if let Match(_) = self.reader.matches("end") {
// If this is followed by an 'end' keyword, then that 'end' is redundant.
self.buffer
Expand All @@ -55,54 +57,72 @@ impl<'r> CodeLexer<'r> {

impl<'r> Lexer<'r> for CodeLexer<'r> {
fn lex(&mut self) -> MaybeToken {
use self::MaybeToken::*;

// If there is a buffer due to indentation or continuations, return from that.
if let Some(token) = self.buffer.pop() {
return MaybeToken::Token(token);
return Token(token);
}
// Past this point, we assume that hte buffer is empty. When adding stuff, pop it or re-enter lex() soon.
if let Match(word) = self.reader.matches("\\.\\.\\.") {
if let Match(_) = self.reader.matches("\\.\\.\\.") {
// Line continuation has no token, it just continues on the next line.
if let Match(word) = self.reader.matches("\\n\\r?") {
if let Match(_) = self.reader.matches("\\n\\r?") {
// There should always be a newline after continuations, so that they can be ignored together.
} else if let Match(word) = self.reader.matches("[^\\n]*\\n\\r?") {
return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new(word)));
return Token(Tokens::Unlexable(UnlexableToken::new(word)));
} else {
// TODO: I don't know yet how to deal with ... followed by end-of-file
panic!()
}
// This is a new line, so there may be indents.
return self.lex_indents();
}
if let Match(word) = self.reader.matches("\\n\\r?") {
if let Match(_) = self.reader.matches("\\n\\r?") {
// Newline WITHOUT line continuation.
return MaybeToken::Token(Tokens::EndStatement(EndStatementToken::new_end_line()));
return Token(Tokens::EndStatement(EndStatementToken::new_end_line()));
}
if let Match(word) = self.reader.matches(";") {
if let Match(_) = self.reader.matches(";") {
// Semicolon, which ends a statement.
// Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
self.buffer
.push(Tokens::EndStatement(EndStatementToken::new_semicolon()));
if let Match(word) = self.reader.matches("\\n\\r?") {
if let Match(_) = self.reader.matches("\\n\\r?") {
// If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
// This will return the queue of tokens, including the semicolon.
return self.lex_indents();
}
// No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
return MaybeToken::Token(self.buffer.pop().unwrap());
return Token(self.buffer.pop().unwrap());
}
//
// Indentation done; do the rest of lexing.
//
if let Match(word) = self.reader.matches("(") {
return MaybeToken::Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new()));
// Parse identifers and keywords. This assumes that keywords are a subset of identifiers.
if let Match(word) = self.reader.matches(IdentifierToken::subpattern()) {
// later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
return Token(Tokens::Keyword(keyword));
}
return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap()));
}
if let Match(word) = self.reader.matches(")") {
return MaybeToken::Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new()));
// Literal
// todo
// if let Match(word) = self.reader.matches(LiteralToken::subpattern()) {
// return Token(LiteralToken::Literal(IdentifierToken::from_str(word).unwrap()));
// }
// Operator
// todo
// Association
// todo
// Grouping symbols
if let Match(_) = self.reader.matches("(") {
return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new()));
}
if let Match(_) = self.reader.matches(")") {
return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new()));
}

// TODO: a lot more

// TODO: specify the unlexable word
return MaybeToken::Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned())));
return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned())));
}
}
4 changes: 4 additions & 0 deletions src/mango/token/tokens/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ impl IdentifierToken {
pub fn from_name(name: Name) -> Self {
IdentifierToken { name }
}

pub fn subpattern() -> &'static str {
Name::subpattern()
}
}

impl ToText for IdentifierToken {
Expand Down
9 changes: 8 additions & 1 deletion src/mango/util/strtype/name.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@ use std::fmt;
use std::sync::Mutex;
use string_interner::StringInterner;

const VALID_IDENTIFIER_SUBPATTERN: &'static str = r"[a-zA-Z_][a-zA-Z0-9_]*";
lazy_static! {
static ref VALID_IDENTIFIER: Regex = Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*$").unwrap();
static ref VALID_IDENTIFIER: Regex =
Regex::new(&format!("{}{}{}", r"^", VALID_IDENTIFIER_SUBPATTERN, r"$")).unwrap();
}

// TODO: this alias just for https://github.com/rust-lang-nursery/rustfmt/issues/2610
Expand Down Expand Up @@ -38,6 +40,11 @@ impl Name {
.unwrap()
.to_string()
}

/// Generate an eager subpattern to match names, that can be composed in a regular expression.
pub fn subpattern() -> &'static str {
&VALID_IDENTIFIER_SUBPATTERN.clone()
}
}

impl fmt::Display for Name {
Expand Down

0 comments on commit ea5a4cd

Please sign in to comment.