Progress on sublexers and new tokens, fighting borrow rules #56
mverleg committed May 22, 2018
1 parent ea5a4cd commit 03b152a
Showing 5 changed files with 170 additions and 72 deletions.
184 changes: 114 additions & 70 deletions src/mango/lexing/code_lexer.rs
@@ -1,5 +1,6 @@
use mango::io::typ::Reader;
use mango::io::typ::ReaderResult::*;
use mango::lexing::string_lexer::StringLexer;
use mango::lexing::typ::Lexer;
use mango::lexing::typ::MaybeToken;
use mango::token::special::UnlexableToken;
@@ -13,31 +14,49 @@ use mango::token::tokens::StartBlockToken;
use mango::token::Tokens;
use mango::util::collection::Queue;

pub struct CodeLexer<'r> {
reader: &'r mut Reader,
enum ReaderOrDelegate {
Reader(Box<Reader>),
Delegate(Box<Lexer>),
}

impl ReaderOrDelegate {
fn end_delegation(&mut self) {
*self = match self {
ReaderOrDelegate::Delegate(delegate) => ReaderOrDelegate::Reader(delegate.consume()),
ReaderOrDelegate::Reader(reader) => ReaderOrDelegate::Reader(*reader),
}
}
}

pub struct CodeLexer {
// reader: Option<&'r mut Reader>,
indent: i32,
// TODO: both of the next two would be unnecessary with generators...
// This delegate deals with nested structures, like string literals and comments.
// delegate: Option<&'r mut Lexer<'r>>,
reader_or_delegate: ReaderOrDelegate,
    // This is unfortunate; it would not be needed with 'yield', but for now it is used for indents.
buffer: Queue<Tokens>,
}

impl<'r> CodeLexer<'r> {
fn new(reader: &'r mut Reader) -> Self {
impl CodeLexer {
fn new(reader: Box<Reader>) -> Self {
CodeLexer {
reader,
reader_or_delegate: ReaderOrDelegate::Reader(reader),
indent: 0,
buffer: Queue::new(),
}
}

fn lex_indents(&mut self) -> MaybeToken {
fn lex_indents(&mut self, reader: &mut Box<Reader>) -> MaybeToken {
let mut line_indent = 0;
while let Match(_) = self.reader.matches("\\t") {
while let Match(_) = reader.matches("\\t") {
line_indent += 1;
}
for _ in line_indent..self.indent {
// This line is dedented, make end tokens.
// TODO: turn this "new" into a constant
if let Match(_) = self.reader.matches("end") {
if let Match(_) = reader.matches("end") {
// If this is followed by an 'end' keyword, then that 'end' is redundant.
self.buffer
.push(Tokens::EndBlock(EndBlockToken::new(true, true)));
@@ -55,74 +74,99 @@ impl<'r> CodeLexer<'r> {
}
}

impl<'r> Lexer<'r> for CodeLexer<'r> {
impl Lexer for CodeLexer {
fn lex(&mut self) -> MaybeToken {
use self::MaybeToken::*;

// If there is a buffer due to indentation or continuations, return from that.
if let Some(token) = self.buffer.pop() {
return Token(token);
}
        // Past this point, we assume that the buffer is empty. When adding stuff, pop it or re-enter lex() soon.
if let Match(_) = self.reader.matches("\\.\\.\\.") {
// Line continuation has no token, it just continues on the next line.
if let Match(_) = self.reader.matches("\\n\\r?") {
// There should always be a newline after continuations, so that they can be ignored together.
} else if let Match(word) = self.reader.matches("[^\\n]*\\n\\r?") {
return Token(Tokens::Unlexable(UnlexableToken::new(word)));
} else {
// TODO: I don't know yet how to deal with ... followed by end-of-file
panic!()
}
// This is a new line, so there may be indents.
return self.lex_indents();
}
if let Match(_) = self.reader.matches("\\n\\r?") {
// Newline WITHOUT line continuation.
return Token(Tokens::EndStatement(EndStatementToken::new_end_line()));
}
if let Match(_) = self.reader.matches(";") {
// Semicolon, which ends a statement.
// Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
self.buffer
.push(Tokens::EndStatement(EndStatementToken::new_semicolon()));
if let Match(_) = self.reader.matches("\\n\\r?") {
// If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
// This will return the queue of tokens, including the semicolon.
return self.lex_indents();
// If currently delegating to a sub-lexer, return from that.
match self.reader_or_delegate {
ReaderOrDelegate::Delegate(ref mut delegate) => {
match delegate.lex() {
Token(token) => Token(token),
End => self.lex(),
}
// Code to stop delegation cannot be here, because `self` is still mutably borrowed through `delegate`
}
// No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
return Token(self.buffer.pop().unwrap());
}
//
// Indentation done; do the rest of lexing.
//
        // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
if let Match(word) = self.reader.matches(IdentifierToken::subpattern()) {
// later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
return Token(Tokens::Keyword(keyword));
ReaderOrDelegate::Reader(mut reader) => {
// todo: maybe this branch could be a separate function?

// If there is a buffer due to indentation or continuations, return from that.
if let Some(token) = self.buffer.pop() {
return Token(token);
}
                // Past this point, we assume that the buffer is empty. When adding stuff, pop it or re-enter lex() soon.
if let Match(_) = reader.matches("\\.\\.\\.") {
// Line continuation has no token, it just continues on the next line.
if let Match(_) = reader.matches("\\n\\r?") {
// There should always be a newline after continuations, so that they can be ignored together.
} else if let Match(word) = reader.matches("[^\\n]*\\n\\r?") {
return Token(Tokens::Unlexable(UnlexableToken::new(word)));
} else {
// TODO: I don't know yet how to deal with ... followed by end-of-file
panic!()
}
// This is a new line, so there may be indents.
return self.lex_indents(&mut reader);
}
if let Match(_) = reader.matches("\\n\\r?") {
// Newline WITHOUT line continuation.
return Token(Tokens::EndStatement(EndStatementToken::new_end_line()));
}
if let Match(_) = reader.matches(";") {
// Semicolon, which ends a statement.
// Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
self.buffer
.push(Tokens::EndStatement(EndStatementToken::new_semicolon()));
if let Match(_) = reader.matches("\\n\\r?") {
// If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
// This will return the queue of tokens, including the semicolon.
return self.lex_indents(&mut reader);
}
// No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
return Token(self.buffer.pop().unwrap());
}
//
// Indentation done; do the rest of lexing.
//
                // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
if let Match(word) = reader.matches(IdentifierToken::subpattern()) {
// later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
return Token(Tokens::Keyword(keyword));
}
return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap()));
}
// Literal
if let Match(word) = reader.matches("[a-z]?\"") {
// TODO: need to keep delegating to this until it exhausts, how to do that?
self.reader_or_delegate = ReaderOrDelegate::Delegate(Box::new(
StringLexer::new_double_quoted(reader),
));
return self.lex();
}
// Operator
// todo
// Association
// todo
// Grouping symbols
if let Match(_) = reader.matches("(") {
return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new()));
}
if let Match(_) = reader.matches(")") {
return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new()));
}

// TODO: specify the unlexable word
return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned())));
}
return Token(Tokens::Identifier(IdentifierToken::from_str(word).unwrap()));
}
// Literal
// todo
// if let Match(word) = self.reader.matches(LiteralToken::subpattern()) {
// return Token(LiteralToken::Literal(IdentifierToken::from_str(word).unwrap()));
// }
// Operator
// todo
// Association
// todo
// Grouping symbols
if let Match(_) = self.reader.matches("(") {
return Token(Tokens::ParenthesisOpen(ParenthesisOpenToken::new()));
}
if let Match(_) = self.reader.matches(")") {
return Token(Tokens::ParenthesisClose(ParenthesisCloseToken::new()));
}
}

// TODO: specify the unlexable word
return Token(Tokens::Unlexable(UnlexableToken::new("TODO".to_owned())));
fn consume(self) -> Box<Reader> {
assert!(false, "I do not think this is ever called, is it?");
match self.reader_or_delegate {
ReaderOrDelegate::Reader(reader) => reader,
ReaderOrDelegate::Delegate(delegate) => delegate.consume(),
}
}
}
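
The `end_delegation` swap in the new `ReaderOrDelegate` enum is where the borrow rules from the commit title bite: matching on `*self` moves out of data that is only mutably borrowed, and `delegate.consume()` needs the delegate by value. Below is a minimal, self-contained sketch of one way around that, using `std::mem::replace` with a placeholder value. The `Reader`/`Lexer` traits and `DummyReader` here are stand-ins for illustration, not the real mango types, and `consume` is assumed to take `self: Box<Self>` so it stays callable on a boxed trait object.

use std::mem;

// Stand-ins for the real Reader / Lexer traits (assumptions for this sketch only).
trait Reader {}
trait Lexer {
    // A boxed receiver keeps `consume` callable through a `Box<dyn Lexer>` trait object.
    fn consume(self: Box<Self>) -> Box<dyn Reader>;
}

struct DummyReader;
impl Reader for DummyReader {}

enum ReaderOrDelegate {
    Reader(Box<dyn Reader>),
    Delegate(Box<dyn Lexer>),
}

impl ReaderOrDelegate {
    fn end_delegation(&mut self) {
        // Swap a cheap placeholder into `self` to take ownership of the old
        // state without moving out of the &mut borrow.
        let old = mem::replace(self, ReaderOrDelegate::Reader(Box::new(DummyReader)));
        *self = match old {
            ReaderOrDelegate::Delegate(delegate) => ReaderOrDelegate::Reader(delegate.consume()),
            // Already reading directly; put the reader straight back.
            other => other,
        };
    }
}

An alternative with the same effect is to wrap the field in an `Option` and use `Option::take()`, which avoids inventing a placeholder value.
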
2 changes: 2 additions & 0 deletions src/mango/lexing/mod.rs
@@ -3,3 +3,5 @@ pub mod typ;
pub mod code_lexer;

pub mod comment_lexer;

pub mod string_lexer;
45 changes: 45 additions & 0 deletions src/mango/lexing/string_lexer.rs
@@ -0,0 +1,45 @@
use mango::io::typ::Reader;
use mango::io::typ::ReaderResult::*;
use mango::lexing::typ::Lexer;
use mango::lexing::typ::MaybeToken;
use mango::token::tokens::LiteralToken;
use mango::token::Tokens;

pub enum StringType {
SingleQuotedInline,
DoubleQuotedInline,
MultiLine,
}

/// Lexes a string literal token.
// Starts after the opening quote and is expected to consume until the closing quote.
pub struct StringLexer {
reader: Box<Reader>,
typ: StringType,
}

impl StringLexer {
// TODO: support other types of strings
pub fn new_double_quoted(reader: Box<Reader>) -> Self {
StringLexer {
reader,
typ: StringType::DoubleQuotedInline,
}
}
}

impl Lexer for StringLexer {
fn lex(&mut self) -> MaybeToken {
// TODO: doesn't handle escaping etc at all now
// TODO: this is going to have a problem if `matches` automatically eats whitespace
match self.reader.matches("[^\"\\n]*") {
Match(value) => return MaybeToken::Token(Tokens::Literal(LiteralToken::string(value))),
NoMatch() => panic!("failed to parse string"), // This can't really go wrong since empty pattern matches
EOF() => return MaybeToken::Token(Tokens::Literal(LiteralToken::string("".to_owned()))), // Unclosed string literal, let code parser deal with it
}
}

fn consume(self) -> Box<Reader> {
self.reader
}
}
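
For reference, the round trip the new `StringLexer` is meant to support looks like this: the outer lexer hands over its boxed reader, calls `lex()` on the delegate until it reports `End`, then calls `consume()` to reclaim the reader. A toy, self-contained sketch of that contract follows; the character-based `Reader`, `StrReader`, `CharLexer`, and string tokens are made-up stand-ins, not the mango types, and `consume` again takes `self: Box<Self>` so the delegate can be driven as a trait object.

// Stand-in reader that yields characters from a string.
trait Reader {
    fn next_char(&mut self) -> Option<char>;
}

struct StrReader {
    chars: Vec<char>,
    pos: usize,
}

impl Reader for StrReader {
    fn next_char(&mut self) -> Option<char> {
        let c = self.chars.get(self.pos).cloned();
        self.pos += 1;
        c
    }
}

enum MaybeToken {
    Token(String), // stand-in for Tokens
    End,
}

trait Lexer {
    fn lex(&mut self) -> MaybeToken;
    fn consume(self: Box<Self>) -> Box<dyn Reader>;
}

// Toy sub-lexer: emits one token per remaining character, like a very small StringLexer.
struct CharLexer {
    reader: Box<dyn Reader>,
}

impl Lexer for CharLexer {
    fn lex(&mut self) -> MaybeToken {
        match self.reader.next_char() {
            Some(c) => MaybeToken::Token(c.to_string()),
            None => MaybeToken::End,
        }
    }
    fn consume(self: Box<Self>) -> Box<dyn Reader> {
        self.reader
    }
}

fn main() {
    let reader: Box<dyn Reader> = Box::new(StrReader { chars: "ab".chars().collect(), pos: 0 });
    // The outer lexer would store this in ReaderOrDelegate::Delegate.
    let mut delegate: Box<dyn Lexer> = Box::new(CharLexer { reader });
    while let MaybeToken::Token(t) = delegate.lex() {
        println!("sub-lexer token: {}", t);
    }
    // End reached: reclaim the reader, as end_delegation() intends to do.
    let _reader_back: Box<dyn Reader> = delegate.consume();
}
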
7 changes: 5 additions & 2 deletions src/mango/lexing/typ.rs
@@ -1,14 +1,17 @@
use mango::io::typ::Reader;
use mango::token::Tokens;

pub enum MaybeToken {
Token(Tokens),
End(),
End,
}

pub trait Lexer<'r> {
pub trait Lexer {
// /// Create a new lexer from a reader instance.
// fn new(reader: &'r mut Reader) -> Self;

/// Every call to lex returns a token until the end of the input.
fn lex(&mut self) -> MaybeToken;

fn consume(self) -> Box<Reader>;
}
4 changes: 4 additions & 0 deletions src/mango/token/tokens/literal.rs
@@ -25,6 +25,10 @@ impl LiteralToken {
pub fn real(value: f64) -> LiteralToken {
LiteralToken::Real(f64eq::new(value))
}

pub fn subpattern_int() -> &'static str {
"[a-z]?\""
}
}

impl ToText for LiteralToken {
