Skip to content

Commit

Permalink
Restructured lexing using generator, now just borrow/type problem #52
Browse files Browse the repository at this point in the history
  • Loading branch information
mverleg committed Jun 12, 2018
1 parent 3c85472 commit e5ce31c
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 103 deletions.
251 changes: 149 additions & 102 deletions src/mango/lexing/gen_code_lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ use mango::util::collection::Queue;
use std::cell::RefCell;
use std::ops::{Generator, GeneratorState};
use std::rc::Rc;
use std::borrow::BorrowMut;
use mango::util::strslice::charsliceto;
use mango::util::strslice::slice::glyphat;

/// This generator does the real lexing work, but is wrapped in a normal
/// class to satisfy an interface that doesn't expose nightly or unsafe features.
Expand All @@ -34,126 +37,170 @@ use std::rc::Rc;
//}

/// State shared between the public lexer wrapper and the generator that does
/// the actual lexing work (the generator yields `Tokens` and returns `()` at
/// end of input). NOTE(review): this type is mid-refactor per the commit
/// message ("now just borrow/type problem"); the fields are captured/read from
/// inside `generator`, which is the outstanding borrow issue.
struct Container<G: Generator<Yield = Tokens, Return = ()>> {
    // Sub-lexer currently delegated to (e.g. a StringLexer for string
    // literals), if any; `None` means this lexer handles input itself.
    delegate: Option<Box<Lexer>>,
    // Shared input reader; `Rc<RefCell<..>>` because the sub-lexer holds a
    // clone of the same reader and both need mutable access.
    reader: Rc<RefCell<Reader>>,
    // The generator that produces the token stream.
    generator: G,
}

impl Container<Box<Generator<Yield = Tokens, Return = ()>>> {
pub fn new(reader: Box<Reader>) -> Box<Self> {

fn lex_indents(&mut self) -> Vec<Tokens> {
let mut line_indent = 0;
while let Match(_) = self.reader.borrow_mut().matches("\\t") {
line_indent += 1;
}
for _ in line_indent..self.indent {
// This line is dedented, make end tokens.
// TODO: turn this "new" into a constant
if let Match(_) = self.reader.borrow_mut().matches("end") {
// If this is followed by an 'end' keyword, then that 'end' is redundant.
self.buffer
.push(Tokens::EndBlock(EndBlockToken::new(true, true)));
} else {
self.buffer
.push(Tokens::EndBlock(EndBlockToken::new(true, false)));
}
}
for _ in self.indent..line_indent {
// This line is indented, make start tokens.
self.buffer.push(Tokens::StartBlock(StartBlockToken::new()));
}
self.indent = line_indent;
self.lex()
}

pub fn new(&mut self, reader: Rc<RefCell<Reader>>) -> Box<Self> {
let q = 42;
Box::new(Container {
reader: reader,
delegate: Option::None,
generator: Box::new(move || {

// If there is a buffer due to indentation or continuations, return from that.
if let Some(token) = self.buffer.pop() {
yield token;
}
// Past this point, we assume that the buffer is empty. When adding stuff, pop it or re-enter lex() soon.
let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\.");
if let Match(_) = continue_match_res {
// Line continuation has no token, it just continues on the next line.
loop {

// Delegate to another lexer if one is set.
if let Option::Some(delegate) = self.delegate {
match delegate.lex() {
MaybeToken::Token(token) => {
yield token;
continue;
}
MaybeToken::End => {
self.delegate = Option::None;
}
}
}

// TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes)
let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\.");
if let Match(_) = continue_match_res {
// Line continuation has no token, it just continues on the next line.
let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = newline_match_res {
// There should always be a newline after continuations, so that they can be ignored together.
} else {
// All the text between ... and the end of the line is unlexable.
let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?");
if let Match(word) = newline_match_res {
yield Tokens::Unlexable(UnlexableToken::new(word));
// This is a new line, so there may be indents.
// TODO: is there any yield-from like Python?
for res in self.lex_indents() {
yield res;
}
} else {
// TODO: I don't know yet how to deal with '...' followed by end-of-file
panic!()
}
}
// TODO: are continues necessary? it seems more state-independent to restart for each token
continue;
}
let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = newline_match_res {
// There should always be a newline after continuations, so that they can be ignored together.
} else {
let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?");
if let Match(word) = newline_match_res {
self.buffer
.push(Tokens::Unlexable(UnlexableToken::new(word)));
// This is a new line, so there may be indents.
self.lex_indents();
yield self.lex();
// Newline WITHOUT line continuation.
// This is a new line, so there may be indents.
yield Tokens::EndStatement(EndStatementToken::new_end_line());
for res in self.lex_indents() {
yield res;
}
continue;
}
let end_statement_match_res = self.reader.borrow_mut().matches(";");
if let Match(_) = end_statement_match_res {
// Semicolon, which ends a statement.
// Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
yield Tokens::EndStatement(EndStatementToken::new_semicolon());
let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = end_line_match_res {
// If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
// This will return the queue of tokens, including the semicolon.
for res in self.lex_indents() {
yield res;
}
}
// No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
continue;
}

//
// Indentation done; do the rest of lexing.
//
// Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern());
if let Match(word) = word_match_res {
// Check if it is a keyword.
// TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
yield Tokens::Keyword(keyword);
}
yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap());
continue;
}
// String literal (delegated).
let string_match_res = self.reader.borrow_mut().matches("[a-z]?\"");
if let Match(_) = string_match_res {
let sublexer: Box<Lexer> = Box::new(StringLexer::new_double_quoted(self.reader.clone()));
self.delegate = Option::Some(sublexer);
continue;
}
// Association (before operator).
let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern());
if let Match(token) = association_match_res {
if glyphat(token, -1) == "=" {
yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO
} else {
// TODO: I don't know yet how to deal with '...' followed by end-of-file
panic!()
yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap());
}
continue;
}
}
let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = newline_match_res {
// Newline WITHOUT line continuation.
// This is a new line, so there may be indents.
self.buffer
.push(Tokens::EndStatement(EndStatementToken::new_end_line()));
self.lex_indents();
yield self.lex();
}
let end_statement_match_res = self.reader.borrow_mut().matches(";");
if let Match(_) = end_statement_match_res {
// Semicolon, which ends a statement.
// Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
self.buffer
.push(Tokens::EndStatement(EndStatementToken::new_semicolon()));
let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = end_line_match_res {
// If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
// This will return the queue of tokens, including the semicolon.
yield self.lex_indents();
// Operator.
let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern());
if let Match(token) = operator_match_res {
yield Tokens::Operator(OperatorToken::from_str(&token).unwrap());
continue;
}
// No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
yield self.buffer.pop().unwrap();
}
//
// Indentation done; do the rest of lexing.
//
// Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
if let Match(word) = self
.reader
.borrow_mut()
.matches(IdentifierToken::subpattern())
{
// later: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
yield Tokens::Keyword(keyword);
// Grouping symbols
if let Match(_) = self.reader.borrow_mut().matches(r"\(") {
yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new());
continue;
}
yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap());
}
// Literal
let string_match_res = self.reader.borrow_mut().matches("[a-z]?\"");
if let Match(_) = string_match_res {
let sublexer: Box<Lexer> =
Box::new(StringLexer::new_double_quoted(self.reader.clone()));
self.reader_or_delegate = ReaderOrDelegate::Delegate(sublexer);
yield self.lex();
}
// Association (before operator)
let association_match_res = self
.reader
.borrow_mut()
.matches(&AssociationToken::subpattern());
if let Match(token) = association_match_res {
if token.chars().last().unwrap() == '=' {
// return Token(Tokens::Association(AssociationToken::from_str(token[..1]).unwrap()));
yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO
} else {
yield Tokens::Association(AssociationToken::from_unprefixed());
if let Match(_) = self.reader.borrow_mut().matches(r"\)") {
yield Tokens::ParenthesisClose(ParenthesisCloseToken::new());
continue;
}
}
// Operator
let operator_match_res = self
.reader
.borrow_mut()
.matches(OperatorToken::subpattern());
if let Match(token) = operator_match_res {
yield Tokens::Operator(OperatorToken::from_str(&token).unwrap());
}
// Grouping symbols
if let Match(_) = self.reader.borrow_mut().matches(r"\(") {
yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new());
}
if let Match(_) = self.reader.borrow_mut().matches(r"\)") {
yield Tokens::ParenthesisClose(ParenthesisCloseToken::new());
}

let unknown_word = self.reader.borrow_mut().matches("[^\\s]+");
match unknown_word {
Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)),
NoMatch() => {
panic!("Do not know how to proceed with parsing")
}
EOF() => {
// TODO: also dedent and end statement here
return

let unknown_word = self.reader.borrow_mut().matches("[^\\s]+");
match unknown_word {
Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)),
NoMatch() => panic!("Do not know how to proceed with parsing"),
EOF() => {
// TODO: also dedent and end statement here
return
}
}
continue;
}

}),
Expand Down
7 changes: 6 additions & 1 deletion src/mango/util/strslice/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ pub fn charsliceto<S: Into<String>>(text: S, end: isize) -> String {
charslice(text, 0, end)
}

/// Return the single character of `text` at position `pos` as an owned
/// `String`, implemented as the one-character slice `[pos, pos + 1)`.
/// Negative `pos` is intended to index from the end, mirroring the Python-3
/// slicing behaviour the surrounding tests describe.
/// NOTE(review): for `pos == -1` the end bound `pos + 1` becomes `0`, which
/// under Python-style indexing denotes the string start and would make the
/// slice empty instead of returning the last glyph — confirm how `charslice`
/// treats a zero end paired with a negative start (the `glyphat("你好!", -2)`
/// assertion in the tests also looks inconsistent with plain Python slicing).
pub fn glyphat<S: Into<String>>(text: S, pos: isize) -> String {
    charslice(text, pos, pos+1)
}

#[cfg(test)]
mod tests {
use super::*;
Expand All @@ -58,9 +62,10 @@ mod tests {
assert_eq!("你好", charslice("你好!", 0, 2));
assert_eq!("!", charslicefrom("你好!", 2));
assert_eq!("你好", charsliceto("你好!", 2));
assert_eq!("好", glyphat("你好!", 1));
// Negative indices should match Python 3 behaviour:
assert_eq!("你好", charslice("你好!", -3, -1));
assert_eq!("!", charslicefrom("你好!", -1));
assert_eq!("你好", charsliceto("你好!", -1));
assert_eq!("", glyphat("你好!", -2));
}
}

0 comments on commit e5ce31c

Please sign in to comment.