Progress on rewriting lexer #52
mverleg committed Jun 14, 2018
1 parent e5ce31c commit f95c892
Showing 1 changed file with 116 additions and 114 deletions.
230 changes: 116 additions & 114 deletions src/mango/lexing/gen_code_lexer.rs
@@ -37,6 +37,7 @@ use mango::util::strslice::slice::glyphat;
//}

struct Container<G: Generator<Yield = Tokens, Return = ()>> {
indent: i32,
delegate: Option<Box<Lexer>>,
reader: Rc<RefCell<Reader>>,
generator: G,
@@ -46,6 +47,7 @@ impl Container<Box<Generator<Yield = Tokens, Return = ()>>> {

fn lex_indents(&mut self) -> Vec<Tokens> {
let mut line_indent = 0;
let mut res = Vec::with_capacity(12);
while let Match(_) = self.reader.borrow_mut().matches("\\t") {
line_indent += 1;
}
@@ -54,11 +56,9 @@ impl Container<Box<Generator<Yield = Tokens, Return = ()>>> {
// TODO: turn this "new" into a constant
if let Match(_) = self.reader.borrow_mut().matches("end") {
// If this is followed by an 'end' keyword, then that 'end' is redundant.
self.buffer
.push(Tokens::EndBlock(EndBlockToken::new(true, true)));
yield Tokens::EndBlock(EndBlockToken::new(true, true));
} else {
self.buffer
.push(Tokens::EndBlock(EndBlockToken::new(true, false)));
yield Tokens::EndBlock(EndBlockToken::new(true, false));
}
}
for _ in self.indent..line_indent {
@@ -72,6 +72,7 @@ impl Container<Box<Generator<Yield = Tokens, Return = ()>>> {
pub fn new(&mut self, reader: Rc<RefCell<Reader>>) -> Box<Self> {
let q = 42;
Box::new(Container {
indent: 0,
reader: reader,
delegate: Option::None,
generator: Box::new(move || {
@@ -91,116 +92,117 @@ impl Container<Box<Generator<Yield = Tokens, Return = ()>>> {
}
}

// TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes)
let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\.");
if let Match(_) = continue_match_res {
// Line continuation has no token, it just continues on the next line.
let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = newline_match_res {
// There should always be a newline after continuations, so that they can be ignored together.
} else {
// All the text between ... and the end of the line is unlexable.
let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?");
if let Match(word) = newline_match_res {
yield Tokens::Unlexable(UnlexableToken::new(word));
// This is a new line, so there may be indents.
// TODO: is there any yield-from like Python?
for res in self.lex_indents() {
yield res;
}
} else {
// TODO: I don't know yet how to deal with '...' followed by end-of-file
panic!()
}
}
// TODO: are continues necessary? it seems more state-independent to restart for each token
continue;
}
let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = newline_match_res {
// Newline WITHOUT line continuation.
// This is a new line, so there may be indents.
yield Tokens::EndStatement(EndStatementToken::new_end_line());
for res in self.lex_indents() {
yield res;
}
continue;
}
let end_statement_match_res = self.reader.borrow_mut().matches(";");
if let Match(_) = end_statement_match_res {
// Semicolon, which ends a statement.
// Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
yield Tokens::EndStatement(EndStatementToken::new_semicolon());
let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?");
if let Match(_) = end_line_match_res {
// If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
// This will return the queue of tokens, including the semicolon.
for res in self.lex_indents() {
yield res;
}
}
// No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
continue;
}

//
// Indentation done; do the rest of lexing.
//
// Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern());
if let Match(word) = word_match_res {
// Check if it is a keyword.
// TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
yield Tokens::Keyword(keyword);
}
yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap());
continue;
}
// String literal (delegated).
let string_match_res = self.reader.borrow_mut().matches("[a-z]?\"");
if let Match(_) = string_match_res {
let sublexer: Box<Lexer> = Box::new(StringLexer::new_double_quoted(self.reader.clone()));
self.delegate = Option::Some(sublexer);
continue;
}
// Association (before operator).
let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern());
if let Match(token) = association_match_res {
if glyphat(token, -1) == "=" {
yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO
} else {
yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap());
}
continue;
}
// Operator.
let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern());
if let Match(token) = operator_match_res {
yield Tokens::Operator(OperatorToken::from_str(&token).unwrap());
continue;
}
// Grouping symbols
if let Match(_) = self.reader.borrow_mut().matches(r"\(") {
yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new());
continue;
}
if let Match(_) = self.reader.borrow_mut().matches(r"\)") {
yield Tokens::ParenthesisClose(ParenthesisCloseToken::new());
continue;
}


let unknown_word = self.reader.borrow_mut().matches("[^\\s]+");
match unknown_word {
Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)),
NoMatch() => panic!("Do not know how to proceed with parsing"),
EOF() => {
// TODO: also dedent and end statement here
return
}
}
continue;
// // TODO: see if all these match_res can be removed (they couldn't before due to borrowchecker, even with non-lexical lifetimes)
// let continue_match_res = self.reader.borrow_mut().matches("\\.\\.\\.");
// if let Match(_) = continue_match_res {
// // Line continuation has no token, it just continues on the next line.
// let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?");
// if let Match(_) = newline_match_res {
// // There should always be a newline after continuations, so that they can be ignored together.
// } else {
// // All the text between ... and the end of the line is unlexable.
// let newline_match_res = self.reader.borrow_mut().matches("[^\\n]*\\n\\r?");
// if let Match(word) = newline_match_res {
// yield Tokens::Unlexable(UnlexableToken::new(word));
// // This is a new line, so there may be indents.
// // TODO: is there any yield-from like Python?
// for res in self.lex_indents() {
// yield res;
// }
// } else {
// // TODO: I don't know yet how to deal with '...' followed by end-of-file
// panic!()
// }
// }
// // TODO: are continues necessary? it seems more state-independent to restart for each token
// continue;
// }
// let newline_match_res = self.reader.borrow_mut().matches("\\n\\r?");
// if let Match(_) = newline_match_res {
// // Newline WITHOUT line continuation.
// // This is a new line, so there may be indents.
// yield Tokens::EndStatement(EndStatementToken::new_end_line());
// for res in self.lex_indents() {
// yield res;
// }
// continue;
// }
// let end_statement_match_res = self.reader.borrow_mut().matches(";");
// if let Match(_) = end_statement_match_res {
// // Semicolon, which ends a statement.
// // Need to do some extra work with buffer, because there may be a newline followed by indentation, which ; should precede.
// yield Tokens::EndStatement(EndStatementToken::new_semicolon());
// let end_line_match_res = self.reader.borrow_mut().matches("\\n\\r?");
// if let Match(_) = end_line_match_res {
// // If semicolon is followed by a newline (redundant), then we need to deal with indents (but ignore the newline itself).
// // This will return the queue of tokens, including the semicolon.
// for res in self.lex_indents() {
// yield res;
// }
// }
// // No newline, can just return the semicolon (which is certainly on the queue, and should be the only thing, but it is fine here if not).
// continue;
// }
//
// //
// // Indentation done; do the rest of lexing.
// //
// // Parse identifiers and keywords. This assumes that keywords are a subset of identifiers.
// let word_match_res = self.reader.borrow_mut().matches(IdentifierToken::subpattern());
// if let Match(word) = word_match_res {
// // Check if it is a keyword.
// // TODO: maybe turn identifier into keyword to avoid a string copy? kind of elaborate...
// if word == "end" {
// yield Tokens::EndBlock(EndBlockToken::new(false, true));
// } else if let Ok(keyword) = KeywordToken::from_str(word.clone()) {
// yield Tokens::Keyword(keyword);
// }
// yield Tokens::Identifier(IdentifierToken::from_str(word).unwrap());
// continue;
// }
// // String literal (delegated).
// let string_match_res = self.reader.borrow_mut().matches("[a-z]?\"");
// if let Match(_) = string_match_res {
// let sublexer: Box<Lexer> = Box::new(StringLexer::new_double_quoted(self.reader.clone()));
// self.delegate = Option::Some(sublexer);
// continue;
// }
// // Association (before operator).
// let association_match_res = self.reader.borrow_mut().matches(&AssociationToken::subpattern());
// if let Match(token) = association_match_res {
// if glyphat(token, -1) == "=" {
// yield Tokens::Association(AssociationToken::from_unprefixed()); // TODO
// } else {
// yield Tokens::Association(AssociationToken::from_str(charsliceto(token, -1)).unwrap());
// }
// continue;
// }
// // Operator.
// let operator_match_res = self.reader.borrow_mut().matches(OperatorToken::subpattern());
// if let Match(token) = operator_match_res {
// yield Tokens::Operator(OperatorToken::from_str(&token).unwrap());
// continue;
// }
// // Grouping symbols
// if let Match(_) = self.reader.borrow_mut().matches(r"\(") {
// yield Tokens::ParenthesisOpen(ParenthesisOpenToken::new());
// continue;
// }
// if let Match(_) = self.reader.borrow_mut().matches(r"\)") {
// yield Tokens::ParenthesisClose(ParenthesisCloseToken::new());
// continue;
// }
//
//
// let unknown_word = self.reader.borrow_mut().matches("[^\\s]+");
// match unknown_word {
// Match(word) => yield Tokens::Unlexable(UnlexableToken::new(word)),
// NoMatch() => panic!("Do not know how to proceed with parsing"),
// EOF() => {
// // TODO: also dedent and end statement here
// return
// }
// }
}

}),

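The rewrite replaces the old buffered `self.buffer.push(...)` calls with direct `yield`s from a generator, so the caller pulls tokens one at a time and the indent bookkeeping no longer needs an explicit token queue. Since Rust's `Generator` trait is nightly-only and its `resume` signature has changed between releases, the following is a small stable-Rust sketch of the same pull-based consumption pattern, expressed with `Iterator`; the `Token` and `CodeLexer` names (and the field names on `EndBlock`) are illustrative placeholders, not the actual mango types.

// Stable-Rust sketch of the pull-based pattern the generator rewrite moves
// towards: the lexer produces tokens one at a time and the caller keeps
// pulling until the stream ends. These types are placeholders, not the
// real mango token types.

use std::collections::VecDeque;

#[derive(Debug)]
enum Token {
    Identifier(String),
    EndStatement,
    EndBlock { is_dedent: bool, is_end_keyword: bool },
}

struct CodeLexer {
    // Stands in for the suspended generator state; the real code would
    // resume the generator instead of popping from a queue.
    pending: VecDeque<Token>,
}

impl Iterator for CodeLexer {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        self.pending.pop_front()
    }
}

fn main() {
    let lexer = CodeLexer {
        pending: VecDeque::from(vec![
            Token::Identifier("use".to_owned()),
            Token::EndStatement,
            Token::EndBlock { is_dedent: true, is_end_keyword: false },
        ]),
    };
    // The driver loop: equivalent to repeatedly resuming the generator
    // and handling each yielded token.
    for token in lexer {
        println!("{:?}", token);
    }
}

On nightly, the same driver would repeatedly call resume on the boxed generator stored in `Container` and match on the yielded/returned state instead of calling `next`.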