From 5bf15253a47a8c1bd11fc07d966676ddeb396c3f Mon Sep 17 00:00:00 2001 From: Lucas Date: Tue, 25 Jun 2024 02:03:33 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20(parser):=20Work=20on=20earley?= =?UTF-8?q?=20parser?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit semver: chore --- src/abstract_syntax_tree/mod.rs | 8 +- src/concrete_syntax_tree/grammar.rs | 68 ++++++-------- src/concrete_syntax_tree/mod.rs | 140 ++++++++++++---------------- src/main.rs | 2 + 4 files changed, 92 insertions(+), 126 deletions(-) diff --git a/src/abstract_syntax_tree/mod.rs b/src/abstract_syntax_tree/mod.rs index 9e97e94..3f98aa9 100644 --- a/src/abstract_syntax_tree/mod.rs +++ b/src/abstract_syntax_tree/mod.rs @@ -4,23 +4,23 @@ use crate::diagnostic::Range; pub mod builder; -#[derive(Debug)] +#[derive(PartialEq, Clone, Debug)] pub enum AstractSyntax<'a> { Statement(Statement<'a>), } -#[derive(Debug)] +#[derive(PartialEq, Clone, Debug)] pub enum Statement<'a> { EnumDeclaration(Spanned<'a, EnumDeclaration<'a>>), } -#[derive(Debug)] +#[derive(PartialEq, Clone, Debug)] pub struct EnumDeclaration<'a> { pub identifier: Spanned<'a, &'a str>, pub items: Vec>, } -#[derive(Debug)] +#[derive(PartialEq, Clone, Debug)] pub struct Spanned<'a, T> { pub value: &'a T, pub range: &'a Range<'a>, diff --git a/src/concrete_syntax_tree/grammar.rs b/src/concrete_syntax_tree/grammar.rs index f4fe55e..9b9b076 100644 --- a/src/concrete_syntax_tree/grammar.rs +++ b/src/concrete_syntax_tree/grammar.rs @@ -3,16 +3,22 @@ use std::collections::HashMap; use crate::scanner::token::TokenType; #[derive(Debug, Clone, PartialEq)] -pub enum Term { +pub enum Symbol { Terminal(TokenType), NonTerminal(NonTerminal), + OneOrMore(NonTerminal), + ZeroOrMore(NonTerminal), + Optional(NonTerminal), } -impl std::fmt::Display for Term { +impl std::fmt::Display for Symbol { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Term::Terminal(token_type) => write!(f, "{}", token_type), - Term::NonTerminal(non_terminal) => write!(f, "{}", non_terminal), + Symbol::Terminal(token_type) => write!(f, "{}", token_type), + Symbol::NonTerminal(non_terminal) => write!(f, "{}", non_terminal), + Symbol::OneOrMore(non_terminal) => write!(f, "{}+", non_terminal), + Symbol::ZeroOrMore(non_terminal) => write!(f, "{}*", non_terminal), + Symbol::Optional(non_terminal) => write!(f, "{}?", non_terminal), } } } @@ -20,10 +26,8 @@ impl std::fmt::Display for Term { #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum NonTerminal { Start, - RootItems, RootItem, EnumDeclaration, - EnumItems, EnumItem, } @@ -35,7 +39,7 @@ impl std::fmt::Display for NonTerminal { #[derive(Debug)] pub struct Grammar { - pub rules: HashMap>>, + pub rules: HashMap>>, } impl Default for Grammar { @@ -44,53 +48,33 @@ impl Default for Grammar { rules: HashMap::new(), }; - // start -> root_items + // start -> root_item+ grammar.add_rule( NonTerminal::Start, - vec![Term::NonTerminal(NonTerminal::RootItems)], - ); - // root_items -> root_item root_items | root_item - grammar.add_rules( - NonTerminal::RootItems, - vec![ - vec![ - Term::NonTerminal(NonTerminal::RootItems), - Term::NonTerminal(NonTerminal::RootItem), - ], - vec![Term::NonTerminal(NonTerminal::RootItem)], - ], + vec![Symbol::ZeroOrMore(NonTerminal::RootItem)], ); + // root_item -> enum_declaration grammar.add_rule( NonTerminal::RootItem, - vec![Term::NonTerminal(NonTerminal::EnumDeclaration)], + vec![Symbol::NonTerminal(NonTerminal::EnumDeclaration)], ); - // enum_declaration -> ? + // enum_declaration -> enum_item+ grammar.add_rule( NonTerminal::EnumDeclaration, vec![ - Term::Terminal(TokenType::Enum), - Term::Terminal(TokenType::Identifier), - Term::Terminal(TokenType::Colon), - Term::NonTerminal(NonTerminal::EnumItems), - Term::Terminal(TokenType::Semicolon), - ], - ); - // enum_items -> enum_item enum_items | enum_item - grammar.add_rules( - NonTerminal::EnumItems, - vec![ - vec![ - Term::NonTerminal(NonTerminal::EnumItem), - Term::NonTerminal(NonTerminal::EnumItems), - ], - vec![Term::NonTerminal(NonTerminal::EnumItem)], + Symbol::Terminal(TokenType::Enum), + Symbol::Terminal(TokenType::Identifier), + Symbol::Terminal(TokenType::Colon), + Symbol::OneOrMore(NonTerminal::EnumItem), + Symbol::Terminal(TokenType::Semicolon), ], ); + // enum_item -> grammar.add_rule( NonTerminal::EnumItem, - vec![Term::Terminal(TokenType::Identifier)], + vec![Symbol::Terminal(TokenType::Identifier)], ); grammar @@ -98,15 +82,15 @@ impl Default for Grammar { } impl Grammar { - pub fn add_rule(&mut self, non_terminal: NonTerminal, rule: Vec) { + pub fn add_rule(&mut self, non_terminal: NonTerminal, rule: Vec) { self.rules.entry(non_terminal).or_default().push(rule); } - pub fn add_rules(&mut self, non_terminal: NonTerminal, rules: Vec>) { + pub fn add_rules(&mut self, non_terminal: NonTerminal, rules: Vec>) { self.rules.entry(non_terminal).or_default().extend(rules); } - pub fn get(&self, start: &NonTerminal) -> Option<&Vec>> { + pub fn get(&self, start: &NonTerminal) -> Option<&Vec>> { self.rules.get(start) } } diff --git a/src/concrete_syntax_tree/mod.rs b/src/concrete_syntax_tree/mod.rs index 2bdfef4..52c273a 100644 --- a/src/concrete_syntax_tree/mod.rs +++ b/src/concrete_syntax_tree/mod.rs @@ -1,6 +1,7 @@ -use grammar::{Grammar, NonTerminal, Term}; +use grammar::{Grammar, NonTerminal, Symbol}; use crate::{ + abstract_syntax_tree::AstractSyntax, diagnostic::{Diagnostic, Error, Range}, scanner::token::Token, }; @@ -30,10 +31,11 @@ impl<'a> Chart<'a> { #[derive(Debug, Clone, PartialEq)] pub struct EarleyItem<'a> { pub head: NonTerminal, - pub body: Vec, + pub body: Vec, pub dot: usize, pub start: usize, pub tree: Vec>, + ast_node: Option>, } #[derive(Debug, Clone, PartialEq)] @@ -93,13 +95,14 @@ impl<'a> std::fmt::Display for EarleyItem<'a> { } impl<'a> EarleyItem<'a> { - pub fn new(head: NonTerminal, body: Vec, dot: usize, start: usize) -> Self { + pub fn new(head: NonTerminal, body: Vec, dot: usize, start: usize) -> Self { Self { head, body, dot, start, tree: Vec::new(), + ast_node: None, } } @@ -107,7 +110,7 @@ impl<'a> EarleyItem<'a> { self.dot >= self.body.len() } - pub fn next(&self) -> Option<&Term> { + pub fn next(&self) -> Option<&Symbol> { self.body.get(self.dot) } } @@ -116,14 +119,14 @@ impl<'a> EarleyItem<'a> { pub struct EarleyParser<'a> { grammar: Grammar, chart: Chart<'a>, + diagnostics: Vec>, } impl<'a> EarleyParser<'a> { /// Parses the given input tokens according to the grammar. /// Returns true if the input is accepted by the grammar, otherwise false. pub fn parse(mut self, tokens: &'a [Token]) -> Result, Vec>> { - let mut diagnostics = Vec::new(); - + self.diagnostics = Vec::new(); self.chart = Chart::new(tokens.len()); // Initial state @@ -138,40 +141,7 @@ impl<'a> EarleyParser<'a> { let mut j = 0; if self.chart.states[i].is_empty() { - let expected_symbols: Vec = self - .chart - .states - .get(i - 1) - .map(|state| { - state - .iter() - .filter_map(|item| item.next()) - .filter_map(|term| match term { - Term::Terminal(token_type) => Some(token_type.to_string()), - _ => None, - }) - .collect::>() - }) - .iter() - .flat_map(|set| set.clone()) - .collect::>(); - - let token = tokens.get(i).unwrap_or(tokens.last().unwrap()); - - if !expected_symbols.is_empty() { - diagnostics.push( - Diagnostic::error("Syntax error").with_error( - Error::primary( - token.range.file_id, - i - 1, - 0, - format!("Expected {}", expected_symbols.join(" or ")), - ) - .transform_range(tokens), - ), - ); - } - + self.add_diagnostic(tokens, i - 1); // TODO: enter panic mode } @@ -179,14 +149,23 @@ impl<'a> EarleyParser<'a> { let item = self.chart.states[i][j].clone(); if let Some(next_symbol) = item.next() { match next_symbol { - Term::NonTerminal(non_terminal) => { + Symbol::NonTerminal(non_terminal) => { self.predict(i, non_terminal); } - Term::Terminal(token_type) => { + Symbol::Terminal(token_type) => { if i < tokens.len() && token_type == &token.unwrap().token_type { self.scan(i, &item, token.unwrap()); } } + Symbol::OneOrMore(non_terminal) => { + self.handle_one_or_more(i, &item, non_terminal); + } + Symbol::ZeroOrMore(non_terminal) => { + self.handle_zero_or_more(i, &item, non_terminal); + } + Symbol::Optional(non_terminal) => { + self.handle_optional(i, &item, non_terminal); + } } } else { self.complete(i, &item); @@ -205,42 +184,8 @@ impl<'a> EarleyParser<'a> { item.tree.clone(), )) } else { - // Expected more input ... - let expected_symbols: Vec = self - .chart - .states - .get(tokens.len()) - .map(|state| { - state - .iter() - .filter_map(|item| item.next()) - .filter_map(|term| match term { - Term::Terminal(token_type) => Some(token_type.to_string()), - _ => None, - }) - .collect::>() - }) - .iter() - .flat_map(|set| set.clone()) - .collect::>(); - - let token = tokens.last().unwrap(); - - if !expected_symbols.is_empty() { - diagnostics.push( - Diagnostic::error("Syntax error").with_error( - Error::primary( - token.range.file_id, - tokens.len(), - 0, - format!("Expected {}", expected_symbols.join(" or ")), - ) - .transform_range(tokens), - ), - ); - } - - Err(diagnostics) + self.add_diagnostic(tokens, tokens.len()); + Err(self.diagnostics) } } @@ -281,7 +226,7 @@ impl<'a> EarleyParser<'a> { fn complete(&mut self, position: usize, item: &EarleyItem<'a>) { let start_state_set = self.chart.states[item.start].clone(); for state in start_state_set { - if let Some(Term::NonTerminal(non_terminal)) = state.next() { + if let Some(Symbol::NonTerminal(non_terminal)) = state.next() { if non_terminal == &item.head { let mut next_item = EarleyItem::new( state.head.clone(), @@ -301,5 +246,40 @@ impl<'a> EarleyParser<'a> { } } } + + fn add_diagnostic(&mut self, tokens: &'a [Token], index: usize) { + let expected_symbols: Vec = self + .chart + .states + .get(index) + .map(|state| { + state + .iter() + .filter_map(|item| item.next()) + .filter_map(|term| match term { + Symbol::Terminal(token_type) => Some(token_type.to_string()), + _ => None, + }) + .collect::>() + }) + .iter() + .flat_map(|set| set.clone()) + .collect::>(); + + let token = tokens.get(index).unwrap_or(tokens.last().unwrap()); + + if !expected_symbols.is_empty() { + self.diagnostics.push( + Diagnostic::error("Syntax error").with_error( + Error::primary( + token.range.file_id, + index, + 0, + format!("Expected {}", expected_symbols.join(" or ")), + ) + .transform_range(tokens), + ), + ); + } + } } -q \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 99e0b4e..2d0fb95 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,6 +15,8 @@ fn main() -> Result<()> { "main", " enum colors: green blue red + + enum colors: green blue red; ", );