From 420cf8721e682eac86f7734510883be55fc23128 Mon Sep 17 00:00:00 2001 From: Lucas Date: Tue, 28 May 2024 12:11:15 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20(scanner):=20Seperate=20lexeme=20to?= =?UTF-8?q?=20own=20file=20and=20add=20more=20tokens?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit semver: minor --- src/main.rs | 1 - src/scanner/lexeme.rs | 114 +++++++++++++++++++++++++++++++++++++ src/scanner/mod.rs | 128 +++++++++++++++++------------------------- 3 files changed, 164 insertions(+), 79 deletions(-) create mode 100644 src/scanner/lexeme.rs diff --git a/src/main.rs b/src/main.rs index 1fd7a5f..ab301b9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,4 @@ use anyhow::*; -use scanner::Token; use std::{env, io::BufRead}; pub mod scanner; diff --git a/src/scanner/lexeme.rs b/src/scanner/lexeme.rs new file mode 100644 index 0000000..d1715d1 --- /dev/null +++ b/src/scanner/lexeme.rs @@ -0,0 +1,114 @@ +#[derive(Debug, PartialEq, Eq)] +pub enum Lexeme { + Valid(Token, Range), + Invalid(Range), +} + +impl Lexeme { + pub fn valid(token: Token, start: usize, length: usize) -> Lexeme { + Lexeme::Valid( + token, + Range { + position: start, + length, + }, + ) + } + + pub fn invalid(start: usize, length: usize) -> Lexeme { + Lexeme::Invalid(Range { + position: start, + length, + }) + } +} + +#[derive(Debug, PartialEq, Eq)] +pub struct Range { + pub position: usize, + pub length: usize, +} + +#[derive(Debug, PartialEq, Eq, Hash)] +pub enum Token { + /// A token that should be ignored. This is used for whitespace, comments, etc. + Ignore, + + /// An opening parenthesis; `(`. + ParenOpen, + /// A closing parenthesis; `)`. + ParenClose, + /// An opening curly brace; `{`. + CurlyOpen, + /// A closing curly brace; `}`. + CurlyClose, + /// An opening square bracket; `[`. + SquareOpen, + /// A closing square bracket; `]`. + SquareClose, + + /// A comma; `,`. + Comma, + /// A dot; `.`. + Dot, + /// A colon; `:`. 
+ Colon, + /// A semicolon; `;`. + Semicolon, + + /// A plus sign; `+`. + Plus, + /// A minus sign; `-`. + Minus, + /// A forward slash; `/`. + Slash, + /// An asterisk; `*`. + Star, + + /// An equals sign; `=`. + Equal, + /// A negation sign; `!`. + Not, + /// A less-than sign; `<`. + LessThan, + /// A greater-than sign; `>`. + GreaterThan, + /// A less-than-or-equal sign; `<=`. + LessThanOrEqual, + /// A greater-than-or-equal sign; `>=`. + GreaterThanOrEqual, + /// An equality sign; `==`. + Equiality, + /// An inequality sign; `!=`. + Inequality, + + /// An if keyword; `if`. + If, + /// An else keyword; `else`. + Else, + + /// A while keyword; `while`. + While, + /// A for keyword; `for`. + For, + + /// A let keyword; `let`. + Let, + + /// A function keyword; `fn`. + Function, + /// A return keyword; `return`. + Return, + + /// A boolean; `true`, `false`. + Boolean(bool), + /// A number; `42`, `12`, `-7`. + Number(i32), + /// A string; `"foo"`, `"bar"`, `"baz"`. + String(String), + /// A character; `'a'`, `'b'`, `'c'`. + Character(char), + + /// An identifying name; `foo`, `bar`, `baz`. 
+ Identifier(String), +} diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index 4dc0d4a..e55823e 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -1,56 +1,18 @@ +use lexeme::Lexeme; +use lexeme::Range; +use lexeme::Token; use regex::Regex; -#[derive(Debug, PartialEq, Eq)] -pub enum Lexeme { - Valid(Token, Range), - Invalid(Range), -} - -impl Lexeme { - pub fn valid(token: Token, start: usize, length: usize) -> Lexeme { - Lexeme::Valid( - token, - Range { - position: start, - length, - }, - ) - } - - pub fn invalid(start: usize, length: usize) -> Lexeme { - Lexeme::Invalid(Range { - position: start, - length, - }) - } -} +pub mod lexeme; -#[derive(Debug, PartialEq, Eq)] -pub struct Range { - pub position: usize, - pub length: usize, -} +type SpecItem = (Regex, fn(&str) -> Token); -#[derive(Debug, PartialEq, Eq)] -pub enum Token { - Ignore, - Number(i32), - String(String), - Character(char), - Identifier(String), - Equal, - Plus, - Minus, - Slash, - Star, - ParenOpen, - ParenClose, - CurlyOpen, - CurlyClose, +macro_rules! 
r { + ($pattern:expr) => { + Regex::new(format!("^{}", $pattern).as_str()).unwrap() + }; } -type SpecItem = (Regex, fn(&str) -> Token); - pub struct Scanner { input: String, cursor: usize, @@ -63,36 +25,46 @@ impl Scanner { input, cursor: 0, spec: vec![ - (Regex::new(r#"^(\s+)"#).unwrap(), |_| Token::Ignore), - (Regex::new(r#"^\/\/(.*)"#).unwrap(), |_| Token::Ignore), - (Regex::new(r#"^(\d+)"#).unwrap(), |s: &str| { - Token::Number(s.parse().unwrap()) - }), - (Regex::new(r#"^'([^"]*)'"#).unwrap(), |s: &str| { - Token::String(s.to_string()) - }), - (Regex::new(r#"^`(.)`"#).unwrap(), |s: &str| { - Token::Character(s.chars().nth(0).unwrap()) - }), - (Regex::new(r#"^([a-zA-Z_]\w*)"#).unwrap(), |s: &str| { - Token::Identifier(s.to_string()) - }), - (Regex::new(r#"^(\+)"#).unwrap(), |_| Token::Plus), - (Regex::new(r#"^(-)"#).unwrap(), |_| Token::Minus), - (Regex::new(r#"^(\/)"#).unwrap(), |_| Token::Slash), - (Regex::new(r#"^(\*)"#).unwrap(), |_| Token::Star), - (Regex::new(r#"^(=)"#).unwrap(), |_| Token::Equal), - (Regex::new(r#"^(?P\()"#).unwrap(), |_| { - Token::ParenOpen - }), - (Regex::new(r#"^(?P\))"#).unwrap(), |_| { - Token::ParenClose - }), - (Regex::new(r#"^(?P\{)"#).unwrap(), |_| { - Token::CurlyOpen + (r!(r"(\s+)"), |_| Token::Ignore), + (r!(r"//(.*)"), |_| Token::Ignore), + (r!(r"(\()"), |_| Token::ParenOpen), + (r!(r"(\))"), |_| Token::ParenClose), + (r!(r"(\{)"), |_| Token::CurlyOpen), + (r!(r"(\})"), |_| Token::CurlyClose), + (r!(r"(\[)"), |_| Token::SquareOpen), + (r!(r"(\])"), |_| Token::SquareClose), + (r!(r"(\,)"), |_| Token::Comma), + (r!(r"(\.)"), |_| Token::Dot), + (r!(r"(\:)"), |_| Token::Colon), + (r!(r"(;)"), |_| Token::Semicolon), + (r!(r"(\+)"), |_| Token::Plus), + (r!(r"(-)"), |_| Token::Minus), + (r!(r"(/)"), |_| Token::Slash), + (r!(r"(\*)"), |_| Token::Star), + (r!(r"(=)"), |_| Token::Equal), + (r!(r"(!)"), |_| Token::Not), + (r!(r"(<)"), |_| Token::LessThan), + (r!(r"(>)"), |_| Token::GreaterThan), + (r!(r"(<=)"), |_| 
Token::LessThanOrEqual), + (r!(r"(>=)"), |_| Token::GreaterThanOrEqual), + (r!(r"(==)"), |_| Token::Equiality), + (r!(r"(!=)"), |_| Token::Inequality), + (r!(r"(if)"), |_| Token::If), + (r!(r"(else)"), |_| Token::Else), + (r!(r"(while)"), |_| Token::While), + (r!(r"(for)"), |_| Token::For), + (r!(r"(let)"), |_| Token::Let), + (r!(r"(fn)"), |_| Token::Function), + (r!(r"(return)"), |_| Token::Return), + (r!(r"(true)"), |_| Token::Boolean(true)), + (r!(r"(false)"), |_| Token::Boolean(false)), + (r!(r"(\d+)"), |value| Token::Number(value.parse().unwrap())), + (r!(r"'([^']*)'"), |value| Token::String(value.to_string())), + (r!(r"`([^`]*)`"), |value| { + Token::Character(value.chars().next().unwrap()) }), - (Regex::new(r#"^(?P\})"#).unwrap(), |_| { - Token::CurlyClose + (r!(r"([a-zA-Z_]\w*)"), |value| { + Token::Identifier(value.to_string()) }), ], } @@ -270,10 +242,10 @@ mod tests { #[test] fn parses_invalid_lexeme_at_end() { test_scanner( - "123~~~", + "123~~~±±±", vec![ Lexeme::valid(Token::Number(123), 0, 3), - Lexeme::invalid(3, 3), + Lexeme::invalid(3, 6), ], ); }