From 28aaca80a45785d56fa3d4c4cbc28bb6f9107da5 Mon Sep 17 00:00:00 2001
From: Hunter Damron
Date: Sun, 16 Aug 2020 14:00:10 -0400
Subject: [PATCH 1/5] Initial WIP on preprocessor-hashhash

Collected before and after tokens and passed them to a concat function
---
 src/data/error.rs  |   4 ++
 src/data/lex.rs    |   2 +
 src/lex/mod.rs     |   8 ++-
 src/lex/replace.rs | 146 ++++++++++++++++++++++++++++++---------------
 4 files changed, 110 insertions(+), 50 deletions(-)

diff --git a/src/data/error.rs b/src/data/error.rs
index fef5b59e..89894721 100644
--- a/src/data/error.rs
+++ b/src/data/error.rs
@@ -506,6 +506,10 @@ pub enum CppError {
     /// '#' in a function macro not followed by function parameter
     #[error("'#' is not followed by a macro parameter")]
     HashMissingParameter,
+
+    /// '##' missing arguments
+    #[error("'##' cannot appear at {} of macro expansion", if *(.0) { "start" } else { "end"})]
+    HashHashMissingParameter(bool),
 }
 
 /// Lex errors are non-exhaustive and may have new variants added at any time
diff --git a/src/data/lex.rs b/src/data/lex.rs
index fb9a3a59..3680846d 100644
--- a/src/data/lex.rs
+++ b/src/data/lex.rs
@@ -233,6 +233,7 @@ pub enum Token {
     Ellipsis,
     StructDeref, // ->
     Hash,        // #, used for preprocessing
+    HashHash,    // ##, used for preprocessing
 }
 
 /* impls */
@@ -393,6 +394,7 @@ impl std::fmt::Display for Token {
             Ellipsis => write!(f, "..."),
             StructDeref => write!(f, "->"),
             Hash => write!(f, "#"),
+            HashHash => write!(f, "##"),
         }
     }
 }
diff --git a/src/lex/mod.rs b/src/lex/mod.rs
index 7fb3cd9c..034c7ed3 100644
--- a/src/lex/mod.rs
+++ b/src/lex/mod.rs
@@ -362,7 +362,13 @@ impl Iterator for Lexer {
         let span_start = self.location.offset - c.len_utf8() as u32;
         // this giant switch is most of the logic
         let data = match c {
-            '#' => Token::Hash,
+            '#' => match self.peek() {
+                Some('#') => {
+                    self.next_char();
+                    Token::HashHash
+                }
+                _ => Token::Hash,
+            },
             '+' => match self.peek() {
                 Some('=') => {
                     self.next_char();
diff --git a/src/lex/replace.rs b/src/lex/replace.rs
index 6359afa2..3ccb72ec 100644
--- a/src/lex/replace.rs
+++ b/src/lex/replace.rs
@@ -166,56 +166,95 @@ pub fn replace(
     // - _not_ after every token, since otherwise that won't catch some mutual recursion
     // See https://github.com/jyn514/rcc/issues/427 for examples.
     let mut ids_seen = HashSet::new();
-    let mut replacements = Vec::new();
+    let mut replacements: Vec<CompileResult<Locatable<Token>>> = Vec::new();
     let mut pending = VecDeque::new();
     pending.push_back(Ok(location.with(token)));
+    let mut pending_hashhash: Option<Token> = None; // Token before ##
+
     // outer loop: replace all tokens in the replacement list
     while let Some(token) = pending.pop_front() {
-        // first step: perform (recursive) substitution on the ID
-        if let Ok(Locatable {
-            data: Token::Id(id),
-            ..
-        }) = token
-        {
-            if !ids_seen.contains(&id) {
-                match definitions.get(&id) {
-                    Some(Definition::Object(replacement_list)) => {
-                        ids_seen.insert(id);
-                        // prepend the new tokens to the pending tokens
-                        // They need to go before, not after. For instance:
-                        // ```c
-                        // #define a b c d
-                        // #define b 1 + 2
-                        // a
-                        // ```
-                        // should replace to `1 + 2 c d`, not `c d 1 + 2`
-                        let mut new_pending = VecDeque::new();
-                        // we need a `clone()` because `self.definitions` needs to keep its copy of the definition
-                        new_pending.extend(
-                            replacement_list
-                                .iter()
-                                .cloned()
-                                .map(|t| Ok(location.with(t))),
-                        );
-                        new_pending.append(&mut pending);
-                        pending = new_pending;
-                        continue;
-                    }
-                    // TODO: so many allocations :(
-                    Some(Definition::Function { .. }) => {
-                        ids_seen.insert(id);
-                        let func_replacements =
-                            replace_function(definitions, id, location, &mut pending, &mut inner);
-                        let mut func_replacements: VecDeque<_> =
-                            func_replacements.into_iter().collect();
-                        func_replacements.append(&mut pending);
-                        pending = func_replacements;
-                        continue;
-                    }
-                    None => {}
-                }
-            }
-        }
+        match token {
+            Ok(Locatable {
+                data: ref succeeding_tok,
+                ..
+            }) if pending_hashhash.is_some() => {
+                if matches!(succeeding_tok, Token::Whitespace(_)) {
+                    continue;
+                }
+                let pending_hashhash = pending_hashhash.take().unwrap(); // We just checked that it's some
+                let concat_token = concat(pending_hashhash, succeeding_tok.clone(), &location);
+                replacements.push(concat_token); // TODO don't bypass pending
+                continue;
+            }
+            Ok(Locatable {
+                data: Token::Id(id),
+                ..
+            }) => {
+                if !ids_seen.contains(&id) {
+                    match definitions.get(&id) {
+                        Some(Definition::Object(replacement_list)) => {
+                            ids_seen.insert(id);
+                            // prepend the new tokens to the pending tokens
+                            // They need to go before, not after. For instance:
+                            // ```c
+                            // #define a b c d
+                            // #define b 1 + 2
+                            // a
+                            // ```
+                            // should replace to `1 + 2 c d`, not `c d 1 + 2`
+                            let mut new_pending = VecDeque::new();
+                            // we need a `clone()` because `self.definitions` needs to keep its copy of the definition
+                            new_pending.extend(
+                                replacement_list
+                                    .iter()
+                                    .cloned()
+                                    .map(|t| Ok(location.with(t))),
+                            );
+                            new_pending.append(&mut pending);
+                            pending = new_pending;
+                            continue;
+                        }
+                        // TODO: so many allocations :(
+                        Some(Definition::Function { .. }) => {
+                            ids_seen.insert(id);
+                            let func_replacements = replace_function(
+                                definitions,
+                                id,
+                                location,
+                                &mut pending,
+                                &mut inner,
+                            );
+                            let mut func_replacements: VecDeque<_> =
+                                func_replacements.into_iter().collect();
+                            func_replacements.append(&mut pending);
+                            pending = func_replacements;
+                            continue;
+                        }
+                        None => {}
+                    }
+                }
+            }
+            Ok(Locatable {
+                data: Token::HashHash,
+                ..
+            }) => {
+                let preceding_tok = loop {
+                    match replacements.pop() {
+                        Some(Ok(Locatable {
+                            data: Token::Whitespace(_),
+                            ..
+                        })) => continue,
+                        Some(Ok(Locatable { data: token, .. })) => break token,
+                        None | Some(Err(_)) => {
+                            return wrap_error(&location, CppError::HashHashMissingParameter(true))
+                        }
+                    }
+                };
+                pending_hashhash = Some(preceding_tok);
+                continue;
+            }
+            _ => {}
+        }
         replacements.push(token);
     }
@@ -367,16 +406,17 @@
     // and taking no arguments other than knowing the number of parameters.
     if !(args.len() == 1 && params.is_empty() && args[0].is_empty()) {
         // booo, this is the _only_ error in the whole replacer
-        return vec![Err(
-            location.with(CppError::TooFewArguments(params.len(), args.len()).into())
-        )];
+        return wrap_error(
+            &location,
+            CppError::TooFewArguments(params.len(), args.len()).into(),
+        );
     }
 }
 
 let mut pending_hash = false; // Seen a hash?
 for token in body {
-        match *token {
-            Token::Id(id) => {
+        match token {
+            &Token::Id(id) => {
                 // #define f(a) { a + 1 } \n f(b) => b + 1
                 if let Some(index) = params.iter().position(|&param| param == id) {
                     let replacement = args[index].clone();
@@ -387,7 +427,7 @@
                         replacements.push(stringify(replacement));
                     }
                 } else if pending_hash {
-                    return vec![Err(location.with(CppError::HashMissingParameter.into()))];
+                    return wrap_error(&location, CppError::HashMissingParameter);
                 } else {
                     replacements.push(Token::Id(id));
                 }
@@ -403,7 +443,7 @@
             }
             _ => {
                 if pending_hash {
-                    return vec![Err(location.with(CppError::HashMissingParameter.into()))];
+                    return wrap_error(&location, CppError::HashMissingParameter);
                 } else {
                     replacements.push(token.clone());
                 }
@@ -439,3 +479,11 @@ fn stringify(args: Vec<Token>) -> Token {
         ret.trim()
     ))]))
 }
+
+fn concat(x: Token, b: Token, location: &Location) -> CompileResult<Locatable<Token>> {
+    todo!();
+}
+
+fn wrap_error(location: &Location, err: CppError) -> Vec<CppResult<Token>> {
+    vec![Err(location.with(err.into()))]
+}

From 1f32b12d4a2d8009266eb1e0317f9f3028adc6ac Mon Sep 17 00:00:00 2001
From: Hunter Damron
Date: Mon, 17 Aug 2020 14:16:31 -0400
Subject: [PATCH 2/5] Implement concat with a temporary lexer

---
 src/data/error.rs  |  4 ++++
 src/lex/replace.rs | 21 +++++++++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/data/error.rs b/src/data/error.rs
index 89894721..b9480ed0 100644
--- a/src/data/error.rs
+++ b/src/data/error.rs
@@ -510,6 +510,10 @@ pub enum CppError {
     /// '##' missing arguments
     #[error("'##' cannot appear at {} of macro expansion", if *(.0) { "start" } else { "end"})]
     HashHashMissingParameter(bool),
+
+    /// The result of '##' is not a valid token
+    #[error("pasting formed '{0}{1}', an invalid preprocessing token")]
+    HashHashInvalid(Token, Token),
 }
 
 /// Lex errors are non-exhaustive and may have new variants added at any time
diff --git a/src/lex/replace.rs b/src/lex/replace.rs
index 3ccb72ec..e23ce7f4 100644
--- a/src/lex/replace.rs
+++ b/src/lex/replace.rs
@@ -2,7 +2,7 @@
 //!
 //! This module does no parsing and accepts only tokens.
 
-use super::{cpp::CppResult, files::FileProcessor};
+use super::{cpp::CppResult, files::FileProcessor, Lexer};
 use crate::{
     error::CppError,
     CompileError, CompileResult, InternedStr, LiteralToken, Locatable, Location, Token,
 };
@@ -183,7 +183,16 @@ pub fn replace(
                     continue;
                 }
                 let pending_hashhash = pending_hashhash.take().unwrap(); // We just checked that it's some
-                let concat_token = concat(pending_hashhash, succeeding_tok.clone(), &location);
+                let concat_token =
+                    concat(&pending_hashhash, succeeding_tok, &location).ok_or_else(|| {
+                        location.with(
+                            CppError::HashHashInvalid(
+                                pending_hashhash.clone(),
+                                succeeding_tok.clone(),
+                            )
+                            .into(),
+                        )
+                    });
                 replacements.push(concat_token); // TODO don't bypass pending
                 continue;
             }
@@ -480,8 +489,12 @@ fn stringify(args: Vec<Token>) -> Token {
     ))]))
 }
 
-fn concat(x: Token, b: Token, location: &Location) -> CompileResult<Locatable<Token>> {
-    todo!();
+fn concat(x: &Token, y: &Token, location: &Location) -> Option<Locatable<Token>> {
+    let mut lexer = Lexer::new(location.file, format!("{}{}", x, y), false);
+    match lexer.next() {
+        Some(Ok(tok)) if lexer.next().is_none() => Some(tok),
+        _ => None,
+    }
 }
 
 fn wrap_error(location: &Location, err: CppError) -> Vec<CppResult<Token>> {

From 4f0e7d624b422e7c93477874993541674efa0238 Mon Sep 17 00:00:00 2001
From: Hunter Damron
Date: Mon, 17 Aug 2020 14:50:22 -0400
Subject: [PATCH 3/5] Add more tests for hashhash

---
 src/lex/cpp.rs | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs
index 52696aa1..f79d5a47 100644
--- a/src/lex/cpp.rs
+++ b/src/lex/cpp.rs
@@ -1941,4 +1941,47 @@ h",
         assert_is_str("__DATE__");
         assert_is_str("__TIME__");
     }
+
+    #[test]
+    fn hashhash() {
+        use crate::data::lex::{AssignmentToken::*, ComparisonToken::*, Keyword, LiteralToken::*};
+        use Token::*;
+        fn assert_concat(x: &str, y: &str, cpp_src: Option<Token>) {
+            let src = format!("#define tok {} ## {}\ntok", x, y);
+            let tok = cpp(&src)
+                .next_non_whitespace()
+                .unwrap()
+                .map(|tok| tok.data)
+                .ok();
+            assert_eq!(tok, cpp_src);
+        }
+        assert_concat("<", "<", Some(ShiftLeft));
+        assert_concat("+", "+", Some(PlusPlus));
+        assert_concat(">>", "=", Some(Assignment(ShrEqual)));
+        assert_concat(">", "=", Some(Comparison(GreaterEqual)));
+        assert_concat("#", "#", Some(HashHash));
+        assert_concat("-", ">", Some(StructDeref));
+        assert_concat("const", "ance", Some(Id("constance".into())));
+        assert_concat("xyz", "123", Some(Id("xyz123".into())));
+        assert_concat("un", "signed", Some(Keyword(Keyword::Unsigned)));
+        assert_concat("unsign", "ed", Some(Keyword(Keyword::Unsigned)));
+        assert_concat("5", "e5", Some(Literal(Float("5e5".into()))));
+        assert_concat("1234", ".5", Some(Literal(Float("1234.5".into()))));
+        assert_concat("42", "000", Some(Literal(Int("42000".into()))));
+
+        assert_concat("+", "/", None);
+        assert_concat(r#"'x'"#, r#"'y'"#, None);
+        assert_concat(r#""x""#, r#""y""#, None);
+        assert_concat("0b1", "6", None);
+        assert_concat("/", "/", None); // Not a comment
+
+        assert_same(
+            "#define hash_hash # ## #
+    #define mkstr(a) # a
+    #define in_between(a) mkstr(a)
+    #define join(c, d) in_between(c hash_hash d)
+    join(x, y);",
+            r#""x ## y""#,
+        );
+    }
 }

From 6396e5102fd523b96880ff00ff73fefe4521efd1 Mon Sep 17 00:00:00 2001
From: Hunter Damron
Date: Mon, 17 Aug 2020 15:47:37 -0400
Subject: [PATCH 4/5] Used pending tokens in replace with ##

Add active flag to HashHash so it does not get replaced indefinitely
Ignore test which relies on #513 - stringify out of order
---
 src/data/lex.rs    |  8 ++++----
 src/lex/cpp.rs     |  7 +++++--
 src/lex/mod.rs     |  2 +-
 src/lex/replace.rs | 12 +++++++++---
 4 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/src/data/lex.rs b/src/data/lex.rs
index 3680846d..2eb9ace8 100644
--- a/src/data/lex.rs
+++ b/src/data/lex.rs
@@ -231,9 +231,9 @@ pub enum Token {
 
     // Misc
     Ellipsis,
-    StructDeref, // ->
-    Hash,        // #, used for preprocessing
-    HashHash,    // ##, used for preprocessing
+    StructDeref,    // ->
+    Hash,           // #, used for preprocessing
+    HashHash(bool), // ##, used for preprocessing (the bool is true unless it is created by `# ## #`)
 }
 
 /* impls */
@@ -394,7 +394,7 @@ impl std::fmt::Display for Token {
             Ellipsis => write!(f, "..."),
             StructDeref => write!(f, "->"),
             Hash => write!(f, "#"),
-            HashHash => write!(f, "##"),
+            HashHash(_) => write!(f, "##"),
         }
     }
 }
diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs
index f79d5a47..622eb6ef 100644
--- a/src/lex/cpp.rs
+++ b/src/lex/cpp.rs
@@ -1959,7 +1959,7 @@ h",
         assert_concat("+", "+", Some(PlusPlus));
         assert_concat(">>", "=", Some(Assignment(ShrEqual)));
         assert_concat(">", "=", Some(Comparison(GreaterEqual)));
-        assert_concat("#", "#", Some(HashHash));
+        assert_concat("#", "#", Some(HashHash(false)));
         assert_concat("-", ">", Some(StructDeref));
         assert_concat("const", "ance", Some(Id("constance".into())));
         assert_concat("xyz", "123", Some(Id("xyz123".into())));
@@ -1974,7 +1974,10 @@ h",
         assert_concat(r#""x""#, r#""y""#, None);
         assert_concat("0b1", "6", None);
         assert_concat("/", "/", None); // Not a comment
-
+    }
+    #[test]
+    #[ignore] // Related to https://github.com/jyn514/saltwater/issues/513
+    fn hash_and_hashhash() {
         assert_same(
             "#define hash_hash # ## #
     #define mkstr(a) # a
diff --git a/src/lex/mod.rs b/src/lex/mod.rs
index 034c7ed3..a5362f59 100644
--- a/src/lex/mod.rs
+++ b/src/lex/mod.rs
@@ -365,7 +365,7 @@ impl Iterator for Lexer {
             '#' => match self.peek() {
                 Some('#') => {
                     self.next_char();
-                    Token::HashHash
+                    Token::HashHash(true)
                 }
                 _ => Token::Hash,
             },
diff --git a/src/lex/replace.rs b/src/lex/replace.rs
index e23ce7f4..fd5717de 100644
--- a/src/lex/replace.rs
+++ b/src/lex/replace.rs
@@ -193,7 +193,7 @@ pub fn replace(
                             .into(),
                         )
                     });
-                replacements.push(concat_token); // TODO don't bypass pending
+                pending.push_back(concat_token);
                 continue;
             }
             Ok(Locatable {
@@ -245,7 +245,7 @@ pub fn replace(
                 }
             }
             Ok(Locatable {
-                data: Token::HashHash,
+                data: Token::HashHash(true),
                 ..
            }) => {
                 let preceding_tok = loop {
@@ -492,7 +492,13 @@ fn stringify(args: Vec<Token>) -> Token {
 fn concat(x: &Token, y: &Token, location: &Location) -> Option<Locatable<Token>> {
     let mut lexer = Lexer::new(location.file, format!("{}{}", x, y), false);
     match lexer.next() {
-        Some(Ok(tok)) if lexer.next().is_none() => Some(tok),
+        Some(Ok(tok)) if lexer.next().is_none() => Some(match tok {
+            Locatable {
+                data: Token::HashHash(_),
+                location,
+            } => location.with(Token::HashHash(false)),
+            tok => tok,
+        }),
         _ => None,
     }
 }

From d1282868f988fc516a6f188c61c3ea72d2987667 Mon Sep 17 00:00:00 2001
From: Hunter Damron
Date: Mon, 17 Aug 2020 16:09:04 -0400
Subject: [PATCH 5/5] clippy knows best

---
 src/lex/replace.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lex/replace.rs b/src/lex/replace.rs
index fd5717de..7f73e773 100644
--- a/src/lex/replace.rs
+++ b/src/lex/replace.rs
@@ -417,7 +417,7 @@ fn replace_function(
         // booo, this is the _only_ error in the whole replacer
         return wrap_error(
             &location,
-            CppError::TooFewArguments(params.len(), args.len()).into(),
+            CppError::TooFewArguments(params.len(), args.len()),
        );
     }
 }
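A note on the approach. The heart of patch 2 is a "paste, then re-lex" strategy: `concat` prints the two operand tokens back to their source spellings, runs a throwaway `Lexer` over the concatenation, and accepts the paste only when exactly one token comes back. That mirrors the C11 6.10.3.3 rule that `##` must form a single valid preprocessing token, and it is why `+ ## /` is rejected even though `+/` lexes cleanly as two separate tokens. The sketch below shows the same idea outside the compiler; `Tok`, `lex_all`, and the punctuator table are simplified stand-ins for saltwater's `Token` and `Lexer`, not the real API.

```rust
// Toy demonstration of validating `x ## y` by re-lexing the pasted spelling.

#[derive(Debug, PartialEq)]
enum Tok {
    Id(String),
    Num(String),
    Punct(&'static str),
    Unknown(char),
}

fn lex_all(src: &str) -> Vec<Tok> {
    // Longest punctuators first, so `>>=` wins over `>>` and `>`.
    const PUNCTS: &[&str] = &[
        "<<=", ">>=", "<<", ">>", "->", "++", "--", "<=", ">=", "##", "#",
        "+", "-", "<", ">", "/", "=",
    ];
    let mut toks = Vec::new();
    let mut rest = src;
    'outer: while let Some(c) = rest.chars().next() {
        for &p in PUNCTS {
            if let Some(tail) = rest.strip_prefix(p) {
                toks.push(Tok::Punct(p));
                rest = tail;
                continue 'outer;
            }
        }
        let is_word = |c: char| c.is_ascii_alphanumeric() || c == '_';
        if c.is_ascii_alphabetic() || c == '_' {
            let end = rest.find(|c: char| !is_word(c)).unwrap_or(rest.len());
            toks.push(Tok::Id(rest[..end].to_string()));
            rest = &rest[end..];
        } else if c.is_ascii_digit() {
            // Rough pp-number: digits, letters, `_`, and `.` stick together.
            let end = rest
                .find(|c: char| !is_word(c) && c != '.')
                .unwrap_or(rest.len());
            toks.push(Tok::Num(rest[..end].to_string()));
            rest = &rest[end..];
        } else {
            toks.push(Tok::Unknown(c));
            rest = &rest[c.len_utf8()..];
        }
    }
    toks
}

/// `x ## y` is valid only if the pasted spelling re-lexes as a single token.
fn concat(x: &str, y: &str) -> Option<Tok> {
    let mut toks = lex_all(&format!("{}{}", x, y)).into_iter();
    match (toks.next(), toks.next()) {
        (Some(tok), None) => Some(tok),
        _ => None, // zero tokens, or more than one: the paste is invalid
    }
}

fn main() {
    assert_eq!(concat("-", ">"), Some(Tok::Punct("->")));
    assert_eq!(concat(">>", "="), Some(Tok::Punct(">>=")));
    assert_eq!(concat("xyz", "123"), Some(Tok::Id("xyz123".into())));
    // `+/` re-lexes as two tokens (`+`, `/`), so `+ ## /` must be rejected.
    assert_eq!(concat("+", "/"), None);
    println!("all pastes behaved as expected");
}
```

Patch 4 then covers the one corner this strategy opens up: when the paste itself produces `##` (the standard's `# ## #` trick), the result is tagged `HashHash(false)` so the rescanning loop never treats it as another active paste operator, while a `##` written directly in the source lexes as `HashHash(true)`.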