From fc0a2024257ca73431a4ee81fc3e546d44684672 Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Tue, 7 Jan 2025 14:58:35 +0100 Subject: [PATCH 1/7] Correctly look for end delimiter dollar quoted string Currently the tokenizer throws an error for ```sql SELECT $abc$x$ab$abc$ ``` The logic is also quite difficult to read so I made it a bit simpler. --- src/tokenizer.rs | 94 +++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 38bd33d60..472400b63 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1554,46 +1554,36 @@ impl<'a> Tokenizer<'a> { if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() { chars.next(); - 'searching_for_end: loop { - s.push_str(&peeking_take_while(chars, |ch| ch != '$')); - match chars.peek() { + let mut end_tag_name = String::new(); + + loop { + match chars.next() { Some('$') => { - chars.next(); - let mut maybe_s = String::from("$"); - for c in value.chars() { - if let Some(next_char) = chars.next() { - maybe_s.push(next_char); - if next_char != c { - // This doesn't match the dollar quote delimiter so this - // is not the end of the string. - s.push_str(&maybe_s); - continue 'searching_for_end; - } + if !end_tag_name.is_empty() { + if end_tag_name == value { + // Found the end delimiter, truncate the string + s.truncate(s.len() - value.len() - 1); + break; } else { - return self.tokenizer_error( - chars.location(), - "Unterminated dollar-quoted, expected $", - ); + // Reset the tag name + end_tag_name.clear(); } - } - if chars.peek() == Some(&'$') { - chars.next(); - maybe_s.push('$'); - // maybe_s matches the end delimiter - break 'searching_for_end; } else { - // This also doesn't match the dollar quote delimiter as there are - // more characters before the second dollar so this is not the end - // of the string. - s.push_str(&maybe_s); - continue 'searching_for_end; + // Start a new tag name + end_tag_name.clear(); } + + s.push('$'); } - _ => { + Some(ch) => { + end_tag_name.push(ch); + s.push(ch); + } + None => { return self.tokenizer_error( chars.location(), "Unterminated dollar-quoted, expected $", - ) + ); } } } @@ -2551,20 +2541,36 @@ mod tests { #[test] fn tokenize_dollar_quoted_string_tagged() { - let sql = String::from( - "SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$", - ); - let dialect = GenericDialect {}; - let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); - let expected = vec![ - Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), - Token::DollarQuotedString(DollarQuotedString { - value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(), - tag: Some("tag".into()), - }), + let test_cases = vec![ + ( + String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"), + vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::DollarQuotedString(DollarQuotedString { + value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(), + tag: Some("tag".into()), + }) + ] + ), + ( + String::from("SELECT $abc$x$ab$abc$"), + vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::DollarQuotedString(DollarQuotedString { + value: "x$ab".into(), + tag: Some("abc".into()), + }) + ] + ), ]; - compare(expected, tokens); + + let dialect = GenericDialect {}; + for (sql, expected) in test_cases { + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + compare(expected, tokens); + } } #[test] From fe1bf63c3d9177275f1b6a6cacbd58ecefd76bc9 Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Tue, 7 Jan 2025 15:44:16 +0100 Subject: [PATCH 2/7] Improve comment --- src/tokenizer.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 472400b63..d78e643e9 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1561,7 +1561,9 @@ impl<'a> Tokenizer<'a> { Some('$') => { if !end_tag_name.is_empty() { if end_tag_name == value { - // Found the end delimiter, truncate the string + // We found the end delimiter + // Let's remove the tag name from the string content + // (plus the dollar sign of the end delimiter) s.truncate(s.len() - value.len() - 1); break; } else { From 4917ccb107ac51ed6cd4cf4c620974f0d26b5b1d Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Tue, 7 Jan 2025 20:07:25 +0100 Subject: [PATCH 3/7] Add more tests and simplify algo --- src/tokenizer.rs | 106 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 84 insertions(+), 22 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d78e643e9..81f49aada 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1554,34 +1554,25 @@ impl<'a> Tokenizer<'a> { if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() { chars.next(); - let mut end_tag_name = String::new(); + let mut temp = String::new(); + let end_delimiter = format!("${}$", value); loop { match chars.next() { - Some('$') => { - if !end_tag_name.is_empty() { - if end_tag_name == value { - // We found the end delimiter - // Let's remove the tag name from the string content - // (plus the dollar sign of the end delimiter) - s.truncate(s.len() - value.len() - 1); - break; - } else { - // Reset the tag name - end_tag_name.clear(); - } - } else { - // Start a new tag name - end_tag_name.clear(); - } - - s.push('$'); - } Some(ch) => { - end_tag_name.push(ch); - s.push(ch); + temp.push(ch); // Collect all characters into `temp` + + if temp.ends_with(&end_delimiter) { + s.push_str(&temp[..temp.len() - end_delimiter.len()]); + break; + } } None => { + if temp.ends_with(&end_delimiter) { + s.push_str(&temp[..temp.len() - end_delimiter.len()]); + break; + } + return self.tokenizer_error( chars.location(), "Unterminated dollar-quoted, expected $", @@ -2566,6 +2557,28 @@ mod tests { }) ] ), + ( + String::from("SELECT $abc$$abc$"), + vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::DollarQuotedString(DollarQuotedString { + value: "".into(), + tag: Some("abc".into()), + }) + ] + ), + ( + String::from("0$abc$$abc$1"), + vec![ + Token::Number("0".into(), false), + Token::DollarQuotedString(DollarQuotedString { + value: "".into(), + tag: Some("abc".into()), + }), + Token::Number("1".into(), false), + ] + ), ]; let dialect = GenericDialect {}; @@ -2591,6 +2604,23 @@ mod tests { ); } + #[test] + fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() { + let sql = String::from("SELECT $abc$abc$"); + let dialect = GenericDialect {}; + assert_eq!( + Tokenizer::new(&dialect, &sql).tokenize(), + Err(TokenizerError { + message: "Unterminated dollar-quoted, expected $".into(), + location: Location { + line: 1, + column: 17 + } + }) + ); + } + + #[test] fn tokenize_dollar_placeholder() { let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC"); @@ -2615,6 +2645,38 @@ mod tests { ); } + #[test] + fn tokenize_nested_dollar_quoted_strings() { + let sql = String::from("SELECT $tag$dollar $nested$ string$tag$"); + let dialect = GenericDialect {}; + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + let expected = vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::DollarQuotedString(DollarQuotedString { + value: "dollar $nested$ string".into(), + tag: Some("tag".into()), + }), + ]; + compare(expected, tokens); + } + + #[test] + fn tokenize_dollar_quoted_string_untagged_empty() { + let sql = String::from("SELECT $$$$"); + let dialect = GenericDialect {}; + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + let expected = vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::DollarQuotedString(DollarQuotedString { + value: "".into(), + tag: None, + }), + ]; + compare(expected, tokens); + } + #[test] fn tokenize_dollar_quoted_string_untagged() { let sql = From 5d21da9bb7a9060c71be24b1bec46a0de638f355 Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Tue, 7 Jan 2025 20:09:50 +0100 Subject: [PATCH 4/7] Fix code style --- src/tokenizer.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 81f49aada..c219dce0c 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -2620,7 +2620,6 @@ mod tests { ); } - #[test] fn tokenize_dollar_placeholder() { let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC"); From e9cb4086e48d4ba49b872425df314e795082ff58 Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Tue, 7 Jan 2025 20:10:48 +0100 Subject: [PATCH 5/7] Remove comment --- src/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index c219dce0c..e7b8802d1 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1560,7 +1560,7 @@ impl<'a> Tokenizer<'a> { loop { match chars.next() { Some(ch) => { - temp.push(ch); // Collect all characters into `temp` + temp.push(ch); if temp.ends_with(&end_delimiter) { s.push_str(&temp[..temp.len() - end_delimiter.len()]); From dfcd1313e7a4a8ef7b97175832bdf9f10a3a531a Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Sun, 12 Jan 2025 18:31:25 +0100 Subject: [PATCH 6/7] Use strip_suffix instead of slicing bytes String::len won't necessarily produce correct results on arbitrary UTF8 strings --- src/tokenizer.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index e7b8802d1..dbd7d56b9 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1563,13 +1563,19 @@ impl<'a> Tokenizer<'a> { temp.push(ch); if temp.ends_with(&end_delimiter) { - s.push_str(&temp[..temp.len() - end_delimiter.len()]); + if let Some(temp) = temp.strip_suffix(&end_delimiter) { + s.push_str(temp); + break; + } break; } } None => { if temp.ends_with(&end_delimiter) { - s.push_str(&temp[..temp.len() - end_delimiter.len()]); + if let Some(temp) = temp.strip_suffix(&end_delimiter) { + s.push_str(temp); + break; + } break; } @@ -2579,6 +2585,15 @@ mod tests { Token::Number("1".into(), false), ] ), + ( + String::from("$function$abc$q$data$q$$function$"), + vec![ + Token::DollarQuotedString(DollarQuotedString { + value: "abc$q$data$q$".into(), + tag: Some("function".into()), + }), + ] + ), ]; let dialect = GenericDialect {}; From 955f8bb4f41e35c19c9096a9546f17be556bcf53 Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Sun, 12 Jan 2025 18:35:22 +0100 Subject: [PATCH 7/7] Remove extra break --- src/tokenizer.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index dbd7d56b9..c9c4ded55 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1565,7 +1565,6 @@ impl<'a> Tokenizer<'a> { if temp.ends_with(&end_delimiter) { if let Some(temp) = temp.strip_suffix(&end_delimiter) { s.push_str(temp); - break; } break; } @@ -1574,7 +1573,6 @@ impl<'a> Tokenizer<'a> { if temp.ends_with(&end_delimiter) { if let Some(temp) = temp.strip_suffix(&end_delimiter) { s.push_str(temp); - break; } break; }