From fc0a2024257ca73431a4ee81fc3e546d44684672 Mon Sep 17 00:00:00 2001
From: Hans Ott <hansott@hotmail.be>
Date: Tue, 7 Jan 2025 14:58:35 +0100
Subject: [PATCH 1/7] Correctly look for end delimiter dollar quoted string

Currently the tokenizer throws an error for

```sql
SELECT $abc$x$ab$abc$
```

The logic is also quite difficult to read so I made it a bit simpler.
---
 src/tokenizer.rs | 94 +++++++++++++++++++++++++-----------------------
 1 file changed, 50 insertions(+), 44 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 38bd33d60..472400b63 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1554,46 +1554,36 @@ impl<'a> Tokenizer<'a> {
             if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                 chars.next();
 
-                'searching_for_end: loop {
-                    s.push_str(&peeking_take_while(chars, |ch| ch != '$'));
-                    match chars.peek() {
+                let mut end_tag_name = String::new();
+
+                loop {
+                    match chars.next() {
                         Some('$') => {
-                            chars.next();
-                            let mut maybe_s = String::from("$");
-                            for c in value.chars() {
-                                if let Some(next_char) = chars.next() {
-                                    maybe_s.push(next_char);
-                                    if next_char != c {
-                                        // This doesn't match the dollar quote delimiter so this
-                                        // is not the end of the string.
-                                        s.push_str(&maybe_s);
-                                        continue 'searching_for_end;
-                                    }
+                            if !end_tag_name.is_empty() {
+                                if end_tag_name == value {
+                                    // Found the end delimiter, truncate the string
+                                    s.truncate(s.len() - value.len() - 1);
+                                    break;
                                 } else {
-                                    return self.tokenizer_error(
-                                        chars.location(),
-                                        "Unterminated dollar-quoted, expected $",
-                                    );
+                                    // Reset the tag name
+                                    end_tag_name.clear();
                                 }
-                            }
-                            if chars.peek() == Some(&'$') {
-                                chars.next();
-                                maybe_s.push('$');
-                                // maybe_s matches the end delimiter
-                                break 'searching_for_end;
                             } else {
-                                // This also doesn't match the dollar quote delimiter as there are
-                                // more characters before the second dollar so this is not the end
-                                // of the string.
-                                s.push_str(&maybe_s);
-                                continue 'searching_for_end;
+                                // Start a new tag name
+                                end_tag_name.clear();
                             }
+
+                            s.push('$');
                         }
-                        _ => {
+                        Some(ch) => {
+                            end_tag_name.push(ch);
+                            s.push(ch);
+                        }
+                        None => {
                             return self.tokenizer_error(
                                 chars.location(),
                                 "Unterminated dollar-quoted, expected $",
-                            )
+                            );
                         }
                     }
                 }
@@ -2551,20 +2541,36 @@ mod tests {
 
     #[test]
     fn tokenize_dollar_quoted_string_tagged() {
-        let sql = String::from(
-            "SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$",
-        );
-        let dialect = GenericDialect {};
-        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
-        let expected = vec![
-            Token::make_keyword("SELECT"),
-            Token::Whitespace(Whitespace::Space),
-            Token::DollarQuotedString(DollarQuotedString {
-                value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
-                tag: Some("tag".into()),
-            }),
+        let test_cases = vec![
+            (
+                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
+                vec![
+                    Token::make_keyword("SELECT"),
+                    Token::Whitespace(Whitespace::Space),
+                    Token::DollarQuotedString(DollarQuotedString {
+                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
+                        tag: Some("tag".into()),
+                    })
+                ]
+            ),
+            (
+                String::from("SELECT $abc$x$ab$abc$"),
+                vec![
+                    Token::make_keyword("SELECT"),
+                    Token::Whitespace(Whitespace::Space),
+                    Token::DollarQuotedString(DollarQuotedString {
+                        value: "x$ab".into(),
+                        tag: Some("abc".into()),
+                    })
+                ]
+            ),
         ];
-        compare(expected, tokens);
+
+        let dialect = GenericDialect {};
+        for (sql, expected) in test_cases {
+            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+            compare(expected, tokens);
+        }
     }
 
     #[test]

From fe1bf63c3d9177275f1b6a6cacbd58ecefd76bc9 Mon Sep 17 00:00:00 2001
From: Hans Ott <hansott@hotmail.be>
Date: Tue, 7 Jan 2025 15:44:16 +0100
Subject: [PATCH 2/7] Improve comment

---
 src/tokenizer.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 472400b63..d78e643e9 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1561,7 +1561,9 @@ impl<'a> Tokenizer<'a> {
                         Some('$') => {
                             if !end_tag_name.is_empty() {
                                 if end_tag_name == value {
-                                    // Found the end delimiter, truncate the string
+                                    // We found the end delimiter
+                                    // Let's remove the tag name from the string content
+                                    // (plus the dollar sign of the end delimiter)
                                     s.truncate(s.len() - value.len() - 1);
                                     break;
                                 } else {

From 4917ccb107ac51ed6cd4cf4c620974f0d26b5b1d Mon Sep 17 00:00:00 2001
From: Hans Ott <hansott@hotmail.be>
Date: Tue, 7 Jan 2025 20:07:25 +0100
Subject: [PATCH 3/7] Add more tests and simplify algo

---
 src/tokenizer.rs | 106 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 84 insertions(+), 22 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index d78e643e9..81f49aada 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1554,34 +1554,25 @@ impl<'a> Tokenizer<'a> {
             if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                 chars.next();
 
-                let mut end_tag_name = String::new();
+                let mut temp = String::new();
+                let end_delimiter = format!("${}$", value);
 
                 loop {
                     match chars.next() {
-                        Some('$') => {
-                            if !end_tag_name.is_empty() {
-                                if end_tag_name == value {
-                                    // We found the end delimiter
-                                    // Let's remove the tag name from the string content
-                                    // (plus the dollar sign of the end delimiter)
-                                    s.truncate(s.len() - value.len() - 1);
-                                    break;
-                                } else {
-                                    // Reset the tag name
-                                    end_tag_name.clear();
-                                }
-                            } else {
-                                // Start a new tag name
-                                end_tag_name.clear();
-                            }
-
-                            s.push('$');
-                        }
                         Some(ch) => {
-                            end_tag_name.push(ch);
-                            s.push(ch);
+                            temp.push(ch); // Collect all characters into `temp`
+
+                            if temp.ends_with(&end_delimiter) {
+                                s.push_str(&temp[..temp.len() - end_delimiter.len()]);
+                                break;
+                            }
                         }
                         None => {
+                            if temp.ends_with(&end_delimiter) {
+                                s.push_str(&temp[..temp.len() - end_delimiter.len()]);
+                                break;
+                            }
+
                             return self.tokenizer_error(
                                 chars.location(),
                                 "Unterminated dollar-quoted, expected $",
@@ -2566,6 +2557,28 @@ mod tests {
                     })
                 ]
             ),
+            (
+                String::from("SELECT $abc$$abc$"),
+                vec![
+                    Token::make_keyword("SELECT"),
+                    Token::Whitespace(Whitespace::Space),
+                    Token::DollarQuotedString(DollarQuotedString {
+                        value: "".into(),
+                        tag: Some("abc".into()),
+                    })
+                ]
+            ),
+            (
+                String::from("0$abc$$abc$1"),
+                vec![
+                    Token::Number("0".into(), false),
+                    Token::DollarQuotedString(DollarQuotedString {
+                        value: "".into(),
+                        tag: Some("abc".into()),
+                    }),
+                    Token::Number("1".into(), false),
+                ]
+            ),
         ];
 
         let dialect = GenericDialect {};
@@ -2591,6 +2604,23 @@ mod tests {
         );
     }
 
+    #[test]
+    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
+        let sql = String::from("SELECT $abc$abc$");
+        let dialect = GenericDialect {};
+        assert_eq!(
+            Tokenizer::new(&dialect, &sql).tokenize(),
+            Err(TokenizerError {
+                message: "Unterminated dollar-quoted, expected $".into(),
+                location: Location {
+                    line: 1,
+                    column: 17
+                }
+            })
+        );
+    }
+
+
     #[test]
     fn tokenize_dollar_placeholder() {
         let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
@@ -2615,6 +2645,38 @@ mod tests {
         );
     }
 
+    #[test]
+    fn tokenize_nested_dollar_quoted_strings() {
+        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
+        let dialect = GenericDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let expected = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::DollarQuotedString(DollarQuotedString {
+                value: "dollar $nested$ string".into(),
+                tag: Some("tag".into()),
+            }),
+        ];
+        compare(expected, tokens);
+    }
+
+    #[test]
+    fn tokenize_dollar_quoted_string_untagged_empty() {
+        let sql = String::from("SELECT $$$$");
+        let dialect = GenericDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let expected = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::DollarQuotedString(DollarQuotedString {
+                value: "".into(),
+                tag: None,
+            }),
+        ];
+        compare(expected, tokens);
+    }
+
     #[test]
     fn tokenize_dollar_quoted_string_untagged() {
         let sql =

From 5d21da9bb7a9060c71be24b1bec46a0de638f355 Mon Sep 17 00:00:00 2001
From: Hans Ott <hansott@hotmail.be>
Date: Tue, 7 Jan 2025 20:09:50 +0100
Subject: [PATCH 4/7] Fix code style

---
 src/tokenizer.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 81f49aada..c219dce0c 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -2620,7 +2620,6 @@ mod tests {
         );
     }
 
-
     #[test]
     fn tokenize_dollar_placeholder() {
         let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");

From e9cb4086e48d4ba49b872425df314e795082ff58 Mon Sep 17 00:00:00 2001
From: Hans Ott <hansott@hotmail.be>
Date: Tue, 7 Jan 2025 20:10:48 +0100
Subject: [PATCH 5/7] Remove comment

---
 src/tokenizer.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index c219dce0c..e7b8802d1 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1560,7 +1560,7 @@ impl<'a> Tokenizer<'a> {
                 loop {
                     match chars.next() {
                         Some(ch) => {
-                            temp.push(ch); // Collect all characters into `temp`
+                            temp.push(ch);
 
                             if temp.ends_with(&end_delimiter) {
                                 s.push_str(&temp[..temp.len() - end_delimiter.len()]);

From dfcd1313e7a4a8ef7b97175832bdf9f10a3a531a Mon Sep 17 00:00:00 2001
From: Hans Ott <hansott@hotmail.be>
Date: Sun, 12 Jan 2025 18:31:25 +0100
Subject: [PATCH 6/7] Use strip_suffix instead of slicing bytes

String::len won't necessarily produce correct results on arbitrary UTF8 strings
---
 src/tokenizer.rs | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index e7b8802d1..dbd7d56b9 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1563,13 +1563,19 @@ impl<'a> Tokenizer<'a> {
                             temp.push(ch);
 
                             if temp.ends_with(&end_delimiter) {
-                                s.push_str(&temp[..temp.len() - end_delimiter.len()]);
+                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
+                                    s.push_str(temp);
+                                    break;
+                                }
                                 break;
                             }
                         }
                         None => {
                             if temp.ends_with(&end_delimiter) {
-                                s.push_str(&temp[..temp.len() - end_delimiter.len()]);
+                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
+                                    s.push_str(temp);
+                                    break;
+                                }
                                 break;
                             }
 
@@ -2579,6 +2585,15 @@ mod tests {
                     Token::Number("1".into(), false),
                 ]
             ),
+            (
+                String::from("$function$abc$q$data$q$$function$"),
+                vec![
+                    Token::DollarQuotedString(DollarQuotedString {
+                        value: "abc$q$data$q$".into(),
+                        tag: Some("function".into()),
+                    }),
+                ]
+            ),
         ];
 
         let dialect = GenericDialect {};

From 955f8bb4f41e35c19c9096a9546f17be556bcf53 Mon Sep 17 00:00:00 2001
From: Hans Ott <hansott@hotmail.be>
Date: Sun, 12 Jan 2025 18:35:22 +0100
Subject: [PATCH 7/7] Remove extra break

---
 src/tokenizer.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index dbd7d56b9..c9c4ded55 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1565,7 +1565,6 @@ impl<'a> Tokenizer<'a> {
                             if temp.ends_with(&end_delimiter) {
                                 if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                     s.push_str(temp);
-                                    break;
                                 }
                                 break;
                             }
@@ -1574,7 +1573,6 @@ impl<'a> Tokenizer<'a> {
                             if temp.ends_with(&end_delimiter) {
                                 if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                     s.push_str(temp);
-                                    break;
                                 }
                                 break;
                             }