From a452a80fbd6166c4f9b1938f67d32c8ae7f1d577 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Sun, 27 Aug 2023 22:48:33 +0200 Subject: [PATCH 1/2] feat: show location info in parse errors - add consistent Display for Location - show (optional) token location in parse errors - keep Location line = 0 as noop --- src/parser/mod.rs | 93 ++++++++++++++++++++++++++++++++--------------- src/tokenizer.rs | 48 +++++++++++++----------- 2 files changed, 89 insertions(+), 52 deletions(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 846215249..8f7c4b0cf 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -44,8 +44,8 @@ pub enum ParserError { // Use `Parser::expected` instead, if possible macro_rules! parser_err { - ($MSG:expr) => { - Err(ParserError::ParserError($MSG.to_string())) + ($MSG:expr, $loc:expr) => { + Err(ParserError::ParserError(format!("{}{}", $MSG, $loc))) }; } @@ -724,6 +724,7 @@ impl<'a> Parser<'a> { // Note also that naively `SELECT date` looks like a syntax error because the `date` type // name is not followed by a string literal, but in fact in PostgreSQL it is a valid // expression that should parse as the column name "date". + let loc = self.peek_token().location; return_ok_if_some!(self.maybe_parse(|parser| { match parser.parse_data_type()? { DataType::Interval => parser.parse_interval(), @@ -734,7 +735,7 @@ impl<'a> Parser<'a> { // name, resulting in `NOT 'a'` being recognized as a `TypedString` instead of // an unary negation `NOT ('a' LIKE 'b')`. To solve this, we don't accept the // `type 'string'` syntax for the custom data types at all. - DataType::Custom(..) => parser_err!("dummy"), + DataType::Custom(..) => parser_err!("dummy", loc), data_type => Ok(Expr::TypedString { data_type, value: parser.parse_literal_string()?, @@ -909,7 +910,12 @@ impl<'a> Parser<'a> { let tok = self.next_token(); let key = match tok.token { Token::Word(word) => word.to_ident(), - _ => return parser_err!(format!("Expected identifier, found: {tok}")), + _ => { + return parser_err!( + format!("Expected identifier, found: {tok}"), + tok.location + ) + } }; Ok(Expr::CompositeAccess { expr: Box::new(expr), @@ -1220,7 +1226,10 @@ impl<'a> Parser<'a> { r#in: Box::new(from), }) } else { - parser_err!("Position function must include IN keyword".to_string()) + parser_err!( + "Position function must include IN keyword".to_string(), + self.peek_token().location + ) } } @@ -1884,7 +1893,10 @@ impl<'a> Parser<'a> { } } // Can only happen if `get_next_precedence` got out of sync with this function - _ => parser_err!(format!("No infix parser for token {:?}", tok.token)), + _ => parser_err!( + format!("No infix parser for token {:?}", tok.token), + tok.location + ), } } else if Token::DoubleColon == tok { self.parse_pg_cast(expr) @@ -1935,7 +1947,10 @@ impl<'a> Parser<'a> { }) } else { // Can only happen if `get_next_precedence` got out of sync with this function - parser_err!(format!("No infix parser for token {:?}", tok.token)) + parser_err!( + format!("No infix parser for token {:?}", tok.token), + tok.location + ) } } @@ -2213,7 +2228,10 @@ impl<'a> Parser<'a> { /// Report unexpected token pub fn expected(&self, expected: &str, found: TokenWithLocation) -> Result { - parser_err!(format!("Expected {expected}, found: {found}")) + parser_err!( + format!("Expected {expected}, found: {found}"), + found.location + ) } /// Look for an expected keyword and consume it if it exists @@ -2378,13 +2396,14 @@ impl<'a> Parser<'a> { /// Parse either `ALL`, `DISTINCT` or `DISTINCT ON (...)`. 
Returns `None` if `ALL` is parsed /// and results in a `ParserError` if both `ALL` and `DISTINCT` are found. pub fn parse_all_or_distinct(&mut self) -> Result, ParserError> { + let loc = self.peek_token().location; let all = self.parse_keyword(Keyword::ALL); let distinct = self.parse_keyword(Keyword::DISTINCT); if !distinct { return Ok(None); } if all { - return parser_err!("Cannot specify both ALL and DISTINCT".to_string()); + return parser_err!("Cannot specify both ALL and DISTINCT".to_string(), loc); } let on = self.parse_keyword(Keyword::ON); if !on { @@ -2986,10 +3005,14 @@ impl<'a> Parser<'a> { let mut admin = vec![]; while let Some(keyword) = self.parse_one_of_keywords(&optional_keywords) { + let loc = self + .tokens + .get(self.index - 1) + .map_or(Location { line: 0, column: 0 }, |t| t.location); match keyword { Keyword::AUTHORIZATION => { if authorization_owner.is_some() { - parser_err!("Found multiple AUTHORIZATION") + parser_err!("Found multiple AUTHORIZATION", loc) } else { authorization_owner = Some(self.parse_object_name()?); Ok(()) @@ -2997,7 +3020,7 @@ impl<'a> Parser<'a> { } Keyword::LOGIN | Keyword::NOLOGIN => { if login.is_some() { - parser_err!("Found multiple LOGIN or NOLOGIN") + parser_err!("Found multiple LOGIN or NOLOGIN", loc) } else { login = Some(keyword == Keyword::LOGIN); Ok(()) @@ -3005,7 +3028,7 @@ impl<'a> Parser<'a> { } Keyword::INHERIT | Keyword::NOINHERIT => { if inherit.is_some() { - parser_err!("Found multiple INHERIT or NOINHERIT") + parser_err!("Found multiple INHERIT or NOINHERIT", loc) } else { inherit = Some(keyword == Keyword::INHERIT); Ok(()) @@ -3013,7 +3036,7 @@ impl<'a> Parser<'a> { } Keyword::BYPASSRLS | Keyword::NOBYPASSRLS => { if bypassrls.is_some() { - parser_err!("Found multiple BYPASSRLS or NOBYPASSRLS") + parser_err!("Found multiple BYPASSRLS or NOBYPASSRLS", loc) } else { bypassrls = Some(keyword == Keyword::BYPASSRLS); Ok(()) @@ -3021,7 +3044,7 @@ impl<'a> Parser<'a> { } Keyword::CREATEDB | Keyword::NOCREATEDB => { if create_db.is_some() { - parser_err!("Found multiple CREATEDB or NOCREATEDB") + parser_err!("Found multiple CREATEDB or NOCREATEDB", loc) } else { create_db = Some(keyword == Keyword::CREATEDB); Ok(()) @@ -3029,7 +3052,7 @@ impl<'a> Parser<'a> { } Keyword::CREATEROLE | Keyword::NOCREATEROLE => { if create_role.is_some() { - parser_err!("Found multiple CREATEROLE or NOCREATEROLE") + parser_err!("Found multiple CREATEROLE or NOCREATEROLE", loc) } else { create_role = Some(keyword == Keyword::CREATEROLE); Ok(()) @@ -3037,7 +3060,7 @@ impl<'a> Parser<'a> { } Keyword::SUPERUSER | Keyword::NOSUPERUSER => { if superuser.is_some() { - parser_err!("Found multiple SUPERUSER or NOSUPERUSER") + parser_err!("Found multiple SUPERUSER or NOSUPERUSER", loc) } else { superuser = Some(keyword == Keyword::SUPERUSER); Ok(()) @@ -3045,7 +3068,7 @@ impl<'a> Parser<'a> { } Keyword::REPLICATION | Keyword::NOREPLICATION => { if replication.is_some() { - parser_err!("Found multiple REPLICATION or NOREPLICATION") + parser_err!("Found multiple REPLICATION or NOREPLICATION", loc) } else { replication = Some(keyword == Keyword::REPLICATION); Ok(()) @@ -3053,7 +3076,7 @@ impl<'a> Parser<'a> { } Keyword::PASSWORD => { if password.is_some() { - parser_err!("Found multiple PASSWORD") + parser_err!("Found multiple PASSWORD", loc) } else { password = if self.parse_keyword(Keyword::NULL) { Some(Password::NullPassword) @@ -3066,7 +3089,7 @@ impl<'a> Parser<'a> { Keyword::CONNECTION => { self.expect_keyword(Keyword::LIMIT)?; if connection_limit.is_some() { - 
parser_err!("Found multiple CONNECTION LIMIT") + parser_err!("Found multiple CONNECTION LIMIT", loc) } else { connection_limit = Some(Expr::Value(self.parse_number_value()?)); Ok(()) @@ -3075,7 +3098,7 @@ impl<'a> Parser<'a> { Keyword::VALID => { self.expect_keyword(Keyword::UNTIL)?; if valid_until.is_some() { - parser_err!("Found multiple VALID UNTIL") + parser_err!("Found multiple VALID UNTIL", loc) } else { valid_until = Some(Expr::Value(self.parse_value()?)); Ok(()) @@ -3084,14 +3107,14 @@ impl<'a> Parser<'a> { Keyword::IN => { if self.parse_keyword(Keyword::ROLE) { if !in_role.is_empty() { - parser_err!("Found multiple IN ROLE") + parser_err!("Found multiple IN ROLE", loc) } else { in_role = self.parse_comma_separated(Parser::parse_identifier)?; Ok(()) } } else if self.parse_keyword(Keyword::GROUP) { if !in_group.is_empty() { - parser_err!("Found multiple IN GROUP") + parser_err!("Found multiple IN GROUP", loc) } else { in_group = self.parse_comma_separated(Parser::parse_identifier)?; Ok(()) @@ -3102,7 +3125,7 @@ impl<'a> Parser<'a> { } Keyword::ROLE => { if !role.is_empty() { - parser_err!("Found multiple ROLE") + parser_err!("Found multiple ROLE", loc) } else { role = self.parse_comma_separated(Parser::parse_identifier)?; Ok(()) @@ -3110,7 +3133,7 @@ impl<'a> Parser<'a> { } Keyword::USER => { if !user.is_empty() { - parser_err!("Found multiple USER") + parser_err!("Found multiple USER", loc) } else { user = self.parse_comma_separated(Parser::parse_identifier)?; Ok(()) @@ -3118,7 +3141,7 @@ impl<'a> Parser<'a> { } Keyword::ADMIN => { if !admin.is_empty() { - parser_err!("Found multiple ADMIN") + parser_err!("Found multiple ADMIN", loc) } else { admin = self.parse_comma_separated(Parser::parse_identifier)?; Ok(()) @@ -3181,14 +3204,19 @@ impl<'a> Parser<'a> { // specifying multiple objects to delete in a single statement let if_exists = self.parse_keywords(&[Keyword::IF, Keyword::EXISTS]); let names = self.parse_comma_separated(Parser::parse_object_name)?; + + let loc = self.peek_token().location; let cascade = self.parse_keyword(Keyword::CASCADE); let restrict = self.parse_keyword(Keyword::RESTRICT); let purge = self.parse_keyword(Keyword::PURGE); if cascade && restrict { - return parser_err!("Cannot specify both CASCADE and RESTRICT in DROP"); + return parser_err!("Cannot specify both CASCADE and RESTRICT in DROP", loc); } if object_type == ObjectType::Role && (cascade || restrict || purge) { - return parser_err!("Cannot specify CASCADE, RESTRICT, or PURGE in DROP ROLE"); + return parser_err!( + "Cannot specify CASCADE, RESTRICT, or PURGE in DROP ROLE", + loc + ); } Ok(Statement::Drop { object_type, @@ -4446,7 +4474,11 @@ impl<'a> Parser<'a> { fn parse_literal_char(&mut self) -> Result { let s = self.parse_literal_string()?; if s.len() != 1 { - return parser_err!(format!("Expect a char, found {s:?}")); + let loc = self + .tokens + .get(self.index - 1) + .map_or(Location { line: 0, column: 0 }, |t| t.location); + return parser_err!(format!("Expect a char, found {s:?}"), loc); } Ok(s.chars().next().unwrap()) } @@ -4525,7 +4557,7 @@ impl<'a> Parser<'a> { // (i.e., it returns the input string). 
Token::Number(ref n, l) => match n.parse() { Ok(n) => Ok(Value::Number(n, l)), - Err(e) => parser_err!(format!("Could not parse '{n}' as number: {e}")), + Err(e) => parser_err!(format!("Could not parse '{n}' as number: {e}"), location), }, Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())), Token::DoubleQuotedString(ref s) => Ok(Value::DoubleQuotedString(s.to_string())), @@ -6465,10 +6497,11 @@ impl<'a> Parser<'a> { .parse_keywords(&[Keyword::GRANTED, Keyword::BY]) .then(|| self.parse_identifier().unwrap()); + let loc = self.peek_token().location; let cascade = self.parse_keyword(Keyword::CASCADE); let restrict = self.parse_keyword(Keyword::RESTRICT); if cascade && restrict { - return parser_err!("Cannot specify both CASCADE and RESTRICT in REVOKE"); + return parser_err!("Cannot specify both CASCADE and RESTRICT in REVOKE", loc); } Ok(Statement::Revoke { diff --git a/src/tokenizer.rs b/src/tokenizer.rs index f20e01b71..1b1b1e96c 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -350,7 +350,7 @@ impl fmt::Display for Whitespace { } /// Location in input string -#[derive(Debug, Eq, PartialEq, Clone)] +#[derive(Debug, Eq, PartialEq, Clone, Copy)] pub struct Location { /// Line number, starting from 1 pub line: u64, @@ -358,6 +358,20 @@ pub struct Location { pub column: u64, } +impl fmt::Display for Location { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.line == 0 { + return Ok(()); + } + write!( + f, + // TODO: use standard compiler location syntax (::) + " at Line: {}, Column {}", + self.line, self.column, + ) + } +} + /// A [Token] with [Location] attached to it #[derive(Debug, Eq, PartialEq, Clone)] pub struct TokenWithLocation { @@ -400,17 +414,12 @@ impl fmt::Display for TokenWithLocation { #[derive(Debug, PartialEq, Eq)] pub struct TokenizerError { pub message: String, - pub line: u64, - pub col: u64, + pub location: Location, } impl fmt::Display for TokenizerError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "{} at Line: {}, Column {}", - self.message, self.line, self.col - ) + write!(f, "{}{}", self.message, self.location,) } } @@ -546,10 +555,7 @@ impl<'a> Tokenizer<'a> { let mut location = state.location(); while let Some(token) = self.next_token(&mut state)? 
{ - tokens.push(TokenWithLocation { - token, - location: location.clone(), - }); + tokens.push(TokenWithLocation { token, location }); location = state.location(); } @@ -1122,8 +1128,7 @@ impl<'a> Tokenizer<'a> { ) -> Result { Err(TokenizerError { message: message.into(), - col: loc.column, - line: loc.line, + location: loc, }) } @@ -1368,8 +1373,7 @@ mod tests { fn tokenizer_error_impl() { let err = TokenizerError { message: "test".into(), - line: 1, - col: 1, + location: Location { line: 1, column: 1 }, }; #[cfg(feature = "std")] { @@ -1694,8 +1698,7 @@ mod tests { tokenizer.tokenize(), Err(TokenizerError { message: "Unterminated string literal".to_string(), - line: 1, - col: 8 + location: Location { line: 1, column: 8 }, }) ); } @@ -1710,8 +1713,10 @@ mod tests { tokenizer.tokenize(), Err(TokenizerError { message: "Unterminated string literal".to_string(), - line: 1, - col: 35 + location: Location { + line: 1, + column: 35 + } }) ); } @@ -1873,8 +1878,7 @@ mod tests { tokenizer.tokenize(), Err(TokenizerError { message: "Expected close delimiter '\"' before EOF.".to_string(), - line: 1, - col: 1 + location: Location { line: 1, column: 1 }, }) ); } From cad04dac2b03bd859eda3d28a3d6a68ff4c36be9 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Sun, 27 Aug 2023 23:18:12 +0200 Subject: [PATCH 2/2] feat!: tokenize with locations by default - in parse_sql and try_with_sql - update test_parser_error_loc - change test_utils to parse without locations to avoid maintenance overhead BREAKING CHANGE: changes the default ParserError messages --- src/parser/mod.rs | 8 +++----- src/test_utils.rs | 8 +++++++- tests/sqlparser_common.rs | 30 +++++++++++++++++------------- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 8f7c4b0cf..6902fc565 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -368,8 +368,8 @@ impl<'a> Parser<'a> { debug!("Parsing sql '{}'...", sql); let tokens = Tokenizer::new(self.dialect, sql) .with_unescape(self.options.unescape) - .tokenize()?; - Ok(self.with_tokens(tokens)) + .tokenize_with_location()?; + Ok(self.with_tokens_with_locations(tokens)) } /// Parse potentially multiple statements @@ -7962,14 +7962,12 @@ mod tests { #[test] fn test_parser_error_loc() { - // TODO: Once we thread token locations through the parser, we should update this - // test to assert the locations of the referenced token let sql = "SELECT this is a syntax error"; let ast = Parser::parse_sql(&GenericDialect, sql); assert_eq!( ast, Err(ParserError::ParserError( - "Expected [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: a" + "Expected [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: a at Line: 1, Column 16" .to_string() )) ); diff --git a/src/test_utils.rs b/src/test_utils.rs index 91130fb51..a5e9e739d 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -28,6 +28,7 @@ use core::fmt::Debug; use crate::dialect::*; use crate::parser::{Parser, ParserError}; +use crate::tokenizer::Tokenizer; use crate::{ast::*, parser::ParserOptions}; /// Tests use the methods on this struct to invoke the parser on one or @@ -82,8 +83,13 @@ impl TestedDialects { /// the result is the same for all tested dialects. 
pub fn parse_sql_statements(&self, sql: &str) -> Result, ParserError> { self.one_of_identical_results(|dialect| { + let mut tokenizer = Tokenizer::new(dialect, sql); + if let Some(options) = &self.options { + tokenizer = tokenizer.with_unescape(options.unescape); + } + let tokens = tokenizer.tokenize()?; self.new_parser(dialect) - .try_with_sql(sql)? + .with_tokens(tokens) .parse_statements() }) // To fail the `ensure_multiple_dialects_are_tested` test: diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index 3fdf3d211..450743360 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -836,7 +836,12 @@ fn test_eof_after_as() { #[test] fn test_no_infix_error() { - let res = Parser::parse_sql(&ClickHouseDialect {}, "ASSERT-URA<<"); + let dialects = TestedDialects { + dialects: vec![Box::new(ClickHouseDialect {})], + options: None, + }; + + let res = dialects.parse_sql_statements("ASSERT-URA<<"); assert_eq!( ParserError::ParserError("No infix parser for token ShiftLeft".to_string()), res.unwrap_err() @@ -3238,19 +3243,21 @@ fn parse_alter_table_alter_column_type() { _ => unreachable!(), } - let res = Parser::parse_sql( - &GenericDialect {}, - &format!("{alter_stmt} ALTER COLUMN is_active TYPE TEXT"), - ); + let dialect = TestedDialects { + dialects: vec![Box::new(GenericDialect {})], + options: None, + }; + + let res = + dialect.parse_sql_statements(&format!("{alter_stmt} ALTER COLUMN is_active TYPE TEXT")); assert_eq!( ParserError::ParserError("Expected SET/DROP NOT NULL, SET DEFAULT, SET DATA TYPE after ALTER COLUMN, found: TYPE".to_string()), res.unwrap_err() ); - let res = Parser::parse_sql( - &GenericDialect {}, - &format!("{alter_stmt} ALTER COLUMN is_active SET DATA TYPE TEXT USING 'text'"), - ); + let res = dialect.parse_sql_statements(&format!( + "{alter_stmt} ALTER COLUMN is_active SET DATA TYPE TEXT USING 'text'" + )); assert_eq!( ParserError::ParserError("Expected end of statement, found: USING".to_string()), res.unwrap_err() @@ -3295,10 +3302,7 @@ fn parse_alter_table_drop_constraint() { _ => unreachable!(), } - let res = Parser::parse_sql( - &GenericDialect {}, - &format!("{alter_stmt} DROP CONSTRAINT is_active TEXT"), - ); + let res = parse_sql_statements(&format!("{alter_stmt} DROP CONSTRAINT is_active TEXT")); assert_eq!( ParserError::ParserError("Expected end of statement, found: TEXT".to_string()), res.unwrap_err()
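
---

For reference, a minimal sketch of the resulting error text (mirroring the updated
`test_parser_error_loc` test above; the trailing " at Line: .., Column .." comes from
the new `Display` impl for `Location`):

    use sqlparser::dialect::GenericDialect;
    use sqlparser::parser::Parser;

    fn main() {
        // `a` is the offending token after `IS`; `Location` lines and columns are 1-based.
        let err = Parser::parse_sql(&GenericDialect, "SELECT this is a syntax error")
            .unwrap_err();
        // Prints: ParserError("Expected [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM
        // after IS, found: a at Line: 1, Column 16")
        println!("{err:?}");
    }

Note that `Tokenizer::tokenize` (without locations) remains available; the second patch
switches `test_utils` to it so the bulk of the test suite stays location-free, while
`Parser::parse_sql` and `try_with_sql` now tokenize with locations by default.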