Skip to content

Commit

Permalink
Add support for BigQuery struct and array datatype
Browse files Browse the repository at this point in the history
This builds on top of #817

- `STRUCT` literal support via `STRUCT` keyword
   https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#constructing_a_struct
- `STRUCT` and `ARRAY` type declarations
   https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_an_array_type
   https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type

It works around the issue of not being able to parse nested types
like `ARRAY<ARRAY<INT>>` due to the right angle bracket ambiguity
where the tokenizer chooses the right-shift operator (this affects
other dialects like Hive that have similar syntax).
When parsing such types, we accept a closing `>` or `>>` and track
which variant is in use in order to preserve correctness.

Fixes #901
Closes #817
  • Loading branch information
iffyio committed Oct 15, 2023
1 parent 83cb734 commit ee8c6f6
Show file tree
Hide file tree
Showing 8 changed files with 894 additions and 65 deletions.
67 changes: 55 additions & 12 deletions src/ast/data_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use serde::{Deserialize, Serialize};
#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::ast::ObjectName;
use crate::ast::{display_comma_separated, ObjectName, StructField};

use super::value::escape_single_quote_string;

Expand Down Expand Up @@ -71,6 +71,10 @@ pub enum DataType {
/// [standard]: https://jakewheat.github.io/sql-overview/sql-2016-foundation-grammar.html#binary-large-object-string-type
/// [Oracle]: https://docs.oracle.com/javadb/10.8.3.0/ref/rrefblob.html
Blob(Option<u64>),
/// Variable-length binary data with optional length.
///
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#bytes_type
Bytes(Option<u64>),
/// Numeric type with optional precision and scale e.g. NUMERIC(10,2), [standard][1]
///
/// [1]: https://jakewheat.github.io/sql-overview/sql-2016-foundation-grammar.html#exact-numeric-type
Expand Down Expand Up @@ -125,6 +129,10 @@ pub enum DataType {
///
/// [postgresql]: https://www.postgresql.org/docs/15/datatype.html
Int4(Option<u64>),
/// Integer type in [bigquery]
///
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
Int64,
/// Integer with optional display width e.g. INTEGER or INTEGER(11)
Integer(Option<u64>),
/// Unsigned int with optional display width e.g. INT UNSIGNED or INT(11) UNSIGNED
Expand All @@ -149,6 +157,10 @@ pub enum DataType {
///
/// [postgresql]: https://www.postgresql.org/docs/15/datatype.html
Float4,
/// Floating point in [bigquery]
///
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
Float64,
/// Floating point e.g. REAL
Real,
/// Float8 as alias for Double in [postgresql]
Expand Down Expand Up @@ -190,18 +202,23 @@ pub enum DataType {
Regclass,
/// Text
Text,
/// String
String,
/// String with optional length.
String(Option<u64>),
/// Bytea
Bytea,
/// Custom type such as enums
Custom(ObjectName, Vec<String>),
/// Arrays
Array(Option<Box<DataType>>),
Array(ArrayElemTypeDef),
/// Enums
Enum(Vec<String>),
/// Set
Set(Vec<String>),
/// Struct
///
/// [hive]: https://docs.cloudera.com/cdw-runtime/cloud/impala-sql-reference/topics/impala-struct.html
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
Struct(Vec<StructField>),
}

impl fmt::Display for DataType {
Expand Down Expand Up @@ -231,6 +248,7 @@ impl fmt::Display for DataType {
format_type_with_optional_length(f, "VARBINARY", size, false)
}
DataType::Blob(size) => format_type_with_optional_length(f, "BLOB", size, false),
DataType::Bytes(size) => format_type_with_optional_length(f, "BYTES", size, false),
DataType::Numeric(info) => {
write!(f, "NUMERIC{info}")
}
Expand Down Expand Up @@ -274,6 +292,9 @@ impl fmt::Display for DataType {
DataType::Int4(zerofill) => {
format_type_with_optional_length(f, "INT4", zerofill, false)
}
DataType::Int64 => {
write!(f, "INT64")
}
DataType::UnsignedInt4(zerofill) => {
format_type_with_optional_length(f, "INT4", zerofill, true)
}
Expand All @@ -297,6 +318,7 @@ impl fmt::Display for DataType {
}
DataType::Real => write!(f, "REAL"),
DataType::Float4 => write!(f, "FLOAT4"),
DataType::Float64 => write!(f, "FLOAT64"),
DataType::Double => write!(f, "DOUBLE"),
DataType::Float8 => write!(f, "FLOAT8"),
DataType::DoublePrecision => write!(f, "DOUBLE PRECISION"),
Expand All @@ -316,15 +338,13 @@ impl fmt::Display for DataType {
DataType::JSON => write!(f, "JSON"),
DataType::Regclass => write!(f, "REGCLASS"),
DataType::Text => write!(f, "TEXT"),
DataType::String => write!(f, "STRING"),
DataType::String(size) => format_type_with_optional_length(f, "STRING", size, false),
DataType::Bytea => write!(f, "BYTEA"),
DataType::Array(ty) => {
if let Some(t) = &ty {
write!(f, "{t}[]")
} else {
write!(f, "ARRAY")
}
}
DataType::Array(ty) => match ty {
ArrayElemTypeDef::None => write!(f, "ARRAY"),
ArrayElemTypeDef::SquareBracket(t) => write!(f, "{t}[]"),
ArrayElemTypeDef::AngleBracket(t) => write!(f, "ARRAY<{t}>"),
},
DataType::Custom(ty, modifiers) => {
if modifiers.is_empty() {
write!(f, "{ty}")
Expand Down Expand Up @@ -352,6 +372,13 @@ impl fmt::Display for DataType {
}
write!(f, ")")
}
DataType::Struct(fields) => {
if !fields.is_empty() {
write!(f, "STRUCT<{}>", display_comma_separated(fields))
} else {
write!(f, "STRUCT")
}
}
}
}
}
Expand Down Expand Up @@ -533,3 +560,19 @@ impl fmt::Display for CharLengthUnits {
}
}
}

/// Represents the data type of the elements in an array (if any) as well as
/// the syntax used to declare the array.
///
/// For example: Bigquery/Hive use `ARRAY<INT>` whereas snowflake uses `ARRAY`.

Check failure on line 567 in src/ast/data_type.rs

View workflow job for this annotation

GitHub Actions / docs

unclosed HTML tag `INT`
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum ArrayElemTypeDef {
/// ARRAY
None,
/// `ARRAY<INT>`

Check failure on line 574 in src/ast/data_type.rs

View workflow job for this annotation

GitHub Actions / docs

unclosed HTML tag `INT`
AngleBracket(Box<DataType>),
/// `INT[]`
SquareBracket(Box<DataType>),
}
58 changes: 57 additions & 1 deletion src/ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use serde::{Deserialize, Serialize};
use sqlparser_derive::{Visit, VisitMut};

pub use self::data_type::{
CharLengthUnits, CharacterLength, DataType, ExactNumberInfo, TimezoneInfo,
ArrayElemTypeDef, CharLengthUnits, CharacterLength, DataType, ExactNumberInfo, TimezoneInfo,
};
pub use self::dcl::{AlterRoleOperation, ResetConfig, RoleOption, SetConfigValue};
pub use self::ddl::{
Expand Down Expand Up @@ -322,6 +322,27 @@ impl fmt::Display for JsonOperator {
}
}

/// A field definition within a struct, e.g. in a [bigquery] `STRUCT` type.
///
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct StructField {
/// Optional name of the field; `None` for an unnamed field, in which case
/// only the field type is written when the field is displayed.
pub field_name: Option<Ident>,
/// Data type of the field.
pub field_type: DataType,
}

impl fmt::Display for StructField {
    /// Writes the field as `<name> <type>` when it has a name, and as just
    /// `<type>` when it is unnamed.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match &self.field_name {
            Some(name) => write!(f, "{name} {}", self.field_type),
            None => write!(f, "{}", self.field_type),
        }
    }
}

/// An SQL expression of any type.
///
/// The parser does not distinguish between expressions of different types
Expand Down Expand Up @@ -569,6 +590,26 @@ pub enum Expr {
Rollup(Vec<Vec<Expr>>),
/// ROW / TUPLE a single value, such as `SELECT (1, 2)`
Tuple(Vec<Expr>),
/// `BigQuery` specific `Struct` literal expression
/// Syntax:
/// ```sql
/// STRUCT<[field_name] field_type, ...>( expr1 [, ... ])
/// ```
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type>

Check failure on line 598 in src/ast/mod.rs

View workflow job for this annotation

GitHub Actions / docs

this URL is not a hyperlink
Struct {
/// Struct values.
values: Vec<Expr>,
/// Struct field definitions.
fields: Vec<StructField>,
},
/// `BigQuery` specific: A named expression in a typeless struct
///
/// Syntax
/// ```sql
/// 1 AS a
/// ```
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type>

Check failure on line 611 in src/ast/mod.rs

View workflow job for this annotation

GitHub Actions / docs

this URL is not a hyperlink
Named { expr: Box<Expr>, name: Ident },
/// An array index expression e.g. `(ARRAY[1, 2])[1]` or `(current_schemas(FALSE))[1]`
ArrayIndex { obj: Box<Expr>, indexes: Vec<Expr> },
/// An array expression e.g. `ARRAY[1, 2]`
Expand Down Expand Up @@ -917,6 +958,21 @@ impl fmt::Display for Expr {
Expr::Tuple(exprs) => {
write!(f, "({})", display_comma_separated(exprs))
}
Expr::Struct { values, fields } => {
if !fields.is_empty() {
write!(
f,
"STRUCT<{}>({})",
display_comma_separated(fields),
display_comma_separated(values)
)
} else {
write!(f, "STRUCT({})", display_comma_separated(values))
}
}
Expr::Named { expr, name } => {
write!(f, "{} AS {}", expr, name)
}
Expr::ArrayIndex { obj, indexes } => {
write!(f, "{obj}")?;
for i in indexes {
Expand Down
5 changes: 5 additions & 0 deletions src/keywords.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ define_keywords!(
BY,
BYPASSRLS,
BYTEA,
BYTES,
CACHE,
CALL,
CALLED,
Expand Down Expand Up @@ -270,6 +271,7 @@ define_keywords!(
FIRST_VALUE,
FLOAT,
FLOAT4,
FLOAT64,
FLOAT8,
FLOOR,
FOLLOWING,
Expand All @@ -293,6 +295,7 @@ define_keywords!(
FUSION,
GENERATE,
GENERATED,
GEOGRAPHY,
GET,
GLOBAL,
GRANT,
Expand Down Expand Up @@ -328,6 +331,7 @@ define_keywords!(
INT,
INT2,
INT4,
INT64,
INT8,
INTEGER,
INTERSECT,
Expand Down Expand Up @@ -580,6 +584,7 @@ define_keywords!(
STORED,
STRICT,
STRING,
STRUCT,
SUBMULTISET,
SUBSTRING,
SUBSTRING_REGEX,
Expand Down
Loading

0 comments on commit ee8c6f6

Please sign in to comment.