Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support the custom terminator for the CSV file format #12263

Merged
merged 7 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datafusion-examples/examples/csv_opener.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ async fn main() -> Result<()> {
true,
b',',
b'"',
None,
object_store,
Some(b'#'),
);
Expand Down
13 changes: 13 additions & 0 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1628,6 +1628,7 @@ config_namespace! {
pub has_header: Option<bool>, default = None
pub delimiter: u8, default = b','
pub quote: u8, default = b'"'
pub terminator: Option<u8>, default = None
pub escape: Option<u8>, default = None
pub double_quote: Option<bool>, default = None
/// Specifies whether newlines in (quoted) values are supported.
Expand Down Expand Up @@ -1696,6 +1697,13 @@ impl CsvOptions {
self
}

/// The character that terminates a row.
/// - default to None (CRLF)
pub fn with_terminator(mut self, terminator: Option<u8>) -> Self {
self.terminator = terminator;
self
}

/// The escape character in a row.
/// - default is None
pub fn with_escape(mut self, escape: Option<u8>) -> Self {
Expand Down Expand Up @@ -1742,6 +1750,11 @@ impl CsvOptions {
self.quote
}

/// The terminator character.
pub fn terminator(&self) -> Option<u8> {
self.terminator
}

/// The escape character.
pub fn escape(&self) -> Option<u8> {
self.escape
Expand Down
8 changes: 8 additions & 0 deletions datafusion/core/src/datasource/file_format/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,13 @@ impl CsvFormat {
self
}

/// Sets the byte that signals the end of a row and returns the updated format.
/// - defaults to `None` (CRLF)
pub fn with_terminator(self, terminator: Option<u8>) -> Self {
    // Take ownership under a mutable name, update the nested option, hand it back.
    let mut format = self;
    format.options.terminator = terminator;
    format
}

/// Specifies whether newlines in (quoted) values are supported.
///
/// Parsing newlines in quoted values may be affected by execution behaviour such as
Expand Down Expand Up @@ -359,6 +366,7 @@ impl FileFormat for CsvFormat {
.with_has_header(has_header)
.with_delimeter(self.options.delimiter)
.with_quote(self.options.quote)
.with_terminator(self.options.terminator)
.with_escape(self.options.escape)
.with_comment(self.options.comment)
.with_newlines_in_values(newlines_in_values)
Expand Down
10 changes: 10 additions & 0 deletions datafusion/core/src/datasource/file_format/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ pub struct CsvReadOptions<'a> {
pub delimiter: u8,
/// The quote character. Defaults to `b'"'`.
pub quote: u8,
/// An optional terminator character. Defaults to None (CRLF).
pub terminator: Option<u8>,
/// An optional escape character. Defaults to None.
pub escape: Option<u8>,
/// If enabled, lines beginning with this byte are ignored.
Expand Down Expand Up @@ -102,6 +104,7 @@ impl<'a> CsvReadOptions<'a> {
schema_infer_max_records: DEFAULT_SCHEMA_INFER_MAX_RECORD,
delimiter: b',',
quote: b'"',
terminator: None,
escape: None,
newlines_in_values: false,
file_extension: DEFAULT_CSV_EXTENSION,
Expand Down Expand Up @@ -136,6 +139,12 @@ impl<'a> CsvReadOptions<'a> {
self
}

/// Specify terminator to use for CSV read
pub fn terminator(mut self, terminator: Option<u8>) -> Self {
self.terminator = terminator;
self
}

/// Specify escape character to use for CSV read
pub fn escape(mut self, escape: u8) -> Self {
self.escape = Some(escape);
Expand Down Expand Up @@ -511,6 +520,7 @@ impl ReadOptions<'_> for CsvReadOptions<'_> {
.with_delimiter(self.delimiter)
.with_quote(self.quote)
.with_escape(self.escape)
.with_terminator(self.terminator)
.with_newlines_in_values(self.newlines_in_values)
.with_schema_infer_max_rec(self.schema_infer_max_records)
.with_file_compression_type(self.file_compression_type.to_owned());
Expand Down
Loading