From 9ef47a8eab34dc3a5de006c7dcc68e47b8f37b4b Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Sun, 12 Nov 2023 11:07:47 +1100 Subject: [PATCH 1/5] Allow writing null valued keys in JSON --- arrow-array/src/numeric.rs | 1 - arrow-json/src/writer.rs | 489 +++++++++++++++++--- arrow-json/test/data/nested_with_nulls.json | 4 + arrow/src/ffi.rs | 2 - arrow/tests/array_cast.rs | 1 - object_store/src/gcp/builder.rs | 2 +- 6 files changed, 420 insertions(+), 79 deletions(-) create mode 100644 arrow-json/test/data/nested_with_nulls.json diff --git a/arrow-array/src/numeric.rs b/arrow-array/src/numeric.rs index ad7b3eca1dbc..b5e474ba696a 100644 --- a/arrow-array/src/numeric.rs +++ b/arrow-array/src/numeric.rs @@ -618,7 +618,6 @@ mod tests { let mask = 0b01010101_01010101_10101010_10101010; let actual = UInt16Type::mask_from_u64(mask); let expected = expected_mask!(i16, mask); - dbg!(&expected); let expected = m16x32::from_cast(i16x32::from_slice_unaligned(expected.as_slice())); assert_eq!(expected, actual); diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 5ecfc932364b..83bab3b0ff12 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -92,8 +92,13 @@ //! let buf = writer.into_inner(); //! assert_eq!(r#"[{"a":1},{"a":2},{"a":3}]"#, String::from_utf8(buf).unwrap()) //! ``` +//! +//! [`LineDelimitedWriter`] and [`ArrayWriter`] will omit writing keys with null values. +//! In order to explicitly write null values for keys, use the alternative +//! [`LineDelimitedWriterWithNulls`] and [`ArrayWriterWithNulls`] versions. use std::iter; +use std::marker::PhantomData; use std::{fmt::Debug, io::Write}; use serde_json::map::Map as JsonMap; @@ -107,6 +112,68 @@ use arrow_schema::*; use arrow_cast::display::{ArrayFormatter, FormatOptions}; +/// This trait controls whether null values should be written explicitly +/// for keys in objects, or whether the key should be omitted entirely. +pub trait NullHandler { + /// How to insert a maybe null into a JSON object + fn insert_value(row: &mut JsonMap, col_name: &str, value: Option); + + /// How to handle inserting a [`NullArray`] into a list of JSON objects. + fn insert_null_array(rows: &mut [JsonMap], col_name: &str); +} + +/// Skips writing keys that have null values. +/// +/// For example, with [`LineDelimited`] format: +/// +/// ```json +/// {"foo":1} +/// {"foo":1,"bar":2} +/// {} +/// ``` +pub struct SkipNulls {} + +impl NullHandler for SkipNulls { + #[inline] + fn insert_value(row: &mut JsonMap, col_name: &str, value: Option) { + if let Some(j) = value { + row.insert(col_name.to_string(), j); + } + } + + #[inline] + fn insert_null_array(_rows: &mut [JsonMap], _col_namee: &str) {} +} + +/// Writes keys that have null values. 
+/// +/// For example, with [`LineDelimited`] format: +/// +/// ```json +/// {"foo":1,"bar":null} +/// {"foo":1,"bar":2} +/// {"foo":null,"bar":null} +/// ``` +pub struct KeepNulls {} + +impl NullHandler for KeepNulls { + #[inline] + fn insert_value(row: &mut JsonMap, col_name: &str, value: Option) { + if let Some(j) = value { + row.insert(col_name.to_string(), j); + } else { + row.insert(col_name.to_string(), Value::Null); + } + } + + #[inline] + fn insert_null_array(rows: &mut [JsonMap], col_name: &str) { + rows.iter_mut().for_each(|row| { + row.insert(col_name.to_string(), Value::Null); + }); + } +} + fn primitive_array_to_json(array: &dyn Array) -> Result, ArrowError> where T: ArrowPrimitiveType, @@ -122,7 +189,7 @@ where .collect()) } -fn struct_array_to_jsonmap_array( +fn struct_array_to_jsonmap_array( array: &StructArray, ) -> Result>, ArrowError> { let inner_col_names = array.column_names(); @@ -132,13 +199,20 @@ fn struct_array_to_jsonmap_array( .collect::>>(); for (j, struct_col) in array.columns().iter().enumerate() { - set_column_for_json_rows(&mut inner_objs, struct_col, inner_col_names[j])? + set_column_for_json_rows::(&mut inner_objs, struct_col, inner_col_names[j])? } Ok(inner_objs) } /// Converts an arrow [`Array`] into a `Vec` of Serde JSON [`serde_json::Value`]'s pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> { + // For backwards compatibility, default to SkipNulls + array_to_json_array_internal::(array) +} + +fn array_to_json_array_internal( + array: &dyn Array, +) -> Result, ArrowError> { match array.data_type() { DataType::Null => Ok(iter::repeat(Value::Null).take(array.len()).collect()), DataType::Boolean => Ok(array @@ -180,32 +254,32 @@ pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> DataType::List(_) => as_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal::(&v)?)), None => Ok(Value::Null), }) .collect(), DataType::LargeList(_) => as_large_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal::(&v)?)), None => Ok(Value::Null), }) .collect(), DataType::FixedSizeList(_, _) => as_fixed_size_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal::(&v)?)), None => Ok(Value::Null), }) .collect(), DataType::Struct(_) => { - let jsonmaps = struct_array_to_jsonmap_array(array.as_struct())?; + let jsonmaps = struct_array_to_jsonmap_array::(array.as_struct())?; Ok(jsonmaps.into_iter().map(Value::Object).collect()) } DataType::Map(_, _) => as_map_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal::(&v)?)), None => Ok(Value::Null), }) .collect(), @@ -216,89 +290,85 @@ pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> } macro_rules! 
set_column_by_array_type { - ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident) => { + ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident, $null_handler:ty) => { let arr = $cast_fn($array); $rows .iter_mut() .zip(arr.iter()) .for_each(|(row, maybe_value)| { - if let Some(v) = maybe_value { - row.insert($col_name.to_string(), v.into()); - } + <$null_handler>::insert_value(row, $col_name, maybe_value.map(Into::into)) }); }; } -fn set_column_by_primitive_type( +fn set_column_by_primitive_type( rows: &mut [JsonMap], array: &ArrayRef, col_name: &str, ) where T: ArrowPrimitiveType, T::Native: JsonSerializable, + N: NullHandler, { let primitive_arr = array.as_primitive::(); rows.iter_mut() .zip(primitive_arr.iter()) .for_each(|(row, maybe_value)| { - // when value is null, we simply skip setting the key - if let Some(j) = maybe_value.and_then(|v| v.into_json_value()) { - row.insert(col_name.to_string(), j); - } + N::insert_value(row, col_name, maybe_value.and_then(|v| v.into_json_value())) }); } -fn set_column_for_json_rows( +fn set_column_for_json_rows( rows: &mut [JsonMap], array: &ArrayRef, col_name: &str, ) -> Result<(), ArrowError> { match array.data_type() { DataType::Int8 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Int16 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Int32 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Int64 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::UInt8 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::UInt16 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::UInt32 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::UInt64 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Float16 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Float32 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Float64 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Null => { - // when value is null, we simply skip setting the key + N::insert_null_array(rows, col_name); } DataType::Boolean => { - set_column_by_array_type!(as_boolean_array, col_name, rows, array); + set_column_by_array_type!(as_boolean_array, col_name, rows, array, N); } DataType::Utf8 => { - set_column_by_array_type!(as_string_array, col_name, rows, array); + set_column_by_array_type!(as_string_array, col_name, rows, array, N); } DataType::LargeUtf8 => { - set_column_by_array_type!(as_largestring_array, col_name, rows, array); + set_column_by_array_type!(as_largestring_array, col_name, rows, array, N); } DataType::Date32 | DataType::Date64 @@ -310,16 +380,15 @@ fn set_column_for_json_rows( let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?; let nulls = array.nulls(); 
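 // The formatter renders each temporal value as a string; the validity bitmap
 // gates whether a value is produced, and the NullHandler then decides between
 // omitting the key and writing an explicit null.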
rows.iter_mut().enumerate().for_each(|(idx, row)| { - if nulls.map(|x| x.is_valid(idx)).unwrap_or(true) { - row.insert( - col_name.to_string(), - formatter.value(idx).to_string().into(), - ); - } + let maybe_value = nulls + .map(|x| x.is_valid(idx)) + .unwrap_or(true) + .then(|| formatter.value(idx).to_string().into()); + N::insert_value(row, col_name, maybe_value); }); } DataType::Struct(_) => { - let inner_objs = struct_array_to_jsonmap_array(array.as_struct())?; + let inner_objs = struct_array_to_jsonmap_array::(array.as_struct())?; rows.iter_mut().zip(inner_objs).for_each(|(row, obj)| { row.insert(col_name.to_string(), Value::Object(obj)); }); @@ -328,9 +397,10 @@ fn set_column_for_json_rows( let listarr = as_list_array(array); rows.iter_mut().zip(listarr.iter()).try_for_each( |(row, maybe_value)| -> Result<(), ArrowError> { - if let Some(v) = maybe_value { - row.insert(col_name.to_string(), Value::Array(array_to_json_array(&v)?)); - } + let maybe_value = maybe_value + .map(|v| array_to_json_array_internal::(&v).map(Value::Array)) + .transpose()?; + N::insert_value(row, col_name, maybe_value); Ok(()) }, )?; @@ -339,10 +409,10 @@ fn set_column_for_json_rows( let listarr = as_large_list_array(array); rows.iter_mut().zip(listarr.iter()).try_for_each( |(row, maybe_value)| -> Result<(), ArrowError> { - if let Some(v) = maybe_value { - let val = array_to_json_array(&v)?; - row.insert(col_name.to_string(), Value::Array(val)); - } + let maybe_value = maybe_value + .map(|v| array_to_json_array_internal::(&v).map(Value::Array)) + .transpose()?; + N::insert_value(row, col_name, maybe_value); Ok(()) }, )?; @@ -350,7 +420,7 @@ fn set_column_for_json_rows( DataType::Dictionary(_, value_type) => { let hydrated = arrow_cast::cast::cast(&array, value_type) .expect("cannot cast dictionary to underlying values"); - set_column_for_json_rows(rows, &hydrated, col_name)?; + set_column_for_json_rows::(rows, &hydrated, col_name)?; } DataType::Map(_, _) => { let maparr = as_map_array(array); @@ -367,7 +437,7 @@ fn set_column_for_json_rows( } let keys = keys.as_string::(); - let values = array_to_json_array(values)?; + let values = array_to_json_array_internal::(values)?; let mut kv = keys.iter().zip(values); @@ -401,6 +471,13 @@ fn set_column_for_json_rows( /// [`JsonMap`]s (objects) pub fn record_batches_to_json_rows( batches: &[&RecordBatch], +) -> Result>, ArrowError> { + // For backwards compatibility, default to SkipNulls + record_batches_to_json_rows_internal::(batches) +} + +fn record_batches_to_json_rows_internal( + batches: &[&RecordBatch], ) -> Result>, ArrowError> { let mut rows: Vec> = iter::repeat(JsonMap::new()) .take(batches.iter().map(|b| b.num_rows()).sum()) @@ -414,7 +491,7 @@ pub fn record_batches_to_json_rows( let row_slice = &mut rows[base..base + batch.num_rows()]; for (j, col) in batch.columns().iter().enumerate() { let col_name = schema.field(j).name(); - set_column_for_json_rows(row_slice, col, col_name)? + set_column_for_json_rows::(row_slice, col, col_name)? } base += row_count; } @@ -450,7 +527,9 @@ pub trait JsonFormat: Debug + Default { } } -/// Produces JSON output with one record per line. For example +/// Produces JSON output with one record per line. +/// +/// For example: /// /// ```json /// {"foo":1} @@ -467,7 +546,9 @@ impl JsonFormat for LineDelimited { } } -/// Produces JSON output as a single JSON array. For example +/// Produces JSON output as a single JSON array. 
+/// +/// For example: /// /// ```json /// [{"foo":1},{"bar":1}] @@ -494,21 +575,36 @@ impl JsonFormat for JsonArray { } } -/// A JSON writer which serializes [`RecordBatch`]es to newline delimited JSON objects -pub type LineDelimitedWriter = Writer; +/// A JSON writer which serializes [`RecordBatch`]es to newline delimited JSON objects. +/// +/// Will skip writing keys with null values. +pub type LineDelimitedWriter = Writer; -/// A JSON writer which serializes [`RecordBatch`]es to JSON arrays -pub type ArrayWriter = Writer; +/// Similar to [`LineDelimitedWriter`] but will keep keys with null values. +pub type LineDelimitedWriterWithNulls = Writer; + +/// A JSON writer which serializes [`RecordBatch`]es to JSON arrays. +/// +/// Will skip writing keys with null values. +pub type ArrayWriter = Writer; + +/// Similar to [`ArrayWriter`] but will keep keys with null values. +pub type ArrayWriterWithNulls = Writer; /// A JSON writer which serializes [`RecordBatch`]es to a stream of -/// `u8` encoded JSON objects. See the module level documentation for -/// detailed usage and examples. The specific format of the stream is -/// controlled by the [`JsonFormat`] type parameter. +/// `u8` encoded JSON objects. +/// +/// See the module level documentation for detailed usage and examples. +/// The specific format of the stream is controlled by the [`JsonFormat`] +/// type parameter. The [`NullHandler`] type parameter controls whether +/// nulls should be written explicitly for keys or skipped. Default is +/// [`SkipNulls`] for backward compatibility. #[derive(Debug)] -pub struct Writer +pub struct Writer where W: Write, F: JsonFormat, + N: NullHandler, { /// Underlying writer to use to write bytes writer: W, @@ -521,12 +617,16 @@ where /// Determines how the byte stream is formatted format: F, + + /// Determines whether nulls should be written for keys or omitted + null_handler: PhantomData, } -impl Writer +impl Writer where W: Write, F: JsonFormat, + N: NullHandler, { /// Construct a new writer pub fn new(writer: W) -> Self { @@ -535,6 +635,7 @@ where started: false, finished: false, format: F::default(), + null_handler: Default::default(), } } @@ -556,7 +657,7 @@ where /// Convert the `RecordBatch` into JSON rows, and write them to the output pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { - for row in record_batches_to_json_rows(&[batch])? { + for row in record_batches_to_json_rows_internal::(&[batch])? { self.write_row(&Value::Object(row))?; } Ok(()) @@ -564,7 +665,7 @@ where /// Convert the [`RecordBatch`] into JSON rows, and write them to the output pub fn write_batches(&mut self, batches: &[&RecordBatch]) -> Result<(), ArrowError> { - for row in record_batches_to_json_rows(batches)? { + for row in record_batches_to_json_rows_internal::(batches)? 
{ self.write_row(&Value::Object(row))?; } Ok(()) @@ -587,10 +688,11 @@ where } } -impl RecordBatchWriter for Writer +impl RecordBatchWriter for Writer where W: Write, F: JsonFormat, + N: NullHandler, { fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { self.write(batch) @@ -609,7 +711,7 @@ mod tests { use serde_json::json; - use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; + use arrow_array::builder::{Int32Builder, Int64Builder, MapBuilder, StringBuilder}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::ArrayData; @@ -1203,7 +1305,7 @@ mod tests { ); } - fn test_write_for_file(test_file: &str) { + fn test_write_for_file(test_file: &str, remove_nulls: bool) { let file = File::open(test_file).unwrap(); let mut reader = BufReader::new(file); let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); @@ -1215,18 +1317,25 @@ mod tests { let mut buf = Vec::new(); { - let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[&batch]).unwrap(); + if remove_nulls { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[&batch]).unwrap(); + } else { + let mut writer = LineDelimitedWriterWithNulls::new(&mut buf); + writer.write_batches(&[&batch]).unwrap(); + } } let result = String::from_utf8(buf).unwrap(); let expected = read_to_string(test_file).unwrap(); for (r, e) in result.lines().zip(expected.lines()) { let mut expected_json = serde_json::from_str::(e).unwrap(); - // remove null value from object to make comparison consistent: - if let Value::Object(obj) = expected_json { - expected_json = - Value::Object(obj.into_iter().filter(|(_, v)| *v != Value::Null).collect()); + if remove_nulls { + // remove null value from object to make comparison consistent: + if let Value::Object(obj) = expected_json { + expected_json = + Value::Object(obj.into_iter().filter(|(_, v)| *v != Value::Null).collect()); + } } assert_eq!(serde_json::from_str::(r).unwrap(), expected_json,); } @@ -1234,17 +1343,22 @@ mod tests { #[test] fn write_basic_rows() { - test_write_for_file("test/data/basic.json"); + test_write_for_file("test/data/basic.json", true); } #[test] fn write_arrays() { - test_write_for_file("test/data/arrays.json"); + test_write_for_file("test/data/arrays.json", true); } #[test] fn write_basic_nulls() { - test_write_for_file("test/data/basic_nulls.json"); + test_write_for_file("test/data/basic_nulls.json", true); + } + + #[test] + fn write_nested_with_nulls() { + test_write_for_file("test/data/nested_with_nulls.json", false); } #[test] @@ -1530,4 +1644,231 @@ mod tests { assert_eq!(array_to_json_array(&map_array).unwrap(), expected_json); } + + #[test] + fn test_writer_keep_nulls() -> Result<(), ArrowError> { + fn nested_list() -> (Arc, Arc) { + let array = Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![None, None, None]), + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![None, None, None]), + ])); + let field = Arc::new(Field::new("list", array.data_type().clone(), true)); + // [{"list":[null,null,null]},{"list":[1,2,3]},{"list":null},{"list":[null,null,null]}] + (array, field) + } + + fn nested_dict() -> (Arc>, Arc) { + let array = Arc::new(DictionaryArray::from_iter(vec![ + Some("cupcakes"), + None, + Some("bear"), + Some("kuma"), + ])); + let field = Arc::new(Field::new("dict", array.data_type().clone(), true)); + // [{"dict":"cupcakes"},{"dict":null},{"dict":"bear"},{"dict":"kuma"}] + (array, field) + } + + fn nested_map() -> (Arc, Arc) { + let string_builder = StringBuilder::new(); 
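+            // MapBuilder pairs a key builder with a value builder: `append(true)`
+            // completes the current entry (with no pending keys this yields `{}`),
+            // while `append(false)` records a null map.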
+            let int_builder = Int64Builder::new();
+            let mut builder = MapBuilder::new(None, string_builder, int_builder);
+
+            // [{"foo": 10}, null, {}, {"bar": 20, "baz": 30, "qux": 40}]
+            builder.keys().append_value("foo");
+            builder.values().append_value(10);
+            builder.append(true).unwrap();
+
+            builder.append(false).unwrap();
+
+            builder.append(true).unwrap();
+
+            builder.keys().append_value("bar");
+            builder.values().append_value(20);
+            builder.keys().append_value("baz");
+            builder.values().append_value(30);
+            builder.keys().append_value("qux");
+            builder.values().append_value(40);
+            builder.append(true).unwrap();
+
+            let array = Arc::new(builder.finish());
+            let field = Arc::new(Field::new("map", array.data_type().clone(), true));
+            (array, field)
+        }
+
+        fn root_list() -> (Arc<dyn Array>, Field) {
+            let struct_array = StructArray::from(vec![
+                (
+                    Arc::new(Field::new("utf8", DataType::Utf8, true)),
+                    Arc::new(StringArray::from(vec![Some("a"), Some("b"), None, None])) as ArrayRef,
+                ),
+                (
+                    Arc::new(Field::new("int32", DataType::Int32, true)),
+                    Arc::new(Int32Array::from(vec![Some(1), None, Some(5), None])) as ArrayRef,
+                ),
+            ]);
+
+            let field = Field::new_list(
+                "list",
+                Field::new("struct", struct_array.data_type().clone(), true),
+                true,
+            );
+
+            // [{"list":[{"int32":1,"utf8":"a"},{"int32":null,"utf8":"b"}]},{"list":null},{"list":[{"int32":5,"utf8":null}]},{"list":null}]
+            let entry_offsets = Buffer::from(&[0, 2, 2, 3, 3].to_byte_slice());
+            let data = ArrayData::builder(field.data_type().clone())
+                .len(4)
+                .add_buffer(entry_offsets)
+                .add_child_data(struct_array.into_data())
+                .null_bit_buffer(Some([0b00000101].into()))
+                .build()
+                .unwrap();
+            let array = Arc::new(ListArray::from(data));
+            (array, field)
+        }
+
+        let (nested_list_array, nested_list_field) = nested_list();
+        let (nested_dict_array, nested_dict_field) = nested_dict();
+        let (nested_map_array, nested_map_field) = nested_map();
+        let (root_list_array, root_list_field) = root_list();
+
+        let schema = Schema::new(vec![
+            Field::new("date", DataType::Date32, true),
+            Field::new("null", DataType::Null, true),
+            Field::new_struct(
+                "struct",
+                vec![
+                    Arc::new(Field::new("utf8", DataType::Utf8, true)),
+                    nested_list_field.clone(),
+                    nested_dict_field.clone(),
+                    nested_map_field.clone(),
+                ],
+                true,
+            ),
+            root_list_field,
+        ]);
+
+        let arr_date32 = Date32Array::from(vec![Some(0), None, Some(1), None]);
+        let arr_null = NullArray::new(4);
+        let arr_struct = StructArray::from(vec![
+            // [{"utf8":"a"},{"utf8":null},{"utf8":null},{"utf8":"b"}]
+            (
+                Arc::new(Field::new("utf8", DataType::Utf8, true)),
+                Arc::new(StringArray::from(vec![Some("a"), None, None, Some("b")])) as ArrayRef,
+            ),
+            // [{"list":[null,null,null]},{"list":[1,2,3]},{"list":null},{"list":[null,null,null]}]
+            (nested_list_field, nested_list_array as ArrayRef),
+            // [{"dict":"cupcakes"},{"dict":null},{"dict":"bear"},{"dict":"kuma"}]
+            (nested_dict_field, nested_dict_array as ArrayRef),
+            // [{"foo": 10}, null, {}, {"bar": 20, "baz": 30, "qux": 40}]
+            (nested_map_field, nested_map_array as ArrayRef),
+        ]);
+
+        let batch = RecordBatch::try_new(
+            Arc::new(schema),
+            vec![
+                // [{"date":"1970-01-01"},{"date":null},{"date":"1970-01-02"},{"date":null}]
+                Arc::new(arr_date32),
+                // [{"null":null},{"null":null},{"null":null},{"null":null}]
+                Arc::new(arr_null),
+                Arc::new(arr_struct),
+                // [{"list":[{"int32":1,"utf8":"a"},{"int32":null,"utf8":"b"}]},{"list":null},{"list":[{"int32":5,"utf8":null}]},{"list":null}]
+                root_list_array,
+            ],
+        )?;
+
+        let mut buf = Vec::new();
+        {
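+            // Use the null-keeping writer here so the expected JSON below retains
+            // explicit nulls for the date, struct, and list columns.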
let mut writer = ArrayWriterWithNulls::new(&mut buf); + writer.write_batches(&[&batch])?; + writer.finish()?; + } + + let actual = serde_json::from_slice::>(&buf).unwrap(); + let expected = serde_json::from_value::>(json!([ + { + "date": "1970-01-01", + "list": [ + { + "int32": 1, + "utf8": "a" + }, + { + "int32": null, + "utf8": "b" + } + ], + "null": null, + "struct": { + "dict": "cupcakes", + "list": [ + null, + null, + null + ], + "map": { + "foo": 10 + }, + "utf8": "a" + } + }, + { + "date": null, + "list": null, + "null": null, + "struct": { + "dict": null, + "list": [ + 1, + 2, + 3 + ], + "map": null, + "utf8": null + } + }, + { + "date": "1970-01-02", + "list": [ + { + "int32": 5, + "utf8": null + } + ], + "null": null, + "struct": { + "dict": "bear", + "list": null, + "map": {}, + "utf8": null + } + }, + { + "date": null, + "list": null, + "null": null, + "struct": { + "dict": "kuma", + "list": [ + null, + null, + null + ], + "map": { + "bar": 20, + "baz": 30, + "qux": 40 + }, + "utf8": "b" + } + } + ])) + .unwrap(); + + assert_eq!(actual, expected); + + Ok(()) + } } diff --git a/arrow-json/test/data/nested_with_nulls.json b/arrow-json/test/data/nested_with_nulls.json new file mode 100644 index 000000000000..932565d56063 --- /dev/null +++ b/arrow-json/test/data/nested_with_nulls.json @@ -0,0 +1,4 @@ +{"a": null, "b": null, "c": null, "d": {"d1": null, "d2": [null, 1, 2, null]}} +{"a": null, "b": -3.5, "c": true, "d": {"d1": null, "d2": null}} +{"a": null, "b": null, "c": false, "d": {"d1": "1970-01-01", "d2": null}} +{"a": 1, "b": 2.0, "c": false, "d": {"d1": null, "d2": null}} diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index c13d4c6e5dff..7e4a7bbf2ede 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -617,8 +617,6 @@ mod tests { .downcast_ref::>() .unwrap(); - dbg!(&array); - // verify let expected = GenericListArray::::from(list_data); assert_eq!(&array.value(0), &expected.value(0)); diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index bfe16db5cc4d..c73f4f50ac01 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -47,7 +47,6 @@ fn test_cast_timestamp_to_string() { let a = TimestampMillisecondArray::from(vec![Some(864000000005), Some(1545696000001), None]) .with_timezone("UTC".to_string()); let array = Arc::new(a) as ArrayRef; - dbg!(&array); let b = cast(&array, &DataType::Utf8).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(&DataType::Utf8, c.data_type()); diff --git a/object_store/src/gcp/builder.rs b/object_store/src/gcp/builder.rs index 5f718d63d94a..7417ea4c8a50 100644 --- a/object_store/src/gcp/builder.rs +++ b/object_store/src/gcp/builder.rs @@ -605,7 +605,7 @@ mod tests { .with_bucket_name("foo") .with_proxy_url("https://example.com") .build(); - assert!(dbg!(gcs).is_ok()); + assert!(gcs.is_ok()); let err = GoogleCloudStorageBuilder::new() .with_service_account_path(service_account_path.to_str().unwrap()) From 2413c215793d01a52dbc9614bedbadc520c03f26 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Sun, 12 Nov 2023 15:09:38 +1100 Subject: [PATCH 2/5] Trigger From 0cdd05673afad64edc1734f1f3076a7593c15c32 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Mon, 20 Nov 2023 22:28:45 +1100 Subject: [PATCH 3/5] Refactor keep nulls to be runtime config --- arrow-json/src/lib.rs | 2 +- arrow-json/src/writer.rs | 307 ++++++++++++++++++++++----------------- 2 files changed, 173 insertions(+), 136 deletions(-) diff --git 
a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs index e69eaaba3ef8..e39882e52620 100644 --- a/arrow-json/src/lib.rs +++ b/arrow-json/src/lib.rs @@ -82,7 +82,7 @@ pub type RawReader = Reader; pub type RawReaderBuilder = ReaderBuilder; pub use self::reader::{Reader, ReaderBuilder}; -pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer}; +pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer, WriterBuilder}; use half::f16; use serde_json::{Number, Value}; diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 83bab3b0ff12..02585fda4725 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -94,11 +94,10 @@ //! ``` //! //! [`LineDelimitedWriter`] and [`ArrayWriter`] will omit writing keys with null values. -//! In order to explicitly write null values for keys, use the alternative -//! [`LineDelimitedWriterWithNulls`] and [`ArrayWriterWithNulls`] versions. +//! In order to explicitly write null values for keys, configure a custom [`Writer`] by +//! using a [`WriterBuilder`] to construct a [`Writer`]. use std::iter; -use std::marker::PhantomData; use std::{fmt::Debug, io::Write}; use serde_json::map::Map as JsonMap; @@ -112,68 +111,6 @@ use arrow_schema::*; use arrow_cast::display::{ArrayFormatter, FormatOptions}; -/// This trait controls whether null values should be written explicitly -/// for keys in objects, or whether the key should be omitted entirely. -pub trait NullHandler { - /// How to insert a maybe null into a JSON object - fn insert_value(row: &mut JsonMap, col_name: &str, value: Option); - - /// How to handle inserting a [`NullArray`] into a list of JSON objects. - fn insert_null_array(rows: &mut [JsonMap], col_name: &str); -} - -/// Skips writing keys that have null values. -/// -/// For example, with [`LineDelimited`] format: -/// -/// ```json -/// {"foo":1} -/// {"foo":1,"bar":2} -/// {} -/// ``` -pub struct SkipNulls {} - -impl NullHandler for SkipNulls { - #[inline] - fn insert_value(row: &mut JsonMap, col_name: &str, value: Option) { - if let Some(j) = value { - row.insert(col_name.to_string(), j); - } - } - - #[inline] - fn insert_null_array(_rows: &mut [JsonMap], _col_namee: &str) {} -} - -/// Writes keys that have null values. -/// -/// For example, with [`LineDelimited`] format: -/// -/// ```json -/// {"foo":1,"bar":null} -/// {"foo":1,"bar":2} -/// {"foo":null,"bar":null} -/// ``` -pub struct KeepNulls {} - -impl NullHandler for KeepNulls { - #[inline] - fn insert_value(row: &mut JsonMap, col_name: &str, value: Option) { - if let Some(j) = value { - row.insert(col_name.to_string(), j); - } else { - row.insert(col_name.to_string(), Value::Null); - } - } - - #[inline] - fn insert_null_array(rows: &mut [JsonMap], col_name: &str) { - rows.iter_mut().for_each(|row| { - row.insert(col_name.to_string(), Value::Null); - }); - } -} - fn primitive_array_to_json(array: &dyn Array) -> Result, ArrowError> where T: ArrowPrimitiveType, @@ -189,8 +126,9 @@ where .collect()) } -fn struct_array_to_jsonmap_array( +fn struct_array_to_jsonmap_array( array: &StructArray, + keep_nulls: bool, ) -> Result>, ArrowError> { let inner_col_names = array.column_names(); @@ -199,19 +137,20 @@ fn struct_array_to_jsonmap_array( .collect::>>(); for (j, struct_col) in array.columns().iter().enumerate() { - set_column_for_json_rows::(&mut inner_objs, struct_col, inner_col_names[j])? + set_column_for_json_rows(&mut inner_objs, struct_col, inner_col_names[j], keep_nulls)? 
} Ok(inner_objs) } /// Converts an arrow [`Array`] into a `Vec` of Serde JSON [`serde_json::Value`]'s pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> { - // For backwards compatibility, default to SkipNulls - array_to_json_array_internal::(array) + // For backwards compatibility, default to skip nulls + array_to_json_array_internal(array, false) } -fn array_to_json_array_internal( +fn array_to_json_array_internal( array: &dyn Array, + keep_nulls: bool, ) -> Result, ArrowError> { match array.data_type() { DataType::Null => Ok(iter::repeat(Value::Null).take(array.len()).collect()), @@ -254,32 +193,32 @@ fn array_to_json_array_internal( DataType::List(_) => as_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array_internal::(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal(&v, keep_nulls)?)), None => Ok(Value::Null), }) .collect(), DataType::LargeList(_) => as_large_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array_internal::(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal(&v, keep_nulls)?)), None => Ok(Value::Null), }) .collect(), DataType::FixedSizeList(_, _) => as_fixed_size_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array_internal::(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal(&v, keep_nulls)?)), None => Ok(Value::Null), }) .collect(), DataType::Struct(_) => { - let jsonmaps = struct_array_to_jsonmap_array::(array.as_struct())?; + let jsonmaps = struct_array_to_jsonmap_array(array.as_struct(), keep_nulls)?; Ok(jsonmaps.into_iter().map(Value::Object).collect()) } DataType::Map(_, _) => as_map_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array_internal::(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal(&v, keep_nulls)?)), None => Ok(Value::Null), }) .collect(), @@ -290,85 +229,98 @@ fn array_to_json_array_internal( } macro_rules! 
set_column_by_array_type { - ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident, $null_handler:ty) => { + ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident, $keep_nulls:ident) => { let arr = $cast_fn($array); $rows .iter_mut() .zip(arr.iter()) .for_each(|(row, maybe_value)| { - <$null_handler>::insert_value(row, $col_name, maybe_value.map(Into::into)) + if let Some(j) = maybe_value.map(Into::into) { + row.insert($col_name.to_string(), j); + } else if $keep_nulls { + row.insert($col_name.to_string(), Value::Null); + } }); }; } -fn set_column_by_primitive_type( +fn set_column_by_primitive_type( rows: &mut [JsonMap], array: &ArrayRef, col_name: &str, + keep_nulls: bool, ) where T: ArrowPrimitiveType, T::Native: JsonSerializable, - N: NullHandler, { let primitive_arr = array.as_primitive::(); rows.iter_mut() .zip(primitive_arr.iter()) .for_each(|(row, maybe_value)| { - N::insert_value(row, col_name, maybe_value.and_then(|v| v.into_json_value())) + if let Some(j) = maybe_value.and_then(|v| v.into_json_value()) { + row.insert(col_name.to_string(), j); + } else if keep_nulls { + row.insert(col_name.to_string(), Value::Null); + } }); } -fn set_column_for_json_rows( +fn set_column_for_json_rows( rows: &mut [JsonMap], array: &ArrayRef, col_name: &str, + keep_nulls: bool, ) -> Result<(), ArrowError> { match array.data_type() { DataType::Int8 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::Int16 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::Int32 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::Int64 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::UInt8 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::UInt16 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::UInt32 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::UInt64 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::Float16 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::Float32 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::Float64 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, keep_nulls); } DataType::Null => { - N::insert_null_array(rows, col_name); + if keep_nulls { + rows.iter_mut().for_each(|row| { + row.insert(col_name.to_string(), Value::Null); + }); + } } DataType::Boolean => { - set_column_by_array_type!(as_boolean_array, col_name, rows, array, N); + set_column_by_array_type!(as_boolean_array, col_name, rows, array, keep_nulls); } DataType::Utf8 => { - set_column_by_array_type!(as_string_array, col_name, rows, array, N); + set_column_by_array_type!(as_string_array, col_name, rows, array, keep_nulls); } DataType::LargeUtf8 
=> { - set_column_by_array_type!(as_largestring_array, col_name, rows, array, N); + set_column_by_array_type!(as_largestring_array, col_name, rows, array, keep_nulls); } DataType::Date32 | DataType::Date64 @@ -384,11 +336,15 @@ fn set_column_for_json_rows( .map(|x| x.is_valid(idx)) .unwrap_or(true) .then(|| formatter.value(idx).to_string().into()); - N::insert_value(row, col_name, maybe_value); + if let Some(j) = maybe_value { + row.insert(col_name.to_string(), j); + } else if keep_nulls { + row.insert(col_name.to_string(), Value::Null); + }; }); } DataType::Struct(_) => { - let inner_objs = struct_array_to_jsonmap_array::(array.as_struct())?; + let inner_objs = struct_array_to_jsonmap_array(array.as_struct(), keep_nulls)?; rows.iter_mut().zip(inner_objs).for_each(|(row, obj)| { row.insert(col_name.to_string(), Value::Object(obj)); }); @@ -398,9 +354,13 @@ fn set_column_for_json_rows( rows.iter_mut().zip(listarr.iter()).try_for_each( |(row, maybe_value)| -> Result<(), ArrowError> { let maybe_value = maybe_value - .map(|v| array_to_json_array_internal::(&v).map(Value::Array)) + .map(|v| array_to_json_array_internal(&v, keep_nulls).map(Value::Array)) .transpose()?; - N::insert_value(row, col_name, maybe_value); + if let Some(j) = maybe_value { + row.insert(col_name.to_string(), j); + } else if keep_nulls { + row.insert(col_name.to_string(), Value::Null); + } Ok(()) }, )?; @@ -410,9 +370,13 @@ fn set_column_for_json_rows( rows.iter_mut().zip(listarr.iter()).try_for_each( |(row, maybe_value)| -> Result<(), ArrowError> { let maybe_value = maybe_value - .map(|v| array_to_json_array_internal::(&v).map(Value::Array)) + .map(|v| array_to_json_array_internal(&v, keep_nulls).map(Value::Array)) .transpose()?; - N::insert_value(row, col_name, maybe_value); + if let Some(j) = maybe_value { + row.insert(col_name.to_string(), j); + } else if keep_nulls { + row.insert(col_name.to_string(), Value::Null); + } Ok(()) }, )?; @@ -420,7 +384,7 @@ fn set_column_for_json_rows( DataType::Dictionary(_, value_type) => { let hydrated = arrow_cast::cast::cast(&array, value_type) .expect("cannot cast dictionary to underlying values"); - set_column_for_json_rows::(rows, &hydrated, col_name)?; + set_column_for_json_rows(rows, &hydrated, col_name, keep_nulls)?; } DataType::Map(_, _) => { let maparr = as_map_array(array); @@ -437,7 +401,7 @@ fn set_column_for_json_rows( } let keys = keys.as_string::(); - let values = array_to_json_array_internal::(values)?; + let values = array_to_json_array_internal(values, keep_nulls)?; let mut kv = keys.iter().zip(values); @@ -472,12 +436,13 @@ fn set_column_for_json_rows( pub fn record_batches_to_json_rows( batches: &[&RecordBatch], ) -> Result>, ArrowError> { - // For backwards compatibility, default to SkipNulls - record_batches_to_json_rows_internal::(batches) + // For backwards compatibility, default to skip nulls + record_batches_to_json_rows_internal(batches, false) } -fn record_batches_to_json_rows_internal( +fn record_batches_to_json_rows_internal( batches: &[&RecordBatch], + keep_nulls: bool, ) -> Result>, ArrowError> { let mut rows: Vec> = iter::repeat(JsonMap::new()) .take(batches.iter().map(|b| b.num_rows()).sum()) @@ -491,7 +456,7 @@ fn record_batches_to_json_rows_internal( let row_slice = &mut rows[base..base + batch.num_rows()]; for (j, col) in batch.columns().iter().enumerate() { let col_name = schema.field(j).name(); - set_column_for_json_rows::(row_slice, col, col_name)? + set_column_for_json_rows(row_slice, col, col_name, keep_nulls)? 
} base += row_count; } @@ -576,35 +541,105 @@ impl JsonFormat for JsonArray { } /// A JSON writer which serializes [`RecordBatch`]es to newline delimited JSON objects. -/// -/// Will skip writing keys with null values. -pub type LineDelimitedWriter = Writer; - -/// Similar to [`LineDelimitedWriter`] but will keep keys with null values. -pub type LineDelimitedWriterWithNulls = Writer; +pub type LineDelimitedWriter = Writer; /// A JSON writer which serializes [`RecordBatch`]es to JSON arrays. -/// -/// Will skip writing keys with null values. -pub type ArrayWriter = Writer; +pub type ArrayWriter = Writer; + +/// JSON writer builder. +#[derive(Debug, Clone, Default)] +pub struct WriterBuilder { + /// Controls whether null values should be written explicitly for keys + /// in objects, or whether the key should be omitted entirely. + keep_nulls: bool, +} -/// Similar to [`ArrayWriter`] but will keep keys with null values. -pub type ArrayWriterWithNulls = Writer; +impl WriterBuilder { + /// Create a new builder for configuring JSON writing options. + /// + /// # Example + /// + /// ``` + /// # use arrow_json::{Writer, WriterBuilder}; + /// # use arrow_json::writer::LineDelimited; + /// # use std::fs::File; + /// + /// fn example() -> Writer { + /// let file = File::create("target/out.json").unwrap(); + /// + /// // create a builder that keeps keys with null values + /// let builder = WriterBuilder::new().with_keep_nulls(true); + /// let writer = builder.build::<_, LineDelimited>(file); + /// + /// writer + /// } + /// ``` + pub fn new() -> Self { + Self::default() + } + + /// Returns `true` if this writer is configured to keep keys with null values. + pub fn keep_nulls(&self) -> bool { + self.keep_nulls + } + + /// Set whether to keep keys with null values, or to omit writing them. + /// + /// For example, with [`LineDelimited`] format: + /// + /// Skip nulls (set to `false`): + /// + /// ```json + /// {"foo":1} + /// {"foo":1,"bar":2} + /// {} + /// ``` + /// + /// Keep nulls (set to `true`): + /// + /// ```json + /// {"foo":1,"bar":null} + /// {"foo":1,"bar":2} + /// {"foo":null,"bar":null} + /// ``` + /// + /// Default is to skip nulls (set to `false`). + pub fn with_keep_nulls(mut self, keep_nulls: bool) -> Self { + self.keep_nulls = keep_nulls; + self + } + + /// Create a new `Writer` with specified `JsonFormat` and builder options. + pub fn build(self, writer: W) -> Writer + where + W: Write, + F: JsonFormat, + { + Writer { + writer, + started: false, + finished: false, + format: F::default(), + keep_nulls: self.keep_nulls, + } + } +} /// A JSON writer which serializes [`RecordBatch`]es to a stream of /// `u8` encoded JSON objects. /// /// See the module level documentation for detailed usage and examples. /// The specific format of the stream is controlled by the [`JsonFormat`] -/// type parameter. The [`NullHandler`] type parameter controls whether -/// nulls should be written explicitly for keys or skipped. Default is -/// [`SkipNulls`] for backward compatibility. +/// type parameter. +/// +/// By default the writer will skip writing keys with null values for +/// backward compatibility. See [`WriterBuilder`] on how to customize +/// this behaviour when creating a new writer. 
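To make the new builder concrete, here is a minimal sketch of writing a nullable column with nulls kept. It uses only names already present in this diff (`WriterBuilder`, `with_keep_nulls`, `LineDelimited`); the tiny one-column batch is composed for illustration, and note that the later patches in this series rename the `keep_nulls` option.

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_json::writer::LineDelimited;
use arrow_json::WriterBuilder;
use arrow_schema::{ArrowError, DataType, Field, Schema};

fn main() -> Result<(), ArrowError> {
    // A single nullable Int32 column whose second row is null.
    let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
    let batch = RecordBatch::try_new(
        Arc::new(schema),
        vec![Arc::new(Int32Array::from(vec![Some(1), None])) as ArrayRef],
    )?;

    let mut buf = Vec::new();
    let mut writer = WriterBuilder::new()
        .with_keep_nulls(true)
        .build::<_, LineDelimited>(&mut buf);
    writer.write_batches(&[&batch])?;
    writer.finish()?;

    // Keeping nulls, the second row retains its key:
    //   {"a":1}
    //   {"a":null}
    // The default writer would emit `{}` for that row instead.
    println!("{}", String::from_utf8(buf).unwrap());
    Ok(())
}
```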
#[derive(Debug)] -pub struct Writer +pub struct Writer where W: Write, F: JsonFormat, - N: NullHandler, { /// Underlying writer to use to write bytes writer: W, @@ -618,15 +653,14 @@ where /// Determines how the byte stream is formatted format: F, - /// Determines whether nulls should be written for keys or omitted - null_handler: PhantomData, + /// Whether keys with null values should be written or skipped + keep_nulls: bool, } -impl Writer +impl Writer where W: Write, F: JsonFormat, - N: NullHandler, { /// Construct a new writer pub fn new(writer: W) -> Self { @@ -635,7 +669,7 @@ where started: false, finished: false, format: F::default(), - null_handler: Default::default(), + keep_nulls: false, } } @@ -657,7 +691,7 @@ where /// Convert the `RecordBatch` into JSON rows, and write them to the output pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { - for row in record_batches_to_json_rows_internal::(&[batch])? { + for row in record_batches_to_json_rows_internal(&[batch], self.keep_nulls)? { self.write_row(&Value::Object(row))?; } Ok(()) @@ -665,7 +699,7 @@ where /// Convert the [`RecordBatch`] into JSON rows, and write them to the output pub fn write_batches(&mut self, batches: &[&RecordBatch]) -> Result<(), ArrowError> { - for row in record_batches_to_json_rows_internal::(batches)? { + for row in record_batches_to_json_rows_internal(batches, self.keep_nulls)? { self.write_row(&Value::Object(row))?; } Ok(()) @@ -688,11 +722,10 @@ where } } -impl RecordBatchWriter for Writer +impl RecordBatchWriter for Writer where W: Write, F: JsonFormat, - N: NullHandler, { fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { self.write(batch) @@ -1321,7 +1354,9 @@ mod tests { let mut writer = LineDelimitedWriter::new(&mut buf); writer.write_batches(&[&batch]).unwrap(); } else { - let mut writer = LineDelimitedWriterWithNulls::new(&mut buf); + let mut writer = WriterBuilder::new() + .with_keep_nulls(true) + .build::<_, LineDelimited>(&mut buf); writer.write_batches(&[&batch]).unwrap(); } } @@ -1781,7 +1816,9 @@ mod tests { let mut buf = Vec::new(); { - let mut writer = ArrayWriterWithNulls::new(&mut buf); + let mut writer = WriterBuilder::new() + .with_keep_nulls(true) + .build::<_, JsonArray>(&mut buf); writer.write_batches(&[&batch])?; writer.finish()?; } From d598e3981370f9f77e09e9bf533d050efec6c4d4 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Mon, 20 Nov 2023 22:51:44 +1100 Subject: [PATCH 4/5] Rename option --- arrow-json/src/writer.rs | 121 ++++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 52 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 02585fda4725..7baa879db819 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -128,7 +128,7 @@ where fn struct_array_to_jsonmap_array( array: &StructArray, - keep_nulls: bool, + keep_null_keys: bool, ) -> Result>, ArrowError> { let inner_col_names = array.column_names(); @@ -137,7 +137,12 @@ fn struct_array_to_jsonmap_array( .collect::>>(); for (j, struct_col) in array.columns().iter().enumerate() { - set_column_for_json_rows(&mut inner_objs, struct_col, inner_col_names[j], keep_nulls)? + set_column_for_json_rows( + &mut inner_objs, + struct_col, + inner_col_names[j], + keep_null_keys, + )? 
} Ok(inner_objs) } @@ -150,7 +155,7 @@ pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> fn array_to_json_array_internal( array: &dyn Array, - keep_nulls: bool, + keep_null_keys: bool, ) -> Result, ArrowError> { match array.data_type() { DataType::Null => Ok(iter::repeat(Value::Null).take(array.len()).collect()), @@ -193,32 +198,44 @@ fn array_to_json_array_internal( DataType::List(_) => as_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array_internal(&v, keep_nulls)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal( + &v, + keep_null_keys, + )?)), None => Ok(Value::Null), }) .collect(), DataType::LargeList(_) => as_large_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array_internal(&v, keep_nulls)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal( + &v, + keep_null_keys, + )?)), None => Ok(Value::Null), }) .collect(), DataType::FixedSizeList(_, _) => as_fixed_size_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array_internal(&v, keep_nulls)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal( + &v, + keep_null_keys, + )?)), None => Ok(Value::Null), }) .collect(), DataType::Struct(_) => { - let jsonmaps = struct_array_to_jsonmap_array(array.as_struct(), keep_nulls)?; + let jsonmaps = struct_array_to_jsonmap_array(array.as_struct(), keep_null_keys)?; Ok(jsonmaps.into_iter().map(Value::Object).collect()) } DataType::Map(_, _) => as_map_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array_internal(&v, keep_nulls)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal( + &v, + keep_null_keys, + )?)), None => Ok(Value::Null), }) .collect(), @@ -229,7 +246,7 @@ fn array_to_json_array_internal( } macro_rules! set_column_by_array_type { - ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident, $keep_nulls:ident) => { + ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident, $keep_null_keys:ident) => { let arr = $cast_fn($array); $rows .iter_mut() @@ -237,7 +254,7 @@ macro_rules! 
set_column_by_array_type { .for_each(|(row, maybe_value)| { if let Some(j) = maybe_value.map(Into::into) { row.insert($col_name.to_string(), j); - } else if $keep_nulls { + } else if $keep_null_keys { row.insert($col_name.to_string(), Value::Null); } }); @@ -248,7 +265,7 @@ fn set_column_by_primitive_type( rows: &mut [JsonMap], array: &ArrayRef, col_name: &str, - keep_nulls: bool, + keep_null_keys: bool, ) where T: ArrowPrimitiveType, T::Native: JsonSerializable, @@ -260,7 +277,7 @@ fn set_column_by_primitive_type( .for_each(|(row, maybe_value)| { if let Some(j) = maybe_value.and_then(|v| v.into_json_value()) { row.insert(col_name.to_string(), j); - } else if keep_nulls { + } else if keep_null_keys { row.insert(col_name.to_string(), Value::Null); } }); @@ -270,57 +287,57 @@ fn set_column_for_json_rows( rows: &mut [JsonMap], array: &ArrayRef, col_name: &str, - keep_nulls: bool, + keep_null_keys: bool, ) -> Result<(), ArrowError> { match array.data_type() { DataType::Int8 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::Int16 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::Int32 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::Int64 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::UInt8 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::UInt16 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::UInt32 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::UInt64 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::Float16 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::Float32 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::Float64 => { - set_column_by_primitive_type::(rows, array, col_name, keep_nulls); + set_column_by_primitive_type::(rows, array, col_name, keep_null_keys); } DataType::Null => { - if keep_nulls { + if keep_null_keys { rows.iter_mut().for_each(|row| { row.insert(col_name.to_string(), Value::Null); }); } } DataType::Boolean => { - set_column_by_array_type!(as_boolean_array, col_name, rows, array, keep_nulls); + set_column_by_array_type!(as_boolean_array, col_name, rows, array, keep_null_keys); } DataType::Utf8 => { - set_column_by_array_type!(as_string_array, col_name, rows, array, keep_nulls); + set_column_by_array_type!(as_string_array, col_name, rows, array, keep_null_keys); } DataType::LargeUtf8 => { - set_column_by_array_type!(as_largestring_array, col_name, rows, array, keep_nulls); + set_column_by_array_type!(as_largestring_array, col_name, rows, array, keep_null_keys); } DataType::Date32 | 
DataType::Date64 @@ -338,13 +355,13 @@ fn set_column_for_json_rows( .then(|| formatter.value(idx).to_string().into()); if let Some(j) = maybe_value { row.insert(col_name.to_string(), j); - } else if keep_nulls { + } else if keep_null_keys { row.insert(col_name.to_string(), Value::Null); }; }); } DataType::Struct(_) => { - let inner_objs = struct_array_to_jsonmap_array(array.as_struct(), keep_nulls)?; + let inner_objs = struct_array_to_jsonmap_array(array.as_struct(), keep_null_keys)?; rows.iter_mut().zip(inner_objs).for_each(|(row, obj)| { row.insert(col_name.to_string(), Value::Object(obj)); }); @@ -354,11 +371,11 @@ fn set_column_for_json_rows( rows.iter_mut().zip(listarr.iter()).try_for_each( |(row, maybe_value)| -> Result<(), ArrowError> { let maybe_value = maybe_value - .map(|v| array_to_json_array_internal(&v, keep_nulls).map(Value::Array)) + .map(|v| array_to_json_array_internal(&v, keep_null_keys).map(Value::Array)) .transpose()?; if let Some(j) = maybe_value { row.insert(col_name.to_string(), j); - } else if keep_nulls { + } else if keep_null_keys { row.insert(col_name.to_string(), Value::Null); } Ok(()) @@ -370,11 +387,11 @@ fn set_column_for_json_rows( rows.iter_mut().zip(listarr.iter()).try_for_each( |(row, maybe_value)| -> Result<(), ArrowError> { let maybe_value = maybe_value - .map(|v| array_to_json_array_internal(&v, keep_nulls).map(Value::Array)) + .map(|v| array_to_json_array_internal(&v, keep_null_keys).map(Value::Array)) .transpose()?; if let Some(j) = maybe_value { row.insert(col_name.to_string(), j); - } else if keep_nulls { + } else if keep_null_keys { row.insert(col_name.to_string(), Value::Null); } Ok(()) @@ -384,7 +401,7 @@ fn set_column_for_json_rows( DataType::Dictionary(_, value_type) => { let hydrated = arrow_cast::cast::cast(&array, value_type) .expect("cannot cast dictionary to underlying values"); - set_column_for_json_rows(rows, &hydrated, col_name, keep_nulls)?; + set_column_for_json_rows(rows, &hydrated, col_name, keep_null_keys)?; } DataType::Map(_, _) => { let maparr = as_map_array(array); @@ -401,7 +418,7 @@ fn set_column_for_json_rows( } let keys = keys.as_string::(); - let values = array_to_json_array_internal(values, keep_nulls)?; + let values = array_to_json_array_internal(values, keep_null_keys)?; let mut kv = keys.iter().zip(values); @@ -442,7 +459,7 @@ pub fn record_batches_to_json_rows( fn record_batches_to_json_rows_internal( batches: &[&RecordBatch], - keep_nulls: bool, + keep_null_keys: bool, ) -> Result>, ArrowError> { let mut rows: Vec> = iter::repeat(JsonMap::new()) .take(batches.iter().map(|b| b.num_rows()).sum()) @@ -456,7 +473,7 @@ fn record_batches_to_json_rows_internal( let row_slice = &mut rows[base..base + batch.num_rows()]; for (j, col) in batch.columns().iter().enumerate() { let col_name = schema.field(j).name(); - set_column_for_json_rows(row_slice, col, col_name, keep_nulls)? + set_column_for_json_rows(row_slice, col, col_name, keep_null_keys)? } base += row_count; } @@ -551,7 +568,7 @@ pub type ArrayWriter = Writer; pub struct WriterBuilder { /// Controls whether null values should be written explicitly for keys /// in objects, or whether the key should be omitted entirely. 
- keep_nulls: bool, + keep_null_keys: bool, } impl WriterBuilder { @@ -568,7 +585,7 @@ impl WriterBuilder { /// let file = File::create("target/out.json").unwrap(); /// /// // create a builder that keeps keys with null values - /// let builder = WriterBuilder::new().with_keep_nulls(true); + /// let builder = WriterBuilder::new().with_keep_null_keys(true); /// let writer = builder.build::<_, LineDelimited>(file); /// /// writer @@ -579,8 +596,8 @@ impl WriterBuilder { } /// Returns `true` if this writer is configured to keep keys with null values. - pub fn keep_nulls(&self) -> bool { - self.keep_nulls + pub fn keep_null_keys(&self) -> bool { + self.keep_null_keys } /// Set whether to keep keys with null values, or to omit writing them. @@ -604,8 +621,8 @@ impl WriterBuilder { /// ``` /// /// Default is to skip nulls (set to `false`). - pub fn with_keep_nulls(mut self, keep_nulls: bool) -> Self { - self.keep_nulls = keep_nulls; + pub fn with_keep_null_keys(mut self, keep_null_keys: bool) -> Self { + self.keep_null_keys = keep_null_keys; self } @@ -620,7 +637,7 @@ impl WriterBuilder { started: false, finished: false, format: F::default(), - keep_nulls: self.keep_nulls, + keep_null_keys: self.keep_null_keys, } } } @@ -654,7 +671,7 @@ where format: F, /// Whether keys with null values should be written or skipped - keep_nulls: bool, + keep_null_keys: bool, } impl Writer @@ -669,7 +686,7 @@ where started: false, finished: false, format: F::default(), - keep_nulls: false, + keep_null_keys: false, } } @@ -691,7 +708,7 @@ where /// Convert the `RecordBatch` into JSON rows, and write them to the output pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { - for row in record_batches_to_json_rows_internal(&[batch], self.keep_nulls)? { + for row in record_batches_to_json_rows_internal(&[batch], self.keep_null_keys)? { self.write_row(&Value::Object(row))?; } Ok(()) @@ -699,7 +716,7 @@ where /// Convert the [`RecordBatch`] into JSON rows, and write them to the output pub fn write_batches(&mut self, batches: &[&RecordBatch]) -> Result<(), ArrowError> { - for row in record_batches_to_json_rows_internal(batches, self.keep_nulls)? { + for row in record_batches_to_json_rows_internal(batches, self.keep_null_keys)? 
{ self.write_row(&Value::Object(row))?; } Ok(()) @@ -1355,7 +1372,7 @@ mod tests { writer.write_batches(&[&batch]).unwrap(); } else { let mut writer = WriterBuilder::new() - .with_keep_nulls(true) + .with_keep_null_keys(true) .build::<_, LineDelimited>(&mut buf); writer.write_batches(&[&batch]).unwrap(); } @@ -1681,7 +1698,7 @@ mod tests { } #[test] - fn test_writer_keep_nulls() -> Result<(), ArrowError> { + fn test_writer_keep_null_keys() -> Result<(), ArrowError> { fn nested_list() -> (Arc, Arc) { let array = Arc::new(ListArray::from_iter_primitive::(vec![ Some(vec![None, None, None]), @@ -1817,7 +1834,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = WriterBuilder::new() - .with_keep_nulls(true) + .with_keep_null_keys(true) .build::<_, JsonArray>(&mut buf); writer.write_batches(&[&batch])?; writer.finish()?; From c61fb1dcfd4c374b71b740fd82df7b8003c9d092 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 21 Nov 2023 07:51:59 +1100 Subject: [PATCH 5/5] Rename option --- arrow-json/src/writer.rs | 104 +++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 7baa879db819..4f74817ca1e3 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -128,7 +128,7 @@ where fn struct_array_to_jsonmap_array( array: &StructArray, - keep_null_keys: bool, + explicit_nulls: bool, ) -> Result>, ArrowError> { let inner_col_names = array.column_names(); @@ -141,7 +141,7 @@ fn struct_array_to_jsonmap_array( &mut inner_objs, struct_col, inner_col_names[j], - keep_null_keys, + explicit_nulls, )? } Ok(inner_objs) @@ -155,7 +155,7 @@ pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> fn array_to_json_array_internal( array: &dyn Array, - keep_null_keys: bool, + explicit_nulls: bool, ) -> Result, ArrowError> { match array.data_type() { DataType::Null => Ok(iter::repeat(Value::Null).take(array.len()).collect()), @@ -200,7 +200,7 @@ fn array_to_json_array_internal( .map(|maybe_value| match maybe_value { Some(v) => Ok(Value::Array(array_to_json_array_internal( &v, - keep_null_keys, + explicit_nulls, )?)), None => Ok(Value::Null), }) @@ -210,7 +210,7 @@ fn array_to_json_array_internal( .map(|maybe_value| match maybe_value { Some(v) => Ok(Value::Array(array_to_json_array_internal( &v, - keep_null_keys, + explicit_nulls, )?)), None => Ok(Value::Null), }) @@ -220,13 +220,13 @@ fn array_to_json_array_internal( .map(|maybe_value| match maybe_value { Some(v) => Ok(Value::Array(array_to_json_array_internal( &v, - keep_null_keys, + explicit_nulls, )?)), None => Ok(Value::Null), }) .collect(), DataType::Struct(_) => { - let jsonmaps = struct_array_to_jsonmap_array(array.as_struct(), keep_null_keys)?; + let jsonmaps = struct_array_to_jsonmap_array(array.as_struct(), explicit_nulls)?; Ok(jsonmaps.into_iter().map(Value::Object).collect()) } DataType::Map(_, _) => as_map_array(array) @@ -234,7 +234,7 @@ fn array_to_json_array_internal( .map(|maybe_value| match maybe_value { Some(v) => Ok(Value::Array(array_to_json_array_internal( &v, - keep_null_keys, + explicit_nulls, )?)), None => Ok(Value::Null), }) @@ -246,7 +246,7 @@ fn array_to_json_array_internal( } macro_rules! 
 macro_rules! set_column_by_array_type {
-    ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident, $keep_null_keys:ident) => {
+    ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident, $explicit_nulls:ident) => {
         let arr = $cast_fn($array);
         $rows
             .iter_mut()
@@ -254,7 +254,7 @@ macro_rules! set_column_by_array_type {
             .for_each(|(row, maybe_value)| {
                 if let Some(j) = maybe_value.map(Into::into) {
                     row.insert($col_name.to_string(), j);
-                } else if $keep_null_keys {
+                } else if $explicit_nulls {
                     row.insert($col_name.to_string(), Value::Null);
                 }
             });
@@ -265,7 +265,7 @@ fn set_column_by_primitive_type<T>(
     rows: &mut [JsonMap<String, Value>],
     array: &ArrayRef,
     col_name: &str,
-    keep_null_keys: bool,
+    explicit_nulls: bool,
 ) where
     T: ArrowPrimitiveType,
     T::Native: JsonSerializable,
@@ -277,7 +277,7 @@ fn set_column_by_primitive_type<T>(
         .for_each(|(row, maybe_value)| {
             if let Some(j) = maybe_value.and_then(|v| v.into_json_value()) {
                 row.insert(col_name.to_string(), j);
-            } else if keep_null_keys {
+            } else if explicit_nulls {
                 row.insert(col_name.to_string(), Value::Null);
             }
         });
@@ -287,57 +287,57 @@ fn set_column_for_json_rows(
     rows: &mut [JsonMap<String, Value>],
     array: &ArrayRef,
     col_name: &str,
-    keep_null_keys: bool,
+    explicit_nulls: bool,
 ) -> Result<(), ArrowError> {
     match array.data_type() {
         DataType::Int8 => {
-            set_column_by_primitive_type::<Int8Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<Int8Type>(rows, array, col_name, explicit_nulls);
         }
         DataType::Int16 => {
-            set_column_by_primitive_type::<Int16Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<Int16Type>(rows, array, col_name, explicit_nulls);
         }
         DataType::Int32 => {
-            set_column_by_primitive_type::<Int32Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<Int32Type>(rows, array, col_name, explicit_nulls);
         }
         DataType::Int64 => {
-            set_column_by_primitive_type::<Int64Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<Int64Type>(rows, array, col_name, explicit_nulls);
         }
         DataType::UInt8 => {
-            set_column_by_primitive_type::<UInt8Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<UInt8Type>(rows, array, col_name, explicit_nulls);
         }
         DataType::UInt16 => {
-            set_column_by_primitive_type::<UInt16Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<UInt16Type>(rows, array, col_name, explicit_nulls);
         }
         DataType::UInt32 => {
-            set_column_by_primitive_type::<UInt32Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<UInt32Type>(rows, array, col_name, explicit_nulls);
        }
         DataType::UInt64 => {
-            set_column_by_primitive_type::<UInt64Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<UInt64Type>(rows, array, col_name, explicit_nulls);
         }
         DataType::Float16 => {
-            set_column_by_primitive_type::<Float16Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<Float16Type>(rows, array, col_name, explicit_nulls);
         }
         DataType::Float32 => {
-            set_column_by_primitive_type::<Float32Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<Float32Type>(rows, array, col_name, explicit_nulls);
         }
         DataType::Float64 => {
-            set_column_by_primitive_type::<Float64Type>(rows, array, col_name, keep_null_keys);
+            set_column_by_primitive_type::<Float64Type>(rows, array, col_name, explicit_nulls);
         }
         DataType::Null => {
-            if keep_null_keys {
+            if explicit_nulls {
                 rows.iter_mut().for_each(|row| {
                     row.insert(col_name.to_string(), Value::Null);
                 });
             }
         }
         DataType::Boolean => {
-            set_column_by_array_type!(as_boolean_array, col_name, rows, array, keep_null_keys);
+            set_column_by_array_type!(as_boolean_array, col_name, rows, array, explicit_nulls);
         }
         DataType::Utf8 => {
-            set_column_by_array_type!(as_string_array, col_name, rows, array, keep_null_keys);
+            set_column_by_array_type!(as_string_array, col_name, rows, array, explicit_nulls);
         }
         DataType::LargeUtf8 => {
-            set_column_by_array_type!(as_largestring_array, col_name, rows, array, keep_null_keys);
+            set_column_by_array_type!(as_largestring_array, col_name, rows, array, explicit_nulls);
         }
         DataType::Date32
         | DataType::Date64
@@ -355,13 +355,13 @@
                     .then(|| formatter.value(idx).to_string().into());
                 if let Some(j) = maybe_value {
                     row.insert(col_name.to_string(), j);
-                } else if keep_null_keys {
+                } else if explicit_nulls {
                     row.insert(col_name.to_string(), Value::Null);
                 };
             });
         }
         DataType::Struct(_) => {
-            let inner_objs = struct_array_to_jsonmap_array(array.as_struct(), keep_null_keys)?;
+            let inner_objs = struct_array_to_jsonmap_array(array.as_struct(), explicit_nulls)?;
             rows.iter_mut().zip(inner_objs).for_each(|(row, obj)| {
                 row.insert(col_name.to_string(), Value::Object(obj));
             });
@@ -371,11 +371,11 @@
             rows.iter_mut().zip(listarr.iter()).try_for_each(
                 |(row, maybe_value)| -> Result<(), ArrowError> {
                     let maybe_value = maybe_value
-                        .map(|v| array_to_json_array_internal(&v, keep_null_keys).map(Value::Array))
+                        .map(|v| array_to_json_array_internal(&v, explicit_nulls).map(Value::Array))
                         .transpose()?;
                     if let Some(j) = maybe_value {
                         row.insert(col_name.to_string(), j);
-                    } else if keep_null_keys {
+                    } else if explicit_nulls {
                         row.insert(col_name.to_string(), Value::Null);
                     }
                     Ok(())
@@ -387,11 +387,11 @@
             rows.iter_mut().zip(listarr.iter()).try_for_each(
                 |(row, maybe_value)| -> Result<(), ArrowError> {
                     let maybe_value = maybe_value
-                        .map(|v| array_to_json_array_internal(&v, keep_null_keys).map(Value::Array))
+                        .map(|v| array_to_json_array_internal(&v, explicit_nulls).map(Value::Array))
                         .transpose()?;
                     if let Some(j) = maybe_value {
                         row.insert(col_name.to_string(), j);
-                    } else if keep_null_keys {
+                    } else if explicit_nulls {
                         row.insert(col_name.to_string(), Value::Null);
                     }
                     Ok(())
@@ -401,7 +401,7 @@
         DataType::Dictionary(_, value_type) => {
             let hydrated = arrow_cast::cast::cast(&array, value_type)
                 .expect("cannot cast dictionary to underlying values");
-            set_column_for_json_rows(rows, &hydrated, col_name, keep_null_keys)?;
+            set_column_for_json_rows(rows, &hydrated, col_name, explicit_nulls)?;
         }
         DataType::Map(_, _) => {
             let maparr = as_map_array(array);
@@ -418,7 +418,7 @@
             }
 
             let keys = keys.as_string::<i32>();
-            let values = array_to_json_array_internal(values, keep_null_keys)?;
+            let values = array_to_json_array_internal(values, explicit_nulls)?;
 
             let mut kv = keys.iter().zip(values);
 
@@ -459,7 +459,7 @@
 fn record_batches_to_json_rows_internal(
     batches: &[&RecordBatch],
-    keep_null_keys: bool,
+    explicit_nulls: bool,
 ) -> Result<Vec<JsonMap<String, Value>>, ArrowError> {
     let mut rows: Vec<JsonMap<String, Value>> = iter::repeat(JsonMap::new())
         .take(batches.iter().map(|b| b.num_rows()).sum())
@@ -473,7 +473,7 @@
         let row_slice = &mut rows[base..base + batch.num_rows()];
         for (j, col) in batch.columns().iter().enumerate() {
             let col_name = schema.field(j).name();
-            set_column_for_json_rows(row_slice, col, col_name, keep_null_keys)?
+            set_column_for_json_rows(row_slice, col, col_name, explicit_nulls)?
         }
         base += row_count;
     }
@@ -568,7 +568,7 @@ pub type ArrayWriter<W> = Writer<W, JsonArray>;
 pub struct WriterBuilder {
     /// Controls whether null values should be written explicitly for keys
     /// in objects, or whether the key should be omitted entirely.
-    keep_null_keys: bool,
+    explicit_nulls: bool,
 }
 
 impl WriterBuilder {
@@ -585,7 +585,7 @@ impl WriterBuilder {
     /// let file = File::create("target/out.json").unwrap();
     ///
     /// // create a builder that keeps keys with null values
-    /// let builder = WriterBuilder::new().with_keep_null_keys(true);
+    /// let builder = WriterBuilder::new().with_explicit_nulls(true);
     /// let writer = builder.build::<_, LineDelimited>(file);
     ///
     /// writer
@@ -596,8 +596,8 @@ impl WriterBuilder {
     }
 
     /// Returns `true` if this writer is configured to keep keys with null values.
-    pub fn keep_null_keys(&self) -> bool {
-        self.keep_null_keys
+    pub fn explicit_nulls(&self) -> bool {
+        self.explicit_nulls
     }
 
     /// Set whether to keep keys with null values, or to omit writing them.
@@ -621,8 +621,8 @@ impl WriterBuilder {
     /// ```
     ///
     /// Default is to skip nulls (set to `false`).
-    pub fn with_keep_null_keys(mut self, keep_null_keys: bool) -> Self {
-        self.keep_null_keys = keep_null_keys;
+    pub fn with_explicit_nulls(mut self, explicit_nulls: bool) -> Self {
+        self.explicit_nulls = explicit_nulls;
         self
     }
 
@@ -637,7 +637,7 @@
             started: false,
             finished: false,
             format: F::default(),
-            keep_null_keys: self.keep_null_keys,
+            explicit_nulls: self.explicit_nulls,
         }
     }
 }
@@ -671,7 +671,7 @@ where
     format: F,
 
     /// Whether keys with null values should be written or skipped
-    keep_null_keys: bool,
+    explicit_nulls: bool,
 }
 
 impl<W, F> Writer<W, F>
@@ -686,7 +686,7 @@ where
             started: false,
             finished: false,
             format: F::default(),
-            keep_null_keys: false,
+            explicit_nulls: false,
         }
     }
@@ -708,7 +708,7 @@ where
 
     /// Convert the `RecordBatch` into JSON rows, and write them to the output
     pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> {
-        for row in record_batches_to_json_rows_internal(&[batch], self.keep_null_keys)? {
+        for row in record_batches_to_json_rows_internal(&[batch], self.explicit_nulls)? {
             self.write_row(&Value::Object(row))?;
         }
         Ok(())
@@ -716,7 +716,7 @@ where
 
     /// Convert the [`RecordBatch`] into JSON rows, and write them to the output
     pub fn write_batches(&mut self, batches: &[&RecordBatch]) -> Result<(), ArrowError> {
-        for row in record_batches_to_json_rows_internal(batches, self.keep_null_keys)? {
+        for row in record_batches_to_json_rows_internal(batches, self.explicit_nulls)? {
             self.write_row(&Value::Object(row))?;
         }
         Ok(())
@@ -1372,7 +1372,7 @@ mod tests {
             writer.write_batches(&[&batch]).unwrap();
         } else {
             let mut writer = WriterBuilder::new()
-                .with_keep_null_keys(true)
+                .with_explicit_nulls(true)
                 .build::<_, LineDelimited>(&mut buf);
             writer.write_batches(&[&batch]).unwrap();
         }
@@ -1698,7 +1698,7 @@ mod tests {
     }
 
     #[test]
-    fn test_writer_keep_null_keys() -> Result<(), ArrowError> {
+    fn test_writer_explicit_nulls() -> Result<(), ArrowError> {
         fn nested_list() -> (Arc<ListArray>, Arc<Field>) {
             let array = Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
                 Some(vec![None, None, None]),
@@ -1834,7 +1834,7 @@ mod tests {
         let mut buf = Vec::new();
         {
             let mut writer = WriterBuilder::new()
-                .with_keep_null_keys(true)
+                .with_explicit_nulls(true)
                 .build::<_, JsonArray>(&mut buf);
             writer.write_batches(&[&batch])?;
            writer.finish()?;
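
For reference, below is a minimal end-to-end sketch of the API this patch series settles on. It is illustrative only: the schema, column name, and values are made up, and it assumes `WriterBuilder` and `LineDelimited` are reachable via the public `arrow_json::writer` module, as in the doc example above.

use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_json::writer::{LineDelimited, WriterBuilder};
use arrow_schema::{ArrowError, DataType, Field, Schema};

fn main() -> Result<(), ArrowError> {
    // One nullable Int32 column with a null in the middle row.
    let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
    let a = Int32Array::from(vec![Some(1), None, Some(3)]);
    let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a) as ArrayRef])?;

    let mut buf = Vec::new();
    {
        // Opt in to writing null-valued keys explicitly; the default
        // (false) omits the key for that row instead.
        let mut writer = WriterBuilder::new()
            .with_explicit_nulls(true)
            .build::<_, LineDelimited>(&mut buf);
        writer.write_batches(&[&batch])?;
        writer.finish()?;
    }

    // Prints:
    //   {"a":1}
    //   {"a":null}
    //   {"a":3}
    // whereas the default configuration would print {} for the middle row.
    println!("{}", String::from_utf8(buf).unwrap());
    Ok(())
}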