From 65f7be856099d389b0d0eafa9be47fad25215ee6 Mon Sep 17 00:00:00 2001 From: Alex Sayers Date: Wed, 1 Nov 2023 05:12:08 +0900 Subject: [PATCH] Return row count when inferring schema from JSON (#5008) * Return row count when inferring schema from JSON * Add some unit tests for arrow-json's row-count --- arrow-json/src/reader/mod.rs | 6 +++--- arrow-json/src/reader/schema.rs | 33 ++++++++++++++++++++++++++------- arrow-json/src/writer.rs | 4 ++-- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 1225e51b3af7..28282c4d1541 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -1562,7 +1562,7 @@ mod tests { let file = File::open(path).unwrap(); let mut reader = BufReader::new(file); let schema = schema.unwrap_or_else(|| { - let schema = infer_json_schema(&mut reader, None).unwrap(); + let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); reader.rewind().unwrap(); schema }); @@ -1939,7 +1939,7 @@ mod tests { fn test_with_multiple_batches() { let file = File::open("test/data/basic_nulls.json").unwrap(); let mut reader = BufReader::new(file); - let schema = infer_json_schema(&mut reader, None).unwrap(); + let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); reader.rewind().unwrap(); let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5); @@ -2079,7 +2079,7 @@ mod tests { fn test_json_iterator() { let file = File::open("test/data/basic.json").unwrap(); let mut reader = BufReader::new(file); - let schema = infer_json_schema(&mut reader, None).unwrap(); + let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); reader.rewind().unwrap(); let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5); diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index 58aa08014daa..97f1a0f29594 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -209,6 +209,8 @@ impl Iterator for ValueIter { /// /// If `max_read_records` is not set, the whole file is read to infer its field types. /// +/// Returns inferred schema and number of records read. +/// /// Contrary to [`infer_json_schema`], this function will seek back to the start of the `reader`. /// That way, the `reader` can be used immediately afterwards to create a [`Reader`]. /// @@ -229,7 +231,7 @@ impl Iterator for ValueIter { pub fn infer_json_schema_from_seekable( mut reader: R, max_read_records: Option, -) -> Result { +) -> Result<(Schema, usize), ArrowError> { let schema = infer_json_schema(&mut reader, max_read_records); // return the reader seek back to the start reader.rewind()?; @@ -242,6 +244,8 @@ pub fn infer_json_schema_from_seekable( /// /// If `max_read_records` is not set, the whole file is read to infer its field types. /// +/// Returns inferred schema and number of records read. +/// /// This function will not seek back to the start of the `reader`. The user has to manage the /// original file's cursor. This function is useful when the `reader`'s cursor is not available /// (does not implement [`Seek`]), such is the case for compressed streams decoders. @@ -266,8 +270,10 @@ pub fn infer_json_schema_from_seekable( pub fn infer_json_schema( reader: R, max_read_records: Option, -) -> Result { - infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records)) +) -> Result<(Schema, usize), ArrowError> { + let mut values = ValueIter::new(reader, max_read_records); + let schema = infer_json_schema_from_iterator(&mut values)?; + Ok((schema, values.record_count)) } fn set_object_scalar_field_type( @@ -522,15 +528,28 @@ mod tests { ]); let mut reader = BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); - let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); + let (inferred_schema, n_rows) = infer_json_schema_from_seekable(&mut reader, None).unwrap(); assert_eq!(inferred_schema, schema); + assert_eq!(n_rows, 4); let file = File::open("test/data/mixed_arrays.json.gz").unwrap(); let mut reader = BufReader::new(GzDecoder::new(&file)); - let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); + let (inferred_schema, n_rows) = infer_json_schema(&mut reader, None).unwrap(); assert_eq!(inferred_schema, schema); + assert_eq!(n_rows, 4); + } + + #[test] + fn test_row_limit() { + let mut reader = BufReader::new(File::open("test/data/basic.json").unwrap()); + + let (_, n_rows) = infer_json_schema_from_seekable(&mut reader, None).unwrap(); + assert_eq!(n_rows, 12); + + let (_, n_rows) = infer_json_schema_from_seekable(&mut reader, Some(5)).unwrap(); + assert_eq!(n_rows, 5); } #[test] @@ -640,7 +659,7 @@ mod tests { bigger_than_i64_max, smaller_than_i64_min ); let mut buf_reader = BufReader::new(json.as_bytes()); - let inferred_schema = infer_json_schema(&mut buf_reader, Some(1)).unwrap(); + let (inferred_schema, _) = infer_json_schema(&mut buf_reader, Some(1)).unwrap(); let fields = inferred_schema.fields(); let (_, big_field) = fields.find("bigger_than_i64_max").unwrap(); @@ -686,7 +705,7 @@ mod tests { {"in":null, "ni":2, "ns":"3", "sn":null, "n":null, "an":null, "na": [], "nas":["8"]} {"in":1, "ni":null, "ns":null, "sn":"4", "n":null, "an":[], "na": null, "nas":[]} "#; - let inferred_schema = + let (inferred_schema, _) = infer_json_schema_from_seekable(Cursor::new(data), None).expect("infer"); let schema = Schema::new(vec![ Field::new("an", list_type_of(DataType::Null), true), diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 97a8b38d4192..5ecfc932364b 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -1206,7 +1206,7 @@ mod tests { fn test_write_for_file(test_file: &str) { let file = File::open(test_file).unwrap(); let mut reader = BufReader::new(file); - let schema = infer_json_schema(&mut reader, None).unwrap(); + let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); reader.rewind().unwrap(); let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024); @@ -1391,7 +1391,7 @@ mod tests { let test_file = "test/data/basic.json"; let file = File::open(test_file).unwrap(); let mut reader = BufReader::new(file); - let schema = infer_json_schema(&mut reader, None).unwrap(); + let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); reader.rewind().unwrap(); let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024);