Skip to content

Commit

Permalink
Return row count when inferring schema from JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
asayers committed Oct 30, 2023
1 parent cc23cac commit 03365ba
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 12 deletions.
6 changes: 3 additions & 3 deletions arrow-json/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1562,7 +1562,7 @@ mod tests {
let file = File::open(path).unwrap();
let mut reader = BufReader::new(file);
let schema = schema.unwrap_or_else(|| {
let schema = infer_json_schema(&mut reader, None).unwrap();
let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
reader.rewind().unwrap();
schema
});
Expand Down Expand Up @@ -1939,7 +1939,7 @@ mod tests {
fn test_with_multiple_batches() {
let file = File::open("test/data/basic_nulls.json").unwrap();
let mut reader = BufReader::new(file);
let schema = infer_json_schema(&mut reader, None).unwrap();
let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
reader.rewind().unwrap();

let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5);
Expand Down Expand Up @@ -2079,7 +2079,7 @@ mod tests {
fn test_json_iterator() {
let file = File::open("test/data/basic.json").unwrap();
let mut reader = BufReader::new(file);
let schema = infer_json_schema(&mut reader, None).unwrap();
let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
reader.rewind().unwrap();

let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5);
Expand Down
20 changes: 13 additions & 7 deletions arrow-json/src/reader/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,8 @@ impl<R: BufRead> Iterator for ValueIter<R> {
///
/// If `max_read_records` is not set, the whole file is read to infer its field types.
///
/// Returns inferred schema and number of records read.
///
/// Contrary to [`infer_json_schema`], this function will seek back to the start of the `reader`.
/// That way, the `reader` can be used immediately afterwards to create a [`Reader`].
///
Expand All @@ -229,7 +231,7 @@ impl<R: BufRead> Iterator for ValueIter<R> {
pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
mut reader: R,
max_read_records: Option<usize>,
) -> Result<Schema, ArrowError> {
) -> Result<(Schema, usize), ArrowError> {
let schema = infer_json_schema(&mut reader, max_read_records);
// return the reader seek back to the start
reader.rewind()?;
Expand All @@ -242,6 +244,8 @@ pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
///
/// If `max_read_records` is not set, the whole file is read to infer its field types.
///
/// Returns inferred schema and number of records read.
///
/// This function will not seek back to the start of the `reader`. The user has to manage the
/// original file's cursor. This function is useful when the `reader`'s cursor is not available
/// (does not implement [`Seek`]), such is the case for compressed streams decoders.
Expand All @@ -266,8 +270,10 @@ pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
pub fn infer_json_schema<R: BufRead>(
reader: R,
max_read_records: Option<usize>,
) -> Result<Schema, ArrowError> {
infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records))
) -> Result<(Schema, usize), ArrowError> {
let mut values = ValueIter::new(reader, max_read_records);
let schema = infer_json_schema_from_iterator(&mut values)?;
Ok((schema, values.record_count))
}

fn set_object_scalar_field_type(
Expand Down Expand Up @@ -522,13 +528,13 @@ mod tests {
]);

let mut reader = BufReader::new(File::open("test/data/mixed_arrays.json").unwrap());
let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap();
let (inferred_schema, _) = infer_json_schema_from_seekable(&mut reader, None).unwrap();

assert_eq!(inferred_schema, schema);

let file = File::open("test/data/mixed_arrays.json.gz").unwrap();
let mut reader = BufReader::new(GzDecoder::new(&file));
let inferred_schema = infer_json_schema(&mut reader, None).unwrap();
let (inferred_schema, _) = infer_json_schema(&mut reader, None).unwrap();

assert_eq!(inferred_schema, schema);
}
Expand Down Expand Up @@ -640,7 +646,7 @@ mod tests {
bigger_than_i64_max, smaller_than_i64_min
);
let mut buf_reader = BufReader::new(json.as_bytes());
let inferred_schema = infer_json_schema(&mut buf_reader, Some(1)).unwrap();
let (inferred_schema, _) = infer_json_schema(&mut buf_reader, Some(1)).unwrap();
let fields = inferred_schema.fields();

let (_, big_field) = fields.find("bigger_than_i64_max").unwrap();
Expand Down Expand Up @@ -686,7 +692,7 @@ mod tests {
{"in":null, "ni":2, "ns":"3", "sn":null, "n":null, "an":null, "na": [], "nas":["8"]}
{"in":1, "ni":null, "ns":null, "sn":"4", "n":null, "an":[], "na": null, "nas":[]}
"#;
let inferred_schema =
let (inferred_schema, _) =
infer_json_schema_from_seekable(Cursor::new(data), None).expect("infer");
let schema = Schema::new(vec![
Field::new("an", list_type_of(DataType::Null), true),
Expand Down
4 changes: 2 additions & 2 deletions arrow-json/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1206,7 +1206,7 @@ mod tests {
fn test_write_for_file(test_file: &str) {
let file = File::open(test_file).unwrap();
let mut reader = BufReader::new(file);
let schema = infer_json_schema(&mut reader, None).unwrap();
let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
reader.rewind().unwrap();

let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024);
Expand Down Expand Up @@ -1391,7 +1391,7 @@ mod tests {
let test_file = "test/data/basic.json";
let file = File::open(test_file).unwrap();
let mut reader = BufReader::new(file);
let schema = infer_json_schema(&mut reader, None).unwrap();
let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
reader.rewind().unwrap();

let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024);
Expand Down

0 comments on commit 03365ba

Please sign in to comment.