From d5e5d1e385ebec3dc8feb899a0dd76e07063cc57 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin
Date: Thu, 26 Oct 2023 08:33:48 -0500
Subject: [PATCH] Avoid splitting parquet files into row groups as aggressively

This makes it less likely that we'll split up parquet files. It does so
in two changes:

1. Change the default blocksize from 128 MiB to 256 MiB
2. Only trigger this check when the file is more than double the
   blocksize (512 MiB) (but then presumably we'll target 256 MiB)

Additionally, for follow-up work it would be excellent to make this
sensitive to the columns actually being read. Right now I think we
apply this check even if we read only a single column.

There's plenty more that can be done here, but hopefully this is easy
to agree on as a first step.
---
 dask/dataframe/io/parquet/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dask/dataframe/io/parquet/utils.py b/dask/dataframe/io/parquet/utils.py
index 1344902af03..5e714322dad 100644
--- a/dask/dataframe/io/parquet/utils.py
+++ b/dask/dataframe/io/parquet/utils.py
@@ -174,7 +174,7 @@ def read_metadata(
 
     @classmethod
     def default_blocksize(cls):
-        return "128 MiB"
+        return "256 MiB"
 
     @classmethod
     def read_partition(
@@ -907,7 +907,7 @@ def _infer_split_row_groups(row_group_sizes, blocksize, aggregate_files=False):
     # Use blocksize to choose an appropriate split_row_groups value
     if row_group_sizes:
         blocksize = parse_bytes(blocksize)
-        if aggregate_files or np.sum(row_group_sizes) > blocksize:
+        if aggregate_files or np.sum(row_group_sizes) > 2 * blocksize:
             # If we are aggregating files, or the file is larger
             # than `blocksize`, set split_row_groups to "adaptive"
             return "adaptive"
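
For anyone who wants to try the heuristic outside of dask, here is a minimal
standalone sketch of the decision this patch changes. It mirrors the second
hunk above rather than calling the private _infer_split_row_groups helper
directly; the function name and the example row-group sizes are illustrative
only.

    # Standalone sketch of the splitting heuristic after this patch
    # (illustrative; not the dask source itself).
    import numpy as np
    from dask.utils import parse_bytes

    def infer_split_row_groups_sketch(
        row_group_sizes, blocksize="256 MiB", aggregate_files=False
    ):
        # row_group_sizes: per-row-group byte sizes taken from parquet metadata
        if row_group_sizes:
            blocksize = parse_bytes(blocksize)
            # Only fall back to "adaptive" row-group splitting when the file
            # is more than twice the blocksize (512 MiB with the new default)
            if aggregate_files or np.sum(row_group_sizes) > 2 * blocksize:
                return "adaptive"
        return False

    # A ~400 MB file is no longer split; a ~600 MB file still is
    print(infer_split_row_groups_sketch([100_000_000] * 4))  # False
    print(infer_split_row_groups_sketch([100_000_000] * 6))  # "adaptive"

With the 256 MiB default, only files whose row groups together exceed roughly
512 MiB (or reads with aggregate_files=True) fall back to adaptive splitting.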