From d5e5d1e385ebec3dc8feb899a0dd76e07063cc57 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin
Date: Thu, 26 Oct 2023 08:33:48 -0500
Subject: [PATCH] Avoid splitting parquet files into row groups as aggressively

This makes it less likely that we'll split up parquet files. It does so
in two changes:

1. Change the default blocksize from 128 MiB to 256 MiB
2. Only trigger this check when the file is more than double the
   blocksize (512 MiB) (but then presumably we'll target 256 MiB)

Additionally, for follow-up work it would be excellent to make this
sensitive to the columns actually being read. Right now I think we
apply this check even if we read only a single column.

There's plenty more that can be done here, but hopefully this is easy
to agree on as a first step.
---
 dask/dataframe/io/parquet/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dask/dataframe/io/parquet/utils.py b/dask/dataframe/io/parquet/utils.py
index 1344902af03..5e714322dad 100644
--- a/dask/dataframe/io/parquet/utils.py
+++ b/dask/dataframe/io/parquet/utils.py
@@ -174,7 +174,7 @@ def read_metadata(
 
     @classmethod
     def default_blocksize(cls):
-        return "128 MiB"
+        return "256 MiB"
 
     @classmethod
     def read_partition(
@@ -907,7 +907,7 @@ def _infer_split_row_groups(row_group_sizes, blocksize, aggregate_files=False):
     # Use blocksize to choose an appropriate split_row_groups value
     if row_group_sizes:
         blocksize = parse_bytes(blocksize)
-        if aggregate_files or np.sum(row_group_sizes) > blocksize:
+        if aggregate_files or np.sum(row_group_sizes) > 2 * blocksize:
             # If we are aggregating files, or the file is larger
             # than `blocksize`, set split_row_groups to "adaptive"
             return "adaptive"
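
For anyone who wants to try the heuristic outside of dask, here is a minimal
standalone sketch of the decision this patch changes. It mirrors the second
hunk above rather than calling the private _infer_split_row_groups helper
directly; the function name and the example row-group sizes are illustrative
only.

    # Standalone sketch of the splitting heuristic after this patch
    # (illustrative; not the dask source itself).
    import numpy as np
    from dask.utils import parse_bytes

    def infer_split_row_groups_sketch(
        row_group_sizes, blocksize="256 MiB", aggregate_files=False
    ):
        # row_group_sizes: per-row-group byte sizes taken from parquet metadata
        if row_group_sizes:
            blocksize = parse_bytes(blocksize)
            # Only fall back to "adaptive" row-group splitting when the file
            # is more than twice the blocksize (512 MiB with the new default)
            if aggregate_files or np.sum(row_group_sizes) > 2 * blocksize:
                return "adaptive"
        return False

    # A ~400 MB file is no longer split; a ~600 MB file still is
    print(infer_split_row_groups_sketch([100_000_000] * 4))  # False
    print(infer_split_row_groups_sketch([100_000_000] * 6))  # "adaptive"

With the 256 MiB default, only files whose row groups together exceed roughly
512 MiB (or reads with aggregate_files=True) fall back to adaptive splitting.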