From eeb044ea4a5ded35561e3907b77cb88dd36e278f Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Mon, 14 Oct 2024 17:21:28 +0800 Subject: [PATCH] [SPARK-49949][PS] Avoid unnecessary analyze task in `attach_sequence_column` ### What changes were proposed in this pull request? Avoid unnecessary analyze task in `attach_sequence_column` ### Why are the changes needed? In Connect mode, if the input `sdf` hasn't cached its schema, `attach_sequence_column` will trigger an analyze task for it. However, in this case, the column names are not needed. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? existing tests ### Was this patch authored or co-authored using generative AI tooling? no Closes #48448 from zhengruifeng/attach_sequence_column. Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- python/pyspark/pandas/internal.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py index 4be345201ba65..6063641e22e3b 100644 --- a/python/pyspark/pandas/internal.py +++ b/python/pyspark/pandas/internal.py @@ -902,11 +902,10 @@ def attach_default_index( @staticmethod def attach_sequence_column(sdf: PySparkDataFrame, column_name: str) -> PySparkDataFrame: - scols = [scol_for(sdf, column) for column in sdf.columns] sequential_index = ( F.row_number().over(Window.orderBy(F.monotonically_increasing_id())).cast("long") - 1 ) - return sdf.select(sequential_index.alias(column_name), *scols) + return sdf.select(sequential_index.alias(column_name), "*") @staticmethod def attach_distributed_column(sdf: PySparkDataFrame, column_name: str) -> PySparkDataFrame: