From eeb044ea4a5ded35561e3907b77cb88dd36e278f Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Mon, 14 Oct 2024 17:21:28 +0800 Subject: [PATCH] [SPARK-49949][PS] Avoid unnecessary analyze task in `attach_sequence_column` ### What changes were proposed in this pull request? Avoid unnecessary analyze task in `attach_sequence_column` ### Why are the changes needed? In Connect mode, if the input `sdf` hasn't cached its schema, `attach_sequence_column` will trigger an analyze task for it. However, in this case, the column names are not needed. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? existing tests ### Was this patch authored or co-authored using generative AI tooling? no Closes #48448 from zhengruifeng/attach_sequence_column. Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- python/pyspark/pandas/internal.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py index 4be345201ba65..6063641e22e3b 100644 --- a/python/pyspark/pandas/internal.py +++ b/python/pyspark/pandas/internal.py @@ -902,11 +902,10 @@ def attach_default_index( @staticmethod def attach_sequence_column(sdf: PySparkDataFrame, column_name: str) -> PySparkDataFrame: - scols = [scol_for(sdf, column) for column in sdf.columns] sequential_index = ( F.row_number().over(Window.orderBy(F.monotonically_increasing_id())).cast("long") - 1 ) - return sdf.select(sequential_index.alias(column_name), *scols) + return sdf.select(sequential_index.alias(column_name), "*") @staticmethod def attach_distributed_column(sdf: PySparkDataFrame, column_name: str) -> PySparkDataFrame: