Commit
squash refs/convert/... branch history after each series of commits
severo committed Jan 22, 2025
1 parent 23efc54 commit 370b470
Showing 2 changed files with 46 additions and 8 deletions.
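Both files gain the same post-commit step: once a series of commits lands, the worker squashes the target branch's history with `HfApi.super_squash_history`, so a `refs/convert/...` branch keeps only its latest state. A minimal standalone sketch of that call (the repo id, token, and branch below are placeholders; `super_squash_history` is the actual `huggingface_hub` API used in the diff):

```python
# Minimal sketch of the squash step in isolation. super_squash_history
# collapses a branch's whole history into a single commit.
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError

api = HfApi(token="hf_xxx")  # a token with write access to the repo

try:
    api.super_squash_history(
        repo_id="user/dataset",
        repo_type="dataset",
        branch="refs/convert/parquet",
        commit_message="Squash the branch history",
    )
except HfHubHTTPError as err:
    # The worker wraps this call in a retry helper instead of failing fast.
    raise RuntimeError("could not squash the branch history") from err
```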
services/worker/src/worker/job_runners/config/parquet_and_info.py (32 changes: 24 additions & 8 deletions)
@@ -1182,9 +1182,9 @@ def commit_parquet_conversion(
     parquet_operations: list[CommitOperation],
     commit_message: str,
     target_revision: Optional[str],
-) -> list[CommitInfo]:
+) -> None:
     """
-    Creates one or several commits in the given dataset repo, deleting & uploading files as needed.
+    Creates one or several commits in the given dataset repo, deleting & uploading files as needed, and finally squashing the history to save space.
 
     Args:
         hf_api (`huggingface_hub.HfApi`):
@@ -1217,11 +1217,6 @@ def commit_parquet_conversion(
             but not authenticated or repo does not exist.
         [~`libcommon.exceptions.CreateCommitError`]:
            If one of the commits could not be created on the Hub.
-    Returns:
-        `list[huggingface_hub.CommitInfo]`:
-            List of [`CommitInfo`] containing information about the newly created commit (commit hash, commit
-            url, pr url, commit message,...).
-
     """
     target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False)
     all_repo_files: set[str] = {f.rfilename for f in target_dataset_info.siblings}
@@ -1230,14 +1225,35 @@ def commit_parquet_conversion(
     )
     operations = delete_operations + parquet_operations
     logging.info(f"{len(operations)} git operations to do for {dataset=} {config=}.")
-    return create_commits(
+    create_commits(
         committer_hf_api,
         repo_id=dataset,
         revision=target_revision,
         operations=operations,
         commit_message=commit_message,
         parent_commit=target_dataset_info.sha,
     )
+    # squash the history to save space
+    retry_super_squash_history = retry(on=[HfHubHTTPError], sleeps=HF_HUB_HTTP_ERROR_RETRY_SLEEPS)(
+        committer_hf_api.super_squash_history
+    )
+    try:
+        retry_super_squash_history(
+            repo_id=dataset,
+            repo_type=DATASET_TYPE,
+            commit_message=commit_message,
+            branch=target_revision,
+        )
+    except RuntimeError as e:
+        if e.__cause__ and isinstance(e.__cause__, HfHubHTTPError):
+            raise CreateCommitError(
+                message=(
+                    f"Could not squash the history of the commits (after {len(HF_HUB_HTTP_ERROR_RETRY_SLEEPS)}"
+                    f" attempts)."
+                ),
+                cause=e.__cause__,
+            ) from e.__cause__
+        raise e
 
 
 def compute_config_parquet_and_info_response(
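The squash call above is wrapped in the repo's `retry` helper, and the `except RuntimeError` clause then inspects `e.__cause__`; this suggests the helper re-raises the last Hub error as the cause of a `RuntimeError` once its sleep schedule is exhausted. A hypothetical sketch of such a helper (the real implementation lives in `libcommon` and may differ):

```python
# Hypothetical retry decorator matching the retry(on=..., sleeps=...) usage
# in the diff; NOT the repo's actual libcommon code.
import time
from functools import wraps
from typing import Any, Callable, Optional, TypeVar

T = TypeVar("T")


def retry(on: list[type[Exception]], sleeps: list[float]) -> Callable[[Callable[..., T]], Callable[..., T]]:
    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> T:
            last_err: Optional[Exception] = None
            for sleep in sleeps:
                try:
                    return func(*args, **kwargs)
                except tuple(on) as err:
                    last_err = err
                    time.sleep(sleep)  # back off before the next attempt
            # Exhausted all attempts: surface the last error as the cause,
            # which is why callers catch RuntimeError and check __cause__.
            raise RuntimeError(f"Gave up after {len(sleeps)} attempts") from last_err

        return wrapper

    return decorator
```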
services/worker/src/worker/job_runners/split/duckdb_index.py (22 changes: 22 additions & 0 deletions)
@@ -425,6 +425,28 @@ def compute_split_duckdb_index_response(
             ) from e.__cause__
         raise e
 
+    # squash the history to save space
+    retry_super_squash_history = retry(on=[HfHubHTTPError], sleeps=HF_HUB_HTTP_ERROR_RETRY_SLEEPS)(
+        committer_hf_api.super_squash_history
+    )
+    try:
+        retry_super_squash_history(
+            repo_id=dataset,
+            repo_type=DATASET_TYPE,
+            commit_message=commit_message,
+            branch=target_revision,
+        )
+    except RuntimeError as e:
+        if e.__cause__ and isinstance(e.__cause__, HfHubHTTPError):
+            raise CreateCommitError(
+                message=(
+                    f"Could not squash the history of the commits (after {len(HF_HUB_HTTP_ERROR_RETRY_SLEEPS)}"
+                    f" attempts)."
+                ),
+                cause=e.__cause__,
+            ) from e.__cause__
+        raise e
+
     logging.debug(f"create commit {commit_message} for {dataset=} {add_operations=}")
 
     # call the API again to get the index file
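The two additions are deliberately identical: each worker that pushes a series of commits to a `refs/convert/...` branch squashes it immediately afterwards, so the branch always resolves to a single commit and superseded parquet or index files stop consuming storage on the Hub. One way to check the effect (hypothetical repo id; `list_repo_commits` is the real `huggingface_hub` API):

```python
# After super_squash_history, the branch history should be a single commit.
from huggingface_hub import HfApi

api = HfApi()
commits = api.list_repo_commits(
    "user/dataset",
    repo_type="dataset",
    revision="refs/convert/parquet",
)
assert len(commits) == 1, f"expected one squashed commit, got {len(commits)}"
```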
